PriceForecast/aa copy.py

55 lines
1.7 KiB
Python
Raw Normal View History

2024-11-01 16:38:21 +08:00
# 统计特征频度
# 读取文件
import pandas as pd
# Input/output locations. Raw strings keep the Windows backslashes literal
# instead of relying on unrecognised escape sequences such as "\c" or "\p".
INPUT_CSV = r"D:\code\huarongqiming\碳酸锂合并数据.csv"
OUTPUT_CSV = r"D:\code\huarongqiming\pindu.csv"

# Typical gap in days (as a string) -> frequency label used in the output header.
FREQUENCY_LABELS = {
    '1': '日度',
    '7': '周度',
    '30': '月度',
    '90': '季度',
    '180': '半年度',
    '365': '年度',
}


def estimate_frequency(df, column, sample_size=10, random_state=None):
    """Estimate the typical number of days between observations of a column.

    Rows where either ``column`` or ``ds`` is missing are dropped, the
    remaining dates are sorted, and the day gaps between consecutive dates are
    computed. Up to ``sample_size`` of those gaps are drawn at random and the
    most frequent one is returned. A gap of 31 days is counted as 30 so that
    long and short months fall into the same bucket.

    Args:
        df: DataFrame holding a datetime column ``ds`` and the feature column.
        column: Name of the feature column to inspect.
        sample_size: Maximum number of gaps to sample.
        random_state: Optional seed forwarded to ``Series.sample`` for
            reproducible results.

    Returns:
        The most common sampled gap in days (the smallest one on ties), or
        ``None`` when the column has fewer than two dated observations.
    """
    dates = df[[column, 'ds']].dropna()['ds'].sort_values().reset_index(drop=True)
    if len(dates) < 2:
        # No consecutive pair exists, so no gap can be measured.
        return None

    # diff() leaves a NaT in the first slot; drop it before taking .days so
    # the result stays an integer Series.
    gaps = dates.diff().iloc[1:].dt.days

    # Never request more samples than there are gaps.
    sampled = gaps.sample(n=min(sample_size, len(gaps)), random_state=random_state)

    normalized = [30 if gap == 31 else gap for gap in sampled.tolist()]
    # Sorting the candidates makes tie-breaking deterministic (smallest wins).
    return max(sorted(set(normalized)), key=normalized.count)


def group_columns_by_frequency(freq_by_column):
    """Build a table listing feature names grouped by their frequency.

    Args:
        freq_by_column: Mapping of feature name -> gap in days.

    Returns:
        A DataFrame with one column per distinct gap (ascending), headed
        ``"<label>(<number of features>)"``. Shorter columns are padded with
        empty strings. Gaps without an entry in ``FREQUENCY_LABELS`` are
        labelled ``"<gap>天"`` instead of raising ``KeyError``.
    """
    grouped = {}
    for column, gap in freq_by_column.items():
        grouped.setdefault(gap, []).append(column)

    table = {}
    for gap in sorted(grouped):
        names = grouped[gap]
        label = FREQUENCY_LABELS.get(str(gap), f'{gap}天')
        # Wrapping in a Series lets columns of different lengths align on
        # a shared 0..n-1 index.
        table[f'{label}({len(names)})'] = pd.Series(names, dtype=object)

    return pd.DataFrame(table).fillna('')


def main():
    """Read the merged dataset, estimate each feature's frequency, save and print it."""
    df = pd.read_csv(INPUT_CSV, encoding='gbk')
    df['ds'] = pd.to_datetime(df['ds'])

    freq_by_column = {}
    for column in df.columns:
        if column == 'ds':
            continue
        gap = estimate_frequency(df, column)
        if gap is None:
            # Report instead of crashing or silently ignoring the column.
            print(f'skipped {column}: fewer than 2 dated observations')
            continue
        freq_by_column[column] = gap

    pindu_dfs = group_columns_by_frequency(freq_by_column)
    pindu_dfs.to_csv(OUTPUT_CSV, index=False)
    print(pindu_dfs)
    print('*' * 200)


if __name__ == "__main__":
    main()