PriceForecast/aa copy.py
2024-11-01 16:38:21 +08:00

55 lines
1.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# 统计特征频度
# 读取文件
import pandas as pd
df = pd.read_csv("D:\code\huarongqiming\碳酸锂合并数据.csv",encoding='gbk')
df['ds'] = pd.to_datetime(df['ds'])
# 按ds正序排序重置索引
df = df.sort_values(by='ds', ascending=True).reset_index(drop=True)
# 统计特征频度
# 每列随机抽取6个值计算出5个时间间隔统计每个时间间隔的频度
columns = df.columns.to_list()
columns.remove('ds')
count_dict = {}
for column in columns:
# 获取每列时间间隔
values = df[[column,'ds']]
values.dropna(inplace=True,axis=0)
values=values.reset_index(drop=True)
# 抽取10个值
value = values.sample(10)
index = value.index
next_index = index + 1
count = []
for i,j in zip(index, next_index):
#通过索引计算日期差
try:
count.append((values.loc[j,'ds'] - values.loc[i,'ds']).days)
except:
pass
# 把31 换成 30
count = [30 if i == 31 else i for i in count]
# 保留count中出现次数最多的数
count = max(set(count), key=count.count)
# 存储到字典中
count_dict[column] = count
df = pd.DataFrame(count_dict,index=['count']).T
pindu_dfs = pd.DataFrame()
# 根据count分组
# 输出特征频度统计
pindudict = {'1':'日度','7':'周度','30':'月度','90':'季度','180':'半年度','365':'年度'}
for i in df.groupby('count'):
# 获取 i[1] 的索引值
index = i[1].index
pindu_df = pd.DataFrame()
pindu_df[pindudict[str(i[0])]+f'({len(i[1])})'] = index
# 合并到pindu_dfs
pindu_dfs = pd.concat([pindu_dfs,pindu_df],axis=1)
# nan替换为 ' '
pindu_dfs = pindu_dfs.fillna('')
pindu_dfs.to_csv('D:\code\huarongqiming\pindu.csv',index=False)
print(pindu_dfs)
print('*'*200)