# File listing header: 55 lines, 1.7 KiB, Python
"""Report the sampling frequency of every feature column in a merged CSV.

The script reads a CSV that has a ``ds`` date column plus any number of
feature columns, measures the typical gap in days between consecutive
non-null observations of each feature, and writes a table that lists the
feature names grouped by frequency (daily, weekly, monthly, ...).
"""
from collections import Counter

import pandas as pd

# Raw strings keep the Windows backslashes literal; sequences such as "\c",
# "\h" and "\p" are invalid escapes in a normal string literal.
INPUT_PATH = r"D:\code\huarongqiming\碳酸锂合并数据.csv"
OUTPUT_PATH = r"D:\code\huarongqiming\pindu.csv"
DATE_COLUMN = 'ds'
SAMPLE_SIZE = 10

# Dominant gap in days (as a string) -> frequency label used in the report.
pindudict = {'1':'日度','7':'周度','30':'月度','90':'季度','180':'半年度','365':'年度'}


def load_data(path=INPUT_PATH, encoding='gbk'):
    """Read the CSV, parse the date column and sort rows by ascending date.

    Args:
        path: Location of the CSV file.
        encoding: Text encoding of the CSV file.

    Returns:
        A DataFrame sorted by the date column with a fresh 0..n-1 index.
    """
    df = pd.read_csv(path, encoding=encoding)
    df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN])
    return df.sort_values(by=DATE_COLUMN, ascending=True).reset_index(drop=True)


def dominant_interval(dates, sample_size=SAMPLE_SIZE, random_state=None):
    """Return the most common gap, in whole days, between consecutive dates.

    Args:
        dates: Series of datetimes in ascending order, without missing values.
        sample_size: Number of gaps to draw at random. ``None`` (or a value
            not smaller than the number of gaps) uses every gap.
        random_state: Seed forwarded to ``Series.sample`` for repeatable runs.

    Returns:
        The most frequent gap as an ``int`` (31 is counted as 30), or ``None``
        when there are fewer than two dates and therefore no gap to measure.
    """
    if len(dates) < 2:
        return None

    # Gaps are taken only between rows that really have a successor, so no
    # out-of-range lookup (and no blanket ``except``) is needed.
    gaps = dates.diff().dropna().dt.days
    if sample_size is not None and sample_size < len(gaps):
        gaps = gaps.sample(sample_size, random_state=random_state)

    # Treat 31-day months as 30 so that monthly series agree on a single value.
    normalised = [30 if gap == 31 else int(gap) for gap in gaps]
    return Counter(normalised).most_common(1)[0][0]


def estimate_frequencies(df, date_column=DATE_COLUMN, sample_size=SAMPLE_SIZE,
                         random_state=None):
    """Compute the dominant observation gap for every feature column.

    Args:
        df: DataFrame sorted by ``date_column`` in ascending order.
        date_column: Name of the datetime column.
        sample_size: Forwarded to :func:`dominant_interval`.
        random_state: Forwarded to :func:`dominant_interval`.

    Returns:
        A dict mapping column name -> dominant gap in days. Columns with fewer
        than two dated observations are left out because no gap exists.
    """
    count_dict = {}
    for column in df.columns:
        if column == date_column:
            continue
        # dropna() returns a new frame, so the source DataFrame is never
        # modified through a slice.
        observed = df[[column, date_column]].dropna(axis=0)
        dates = observed[date_column].reset_index(drop=True)
        interval = dominant_interval(dates, sample_size, random_state)
        if interval is not None:
            count_dict[column] = interval
    return count_dict


def build_frequency_table(count_dict):
    """Arrange feature names into one output column per frequency.

    Args:
        count_dict: Mapping of feature name -> dominant gap in days.

    Returns:
        A DataFrame whose columns are named ``<label>(<number of features>)``,
        ordered by ascending gap, each holding the feature names of that group
        padded with empty strings. A gap without an entry in ``pindudict`` is
        labelled with the gap itself instead of raising ``KeyError``.
    """
    grouped = {}
    for column, interval in count_dict.items():
        grouped.setdefault(interval, []).append(column)

    series = []
    for interval, names in sorted(grouped.items()):
        label = pindudict.get(str(interval), str(interval))
        series.append(pd.Series(names, name=label + f'({len(names)})'))

    if not series:
        return pd.DataFrame()
    # Groups differ in length; the NaN padding is replaced by empty strings.
    return pd.concat(series, axis=1).fillna('')


def main():
    """Run the whole pipeline: load, analyse, write the report and print it."""
    df = load_data()
    count_dict = estimate_frequencies(df)
    pindu_dfs = build_frequency_table(count_dict)
    pindu_dfs.to_csv(OUTPUT_PATH, index=False)
    print(pindu_dfs)
    print('*'*200)


if __name__ == "__main__":
    main()