55 lines
1.7 KiB
Python
55 lines
1.7 KiB
Python
|
# Feature frequency statistics.

# Load the merged dataset.
import pandas as pd

# Raw string literal: the Windows path contains backslash sequences such as
# "\c" and "\h" that are not valid escapes in a normal string literal and
# trigger a SyntaxWarning on recent Python versions.  The resulting path is
# character-for-character the same as before.
df = pd.read_csv(r"D:\code\huarongqiming\碳酸锂合并数据.csv", encoding='gbk')

# Parse the `ds` column into datetimes so date arithmetic works on it later.
df['ds'] = pd.to_datetime(df['ds'])

# Sort by `ds` in ascending order and rebuild a clean 0..n-1 index.
df = df.sort_values(by='ds', ascending=True).reset_index(drop=True)
# Estimate the update interval (in days) of every feature column.
#
# For each column, up to SAMPLE_SIZE non-null rows are drawn at random.  Each
# sampled row is paired with the next non-null row of the same column, and the
# gap between their `ds` dates is measured in whole days.  The most frequent
# gap is stored in `count_dict` under the column name.
import warnings

# Number of rows sampled per column (each sampled row yields at most one gap).
SAMPLE_SIZE = 10

columns = df.columns.to_list()
columns.remove('ds')
count_dict = {}
for column in columns:
    # Keep only rows where both the feature and its date are present.
    # Chaining dropna() builds a new frame instead of mutating a slice of
    # `df` in place, which avoids pandas' SettingWithCopyWarning.
    values = df[[column, 'ds']].dropna(axis=0).reset_index(drop=True)

    intervals = []
    if len(values) >= 2:
        # Never ask for more rows than exist: sample() raises ValueError when
        # the requested size exceeds the population.
        sampled_index = values.sample(min(SAMPLE_SIZE, len(values))).index

        # The last row has no successor, so it cannot contribute an interval.
        last_position = len(values) - 1
        intervals = [
            (values.loc[position + 1, 'ds'] - values.loc[position, 'ds']).days
            for position in sampled_index
            if position < last_position
        ]

    # Count 31-day gaps as 30-day gaps.
    intervals = [30 if days == 31 else days for days in intervals]

    if not intervals:
        # Too few non-null rows to measure any gap; skip the column rather
        # than crash on max() of an empty sequence.
        warnings.warn(f"column {column!r} skipped: no interval could be measured")
        continue

    # Keep the interval that occurs most often among the sampled gaps.
    count_dict[column] = max(set(intervals), key=intervals.count)
# Group the features by their estimated interval and export the summary.

# One row per feature, with its interval (in days) in the `count` column.
df = pd.DataFrame(count_dict, index=['count']).T

# Interval in days (as a string) -> frequency label used in the output header.
pindudict = {'1':'日度','7':'周度','30':'月度','90':'季度','180':'半年度','365':'年度'}

# Build one output column per interval group.  The header is
# "<label>(<number of features>)" and the cells are the feature names.
pindu_columns = {}
for days, group in df.groupby('count'):
    day_key = str(int(days))
    # Intervals missing from the table (for example 3 or 28 days) used to
    # raise KeyError and abort the export; label them by their day count.
    label = pindudict.get(day_key, f'{day_key}天')
    pindu_columns[f'{label}({len(group)})'] = pd.Series(list(group.index))

# Assemble the frame once; columns of different length are aligned on their
# positional index and padded with NaN.
pindu_dfs = pd.DataFrame(pindu_columns)

# Replace the NaN padding with empty strings.
pindu_dfs = pindu_dfs.fillna('')

# Raw string literal: keeps the backslashes of the Windows path from being
# read as (invalid) escape sequences; the path itself is unchanged.
pindu_dfs.to_csv(r'D:\code\huarongqiming\pindu.csv', index=False)
print(pindu_dfs)
print('*'*200)