# Count feature frequency
# Read the file
import pandas as pd

df = pd.read_csv(r"D:\code\huarongqiming\碳酸锂合并数据.csv", encoding='gbk')
df['ds'] = pd.to_datetime(df['ds'])
# Sort by ds in ascending order and reset the index
df = df.sort_values(by='ds', ascending=True).reset_index(drop=True)

# Count feature frequency:
# for each column, randomly sample 10 rows, compute the day gap between each sampled
# row and the next observation, and keep the most common gap as the column's frequency.
columns = df.columns.to_list()
columns.remove('ds')
count_dict = {}
for column in columns:
    # Keep only the rows where this column has a value
    values = df[[column, 'ds']].dropna(axis=0).reset_index(drop=True)
    # Sample 10 rows (or fewer if the column is short)
    value = values.sample(min(10, len(values)))
    index = value.index
    next_index = index + 1
    count = []
    for i, j in zip(index, next_index):
        # Day difference between a sampled row and the following row
        try:
            count.append((values.loc[j, 'ds'] - values.loc[i, 'ds']).days)
        except KeyError:
            # The sampled row was the last one, so there is no next row
            pass
    if not count:
        # Not enough observations to infer a gap for this column
        continue
    # Treat 31-day gaps as 30-day (monthly) gaps
    count = [30 if c == 31 else c for c in count]
    # Keep the most frequent gap
    count = max(set(count), key=count.count)
    # Store it in the dictionary
    count_dict[column] = count

count_df = pd.DataFrame(count_dict, index=['count']).T
pindu_dfs = pd.DataFrame()
# Group by count and report each feature's frequency
# Labels: 日度=daily, 周度=weekly, 月度=monthly, 季度=quarterly, 半年度=semi-annual, 年度=annual
pindudict = {'1': '日度', '7': '周度', '30': '月度', '90': '季度', '180': '半年度', '365': '年度'}
for freq, group in count_df.groupby('count'):
    # Column names that share this frequency
    index = group.index
    pindu_df = pd.DataFrame()
    # Fall back to the raw day count if the gap is not a known frequency
    label = pindudict.get(str(freq), f'{freq}天')
    pindu_df[label + f'({len(group)})'] = index
    # Merge into pindu_dfs
    pindu_dfs = pd.concat([pindu_dfs, pindu_df], axis=1)
# Replace NaN with ''
pindu_dfs = pindu_dfs.fillna('')
pindu_dfs.to_csv(r'D:\code\huarongqiming\pindu.csv', index=False)
print(pindu_dfs)
print('*' * 200)
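
# --- Hedged cross-check sketch (an addition, not part of the original pipeline) ---
# Sampling 10 rows can give a noisy gap estimate for sparse or irregular columns.
# The sketch below infers the modal day gap from *all* consecutive non-null
# observations instead; `df` and the column names follow the script above, while
# the helper name `infer_gap_days` is hypothetical.
def infer_gap_days(frame, column):
    """Return the most common day gap between consecutive non-null values of `column`."""
    ds = frame.loc[frame[column].notna(), 'ds'].sort_values()
    gaps = ds.diff().dt.days.dropna()
    if gaps.empty:
        raise ValueError(f"column {column!r} has fewer than two observations")
    # Snap 31-day gaps to 30, mirroring the 31 -> 30 replacement above
    gaps = gaps.replace(31, 30)
    return int(gaps.mode().iloc[0])

# Example (hypothetical): compare against the sampled estimate for the first feature
# print(infer_gap_days(df, columns[0]), count_dict[columns[0]])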