原油特征处理,删除手工数据

This commit is contained in:
workpc 2024-11-15 14:47:57 +08:00
parent 757bc5317a
commit 061d38a621
9 changed files with 34835 additions and 23939 deletions

Binary file not shown.

Binary file not shown.

View File

@ -20,8 +20,8 @@ plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
from datetime import timedelta from datetime import timedelta
# from config_jingbo import * from config_jingbo import *
from config_juxiting import * # from config_juxiting import *
from sklearn import metrics from sklearn import metrics
from reportlab.pdfbase import pdfmetrics # 注册字体 from reportlab.pdfbase import pdfmetrics # 注册字体
from reportlab.pdfbase.ttfonts import TTFont # 字体类 from reportlab.pdfbase.ttfonts import TTFont # 字体类
@ -473,9 +473,8 @@ def datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y'
values = values + [column, last_update_time,0] values = values + [column, last_update_time,0]
# 计算特征数据值的时间差 # 计算特征数据值的时间差
try: try:
time_diff = (df1[column].dropna().index.to_series().diff().mode()[0]).total_seconds() / 3600 / 24
# 计算预警日期 # 计算预警日期
time_diff = (df1[column].dropna().index.to_series().diff().mode()[0]).total_seconds() / 3600 / 24
from datetime import timedelta from datetime import timedelta
early_warning_date = datetime.datetime.strptime(last_update_time, '%Y-%m-%d') + timedelta(days=time_diff)*2 early_warning_date = datetime.datetime.strptime(last_update_time, '%Y-%m-%d') + timedelta(days=time_diff)*2
early_warning_date = early_warning_date.strftime('%Y-%m-%d') early_warning_date = early_warning_date.strftime('%Y-%m-%d')
@ -492,9 +491,9 @@ def datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y'
logger.info(f'删除两月不更新特征前数据量:{df.shape}') logger.info(f'删除两月不更新特征前数据量:{df.shape}')
# 去掉近最后数据对应的日期在月以前的列删除近2月的数据是常数的列 # 去掉近最后数据对应的日期在月以前的列删除近2月的数据是常数的列
current_date = datetime.datetime.now() current_date = datetime.datetime.now()
two_months_ago = current_date - timedelta(days=40) two_months_ago = current_date - timedelta(days=180)
def check_column(col_name): def check_column(col_name):
''' '''
@ -507,8 +506,11 @@ def datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y'
df_check_column = df[['ds',col_name]] df_check_column = df[['ds',col_name]]
df_check_column = df_check_column.dropna() df_check_column = df_check_column.dropna()
if len(df_check_column) == 0: if len(df_check_column) == 0:
print(f'空值列:{col_name}')
return True return True
if df_check_column[(df_check_column['ds']>= two_months_ago)].groupby(col_name).ngroups < 2: # 判断是不是常数列
if df_check_column[(df_check_column['ds']>= two_months_ago)].groupby(col_name).ngroups < 2:
print(f'180没有更新{col_name}')
return True return True
corresponding_date = df_check_column.iloc[-1]['ds'] corresponding_date = df_check_column.iloc[-1]['ds']
return corresponding_date < two_months_ago return corresponding_date < two_months_ago
@ -1181,7 +1183,7 @@ class EtaReader():
''' '''
# 构建新的DataFrame df df1 # 构建新的DataFrame df df1
df = pd.DataFrame(columns=['指标分类', '指标名称', '指标id', '频度']) df = pd.DataFrame(columns=['指标分类', '指标名称', '指标id', '频度','指标来源','来源id'])
df1 = pd.DataFrame(columns=['DataTime']) df1 = pd.DataFrame(columns=['DataTime'])
@ -1225,16 +1227,27 @@ class EtaReader():
EdbCode = i.get('EdbCode') EdbCode = i.get('EdbCode')
EdbName = i.get('EdbName') # 指标名称要保存到df2的指标名称列,df的指标名称列 EdbName = i.get('EdbName') # 指标名称要保存到df2的指标名称列,df的指标名称列
Frequency = i.get('Frequency') # 频度要保存到df的频度列 Frequency = i.get('Frequency') # 频度要保存到df的频度列
SourceName = i.get('SourceName') # 来源名称要保存到df的指标来源列
Source = i.get('Source') # 来源ID要保存到df的来源id列
# 频度不是 日 或者 周的 跳过 # 频度不是 日 或者 周的 跳过
if Frequency not in ['日度','周度','','']: if Frequency not in ['日度','周度','','']:
continue continue
# 只保留手工数据中,名称带有 海运出口 海运进口
if Source == 9 and not ('海运出口' in EdbName or '海运进口' in EdbName):
continue
# 不要wind数据
if Source == 2:
continue
# 判断名称是否需要保存 # 判断名称是否需要保存
isSave = self.filter_yuanyou_data(ClassifyName,EdbName) isSave = self.filter_yuanyou_data(ClassifyName,EdbName)
if isSave: if isSave:
# 保存到df # 保存到df
# 保存频度 指标名称 分类 指标id 到 df # 保存频度 指标名称 分类 指标id 到 df
df2 = pd.DataFrame({'指标分类': ClassifyName, '指标名称': EdbName, '指标id': EdbCode, '频度': Frequency},index=[0]) df2 = pd.DataFrame({'指标分类': ClassifyName, '指标名称': EdbName, '指标id': EdbCode, '频度': Frequency,'指标来源':SourceName,'来源id':Source},index=[0])
# df = pd.merge(df, df2, how='outer') # df = pd.merge(df, df2, how='outer')
df = pd.concat([df, df2]) df = pd.concat([df, df2])
@ -1255,7 +1268,7 @@ class EtaReader():
itemname = item itemname = item
df1 = self.edbcodegetdata(df1,item,itemname) df1 = self.edbcodegetdata(df1,item,itemname)
df = pd.concat([df, pd.DataFrame({'指标分类': '其他', '指标名称': itemname, '指标id': item, '频度': '其他'},index=[0])]) df = pd.concat([df, pd.DataFrame({'指标分类': '其他', '指标名称': itemname, '指标id': item, '频度': '其他','指标来源':'其他','来源id':'其他'},index=[0])])
# 按时间排序 # 按时间排序
df1.sort_values('DataTime',inplace=True,ascending=False) df1.sort_values('DataTime',inplace=True,ascending=False)

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1920,7 +1920,7 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu
### 添加标题 ### 添加标题
content.append(Graphs.draw_title(f'{y}{time}预测报告')) content.append(Graphs.draw_title(f'{y}{time}预测报告'))
### 预测结果 ### 预测结果
content.append(Graphs.draw_little_title('一、预测结果:')) content.append(Graphs.draw_little_title('一、预测结果:'))
# 添加图片 # 添加图片