石油焦铝用月度数据处理
This commit is contained in:
parent
7788cfda6f
commit
d17f7eba4a
3
.gitignore
vendored
3
.gitignore
vendored
@ -141,4 +141,5 @@ dmypy.json
|
|||||||
# Cython debug symbols
|
# Cython debug symbols
|
||||||
cython_debug/
|
cython_debug/
|
||||||
|
|
||||||
这是同步的git项目.txt
|
这是同步的git项目.txt
|
||||||
|
.vscode/launch.json
|
||||||
|
@ -409,7 +409,7 @@ login_data = {
|
|||||||
upload_data = {
|
upload_data = {
|
||||||
"groupNo": '', # 用户组id
|
"groupNo": '', # 用户组id
|
||||||
"funcModule": '研究报告信息',
|
"funcModule": '研究报告信息',
|
||||||
"funcOperation": '上传原油价格预测报告',
|
"funcOperation": '上传石油焦铝用价格预测报告',
|
||||||
"data": {
|
"data": {
|
||||||
"ownerAccount": 'arui', # 报告所属用户账号
|
"ownerAccount": 'arui', # 报告所属用户账号
|
||||||
"reportType": 'OIL_PRICE_FORECAST', # 报告类型,固定为OIL_PRICE_FORECAST
|
"reportType": 'OIL_PRICE_FORECAST', # 报告类型,固定为OIL_PRICE_FORECAST
|
||||||
@ -426,8 +426,8 @@ upload_data = {
|
|||||||
|
|
||||||
warning_data = {
|
warning_data = {
|
||||||
"groupNo": '', # 用户组id
|
"groupNo": '', # 用户组id
|
||||||
"funcModule": '原油特征停更预警',
|
"funcModule": '石油焦铝用特征停更预警',
|
||||||
"funcOperation": '原油特征停更预警',
|
"funcOperation": '石油焦铝用特征停更预警',
|
||||||
"data": {
|
"data": {
|
||||||
'WARNING_TYPE_NAME': '特征数据停更预警',
|
'WARNING_TYPE_NAME': '特征数据停更预警',
|
||||||
'WARNING_CONTENT': '',
|
'WARNING_CONTENT': '',
|
||||||
|
102
lib/dataread.py
102
lib/dataread.py
@ -941,9 +941,9 @@ def zhoududatachuli(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_time
|
|||||||
# 删除预测列空值的行
|
# 删除预测列空值的行
|
||||||
''' 工作日缺失,如果删除,会影响预测结果,导致统计准确率出错 '''
|
''' 工作日缺失,如果删除,会影响预测结果,导致统计准确率出错 '''
|
||||||
# df = df.dropna(subset=['y'])
|
# df = df.dropna(subset=['y'])
|
||||||
config.logger.info(f'删除预测列为空值的行后数据量:{df.shape}')
|
# config.logger.info(f'删除预测列为空值的行后数据量:{df.shape}')
|
||||||
df = df.dropna(axis=1, how='all')
|
# df = df.dropna(axis=1, how='all')
|
||||||
config.logger.info(f'删除全为空值的列后数据量:{df.shape}')
|
# config.logger.info(f'删除全为空值的列后数据量:{df.shape}')
|
||||||
df.to_csv(os.path.join(dataset, '未填充的特征数据.csv'), index=False)
|
df.to_csv(os.path.join(dataset, '未填充的特征数据.csv'), index=False)
|
||||||
# 去掉指标列表中的columns_to_drop的行
|
# 去掉指标列表中的columns_to_drop的行
|
||||||
df_zhibiaoliebiao = df_zhibiaoliebiao[df_zhibiaoliebiao['指标名称'].isin(
|
df_zhibiaoliebiao = df_zhibiaoliebiao[df_zhibiaoliebiao['指标名称'].isin(
|
||||||
@ -972,6 +972,7 @@ def zhoududatachuli(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_time
|
|||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def datachuli_juxiting(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_time='', y='y', dataset='dataset', delweekenday=False, add_kdj=False, is_timefurture=False):
|
def datachuli_juxiting(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_time='', y='y', dataset='dataset', delweekenday=False, add_kdj=False, is_timefurture=False):
|
||||||
'''
|
'''
|
||||||
聚烯烃特征数据处理函数,
|
聚烯烃特征数据处理函数,
|
||||||
@ -1071,6 +1072,101 @@ def datachuli_juxiting(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_t
|
|||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def yuedudatachuli_shiyoujiaolvyong(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_time='', y='y', dataset='dataset', delweekenday=False, add_kdj=False, is_timefurture=False):
    '''
    Monthly feature-data processing for aluminium-grade petroleum coke.

    Takes two DataFrames (indicator data and indicator list) and returns a
    single DataFrame containing ``ds``, ``y`` and the indicator columns.

    Args:
        df_zhibiaoshuju: indicator data; must contain the ``datecol`` and
            ``y`` columns.
        df_zhibiaoliebiao: indicator list; must contain a '指标名称' column.
        datecol: name of the date column, renamed to ``ds``.
        end_time: inclusive upper bound for ``ds``; defaults to today
            ('%Y-%m-%d') when empty.
        y: name of the target column, renamed to ``y``.
        dataset: output directory for the intermediate CSV files.
        delweekenday: when True, drop rows whose ``ds`` falls on a weekend.
        add_kdj: when True, pass the frame through ``calculate_kdj``.
        is_timefurture: when True, pass the frame through
            ``addtimecharacteristics``.

    Returns:
        The processed DataFrame.

    Side effects:
        Writes '未填充的特征数据.csv' and '特征处理后的指标名称及分类.csv' into
        ``dataset`` and calls ``featurePindu`` / ``featureAnalysis``.
    '''
    df = df_zhibiaoshuju.copy()

    if end_time == '':
        end_time = datetime.datetime.now().strftime('%Y-%m-%d')

    # Normalise the date / target column names.
    df = df.rename(columns={datecol: 'ds'})
    df = df.rename(columns={y: 'y'})

    # Parse dates BEFORE sorting so that string-typed dates are ordered
    # chronologically rather than lexically.
    df['ds'] = pd.to_datetime(df['ds'])
    df = df.sort_values(by='ds')

    # Keep only rows from config.start_year up to end_time (inclusive).
    df = df[df['ds'].dt.year >= config.start_year]
    df = df[df['ds'] <= end_time]

    # NOTE: the stale-feature warning / upload step is intentionally not
    # performed in this function.

    # Drop stale features. Per the original comment, check_column flags
    # columns whose last data point is older than the cutoff or whose recent
    # values are constant -- TODO confirm against check_column.
    # The flag is named "two month" but the cutoff is 180 days.
    if config.is_del_tow_month:
        stale_cutoff = datetime.datetime.now() - timedelta(days=180)
        config.logger.info(f'删除两月不更新特征前数据量:{df.shape}')
        stale_columns = [
            col for col in df.columns if check_column(df, col, stale_cutoff)
        ]
        df = df.drop(columns=stale_columns)
        config.logger.info(f'删除两月不更新特征后数据量:{df.shape}')

    # Resample by week or month according to the configured frequency; any
    # other frequency leaves the frame untouched.
    if config.freq in ('W', 'M'):
        df = df.resample(config.freq, on='ds').mean().reset_index()

    # Rows with an empty target are deliberately NOT dropped: removing them
    # would distort the forecast and the accuracy statistics.
    config.logger.info(f'删除预测列为空值的行后数据量:{df.shape}')
    df = df.dropna(axis=1, how='all')
    config.logger.info(f'删除全为空值的列后数据量:{df.shape}')
    df.to_csv(os.path.join(dataset, '未填充的特征数据.csv'), index=False)

    # Keep only the indicator-list rows whose indicator survived as a column.
    surviving_columns = df.columns.tolist()
    df_zhibiaoliebiao = df_zhibiaoliebiao[
        df_zhibiaoliebiao['指标名称'].isin(surviving_columns)]
    df_zhibiaoliebiao.to_csv(
        os.path.join(dataset, '特征处理后的指标名称及分类.csv'), index=False)

    # Data-frequency analysis.
    featurePindu(dataset=dataset)

    # Pad the frame with month-end rows from Jan 1 of config.start_year up to
    # the day before the earliest existing date.
    start_date = datetime.datetime(config.start_year, 1, 1)
    earliest = df['ds'].min()
    if earliest > start_date:
        padding_dates = pd.date_range(
            start=start_date,
            end=earliest - pd.DateOffset(days=1),
            freq='M',  # month end
        )
        padding_rows = pd.DataFrame({'ds': padding_dates})
        df = pd.concat([padding_rows, df]).sort_values(
            'ds').reset_index(drop=True)

    # Forward-fill, then backward-fill the remaining leading gaps.
    df = df.ffill()
    df = df.bfill()

    # Optionally drop Saturday / Sunday rows.
    if delweekenday:
        df = df[df['ds'].dt.weekday < 5]

    # Optional KDJ indicator.
    if add_kdj:
        df = calculate_kdj(df)

    # Optional derived time features.
    if is_timefurture:
        df = addtimecharacteristics(df=df, dataset=dataset)

    # Feature analysis.
    featureAnalysis(df, dataset=dataset, y=y)
    return df
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def getdata(filename, datecol='date', y='y', dataset='', add_kdj=False, is_timefurture=False, end_time=''):
|
def getdata(filename, datecol='date', y='y', dataset='', add_kdj=False, is_timefurture=False, end_time=''):
|
||||||
config.logger.info('getdata接收:'+filename+' '+datecol+' '+end_time)
|
config.logger.info('getdata接收:'+filename+' '+datecol+' '+end_time)
|
||||||
# 判断后缀名 csv或excel
|
# 判断后缀名 csv或excel
|
||||||
|
@ -254,7 +254,7 @@ def predict_main():
|
|||||||
df_zhibiaoliebiao.to_excel(file, sheet_name='指标列表', index=False)
|
df_zhibiaoliebiao.to_excel(file, sheet_name='指标列表', index=False)
|
||||||
|
|
||||||
# 数据处理
|
# 数据处理
|
||||||
df = zhoududatachuli(df_zhibiaoshuju, df_zhibiaoliebiao, y=global_config['y'], dataset=dataset, add_kdj=add_kdj, is_timefurture=is_timefurture,
|
df = yuedudatachuli_shiyoujiaolvyong(df_zhibiaoshuju, df_zhibiaoliebiao, y=global_config['y'], dataset=dataset, add_kdj=add_kdj, is_timefurture=is_timefurture,
|
||||||
end_time=end_time)
|
end_time=end_time)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user