From d17f7eba4af46dfe83022f949258181359ec6fd4 Mon Sep 17 00:00:00 2001 From: jingboyitiji Date: Wed, 28 May 2025 10:20:48 +0800 Subject: [PATCH] =?UTF-8?q?=E7=9F=B3=E6=B2=B9=E7=84=A6=E9=93=9D=E7=94=A8?= =?UTF-8?q?=E6=9C=88=E5=BA=A6=E6=95=B0=E6=8D=AE=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 +- config_shiyoujiao_lvyong_yuedu.py | 6 +- lib/dataread.py | 102 +++++++++++++++++++++++++++++- main_shiyoujiao_lvyong_yuedu.py | 2 +- 4 files changed, 105 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 698e63d..0453d8d 100644 --- a/.gitignore +++ b/.gitignore @@ -141,4 +141,5 @@ dmypy.json # Cython debug symbols cython_debug/ -这是同步的git项目.txt \ No newline at end of file +这是同步的git项目.txt +.vscode/launch.json diff --git a/config_shiyoujiao_lvyong_yuedu.py b/config_shiyoujiao_lvyong_yuedu.py index 5570610..40361cd 100644 --- a/config_shiyoujiao_lvyong_yuedu.py +++ b/config_shiyoujiao_lvyong_yuedu.py @@ -409,7 +409,7 @@ login_data = { upload_data = { "groupNo": '', # 用户组id "funcModule": '研究报告信息', - "funcOperation": '上传原油价格预测报告', + "funcOperation": '上传石油焦铝用价格预测报告', "data": { "ownerAccount": 'arui', # 报告所属用户账号 "reportType": 'OIL_PRICE_FORECAST', # 报告类型,固定为OIL_PRICE_FORECAST @@ -426,8 +426,8 @@ upload_data = { warning_data = { "groupNo": '', # 用户组id - "funcModule": '原油特征停更预警', - "funcOperation": '原油特征停更预警', + "funcModule": '石油焦铝用特征停更预警', + "funcOperation": '石油焦铝用特征停更预警', "data": { 'WARNING_TYPE_NAME': '特征数据停更预警', 'WARNING_CONTENT': '', diff --git a/lib/dataread.py b/lib/dataread.py index f8475c0..6ae6853 100644 --- a/lib/dataread.py +++ b/lib/dataread.py @@ -941,9 +941,9 @@ def zhoududatachuli(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_time # 删除预测列空值的行 ''' 工作日缺失,如果删除,会影响预测结果,导致统计准确率出错 ''' # df = df.dropna(subset=['y']) - config.logger.info(f'删除预测列为空值的行后数据量:{df.shape}') - df = df.dropna(axis=1, how='all') - config.logger.info(f'删除全为空值的列后数据量:{df.shape}') + # config.logger.info(f'删除预测列为空值的行后数据量:{df.shape}') + # df = df.dropna(axis=1, how='all') + # config.logger.info(f'删除全为空值的列后数据量:{df.shape}') df.to_csv(os.path.join(dataset, '未填充的特征数据.csv'), index=False) # 去掉指标列表中的columns_to_drop的行 df_zhibiaoliebiao = df_zhibiaoliebiao[df_zhibiaoliebiao['指标名称'].isin( @@ -972,6 +972,7 @@ def zhoududatachuli(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_time return df + def datachuli_juxiting(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_time='', y='y', dataset='dataset', delweekenday=False, add_kdj=False, is_timefurture=False): ''' 聚烯烃特征数据处理函数, @@ -1071,6 +1072,101 @@ def datachuli_juxiting(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_t return df +def yuedudatachuli_shiyoujiaolvyong(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_time='', y='y', dataset='dataset', delweekenday=False, add_kdj=False, is_timefurture=False): + ''' + 原油特征周度数据处理函数, + 接收的是两个df,一个是指标数据,一个是指标列表 + 输出的是一个df,包含ds,y,指标列 + ''' + df = df_zhibiaoshuju.copy() + + if end_time == '': + end_time = datetime.datetime.now().strftime('%Y-%m-%d') + # 重命名时间列,预测列 + df.rename(columns={datecol: 'ds'}, inplace=True) + df.rename(columns={y: 'y'}, inplace=True) + # 按时间顺序排列 + df.sort_values(by='ds', inplace=True) + df['ds'] = pd.to_datetime(df['ds']) + # 获取start_year年到end_time的数据 + df = df[df['ds'].dt.year >= config.start_year] + df = df[df['ds'] <= end_time] + # last_update_times_df,y_last_update_time = create_feature_last_update_time(df) + # config.logger.info(f'删除预警的特征前数据量:{df.shape}') + # columns_to_drop = last_update_times_df[last_update_times_df['warning_date'] < y_last_update_time ]['feature'].values.tolist() + # df = df.drop(columns = columns_to_drop) + # config.logger.info(f'删除预警的特征后数据量:{df.shape}') + # if is_update_warning_data: + # upload_warning_info(last_update_times_df,y_last_update_time) + # 去掉近最后数据对应的日期在六月以前的列,删除近2月的数据是常熟的列 + if config.is_del_tow_month: + current_date = datetime.datetime.now() + two_months_ago = current_date - timedelta(days=180) + config.logger.info(f'删除两月不更新特征前数据量:{df.shape}') + columns_to_drop = [] + for clo in df.columns: + if check_column(df, clo, two_months_ago): + columns_to_drop.append(clo) + df = df.drop(columns=columns_to_drop) + + config.logger.info(f'删除两月不更新特征后数据量:{df.shape}') + + if config.freq == 'W': + # 按周取样 + df = df.resample('W', on='ds').mean().reset_index() + elif config.freq == 'M': + # 按月取样 + df = df.resample('M', on='ds').mean().reset_index() + # 删除预测列空值的行 + ''' 工作日缺失,如果删除,会影响预测结果,导致统计准确率出错 ''' + # df = df.dropna(subset=['y']) + config.logger.info(f'删除预测列为空值的行后数据量:{df.shape}') + df = df.dropna(axis=1, how='all') + config.logger.info(f'删除全为空值的列后数据量:{df.shape}') + df.to_csv(os.path.join(dataset, '未填充的特征数据.csv'), index=False) + # 去掉指标列表中的columns_to_drop的行 + df_zhibiaoliebiao = df_zhibiaoliebiao[df_zhibiaoliebiao['指标名称'].isin( + df.columns.tolist())] + df_zhibiaoliebiao.to_csv(os.path.join( + dataset, '特征处理后的指标名称及分类.csv'), index=False) + # 数据频度分析 + featurePindu(dataset=dataset) + # 填充开始日期到df中最小日期的数据 + if df['ds'].min() > datetime.datetime(config.start_year, 1, 1): + # 生成从2000-01-31到最小日期的月末日期序列 + start_date = datetime.datetime(config.start_year, 1, 1) + end_date = df['ds'].min() - pd.DateOffset(days=1) # 最小日期前一天 + date_range = pd.date_range( + start=start_date, + end=end_date, + freq='M' # 每月最后一天 + ) + + # 创建新DataFrame并合并 + new_rows = pd.DataFrame({'ds': date_range}) + df = pd.concat([new_rows, df]).sort_values('ds').reset_index(drop=True) + # 向上填充 + df = df.ffill() + # 向下填充 + df = df.bfill() + + # 删除周六日的数据 + if delweekenday: + df = df[df['ds'].dt.weekday < 5] + + # kdj指标 + if add_kdj: + df = calculate_kdj(df) + # 衍生时间特征 + if is_timefurture: + df = addtimecharacteristics(df=df, dataset=dataset) + # 特征分析 + featureAnalysis(df, dataset=dataset, y=y) + return df + + + + def getdata(filename, datecol='date', y='y', dataset='', add_kdj=False, is_timefurture=False, end_time=''): config.logger.info('getdata接收:'+filename+' '+datecol+' '+end_time) # 判断后缀名 csv或excel diff --git a/main_shiyoujiao_lvyong_yuedu.py b/main_shiyoujiao_lvyong_yuedu.py index d2baba9..83c3ea1 100644 --- a/main_shiyoujiao_lvyong_yuedu.py +++ b/main_shiyoujiao_lvyong_yuedu.py @@ -254,7 +254,7 @@ def predict_main(): df_zhibiaoliebiao.to_excel(file, sheet_name='指标列表', index=False) # 数据处理 - df = zhoududatachuli(df_zhibiaoshuju, df_zhibiaoliebiao, y=global_config['y'], dataset=dataset, add_kdj=add_kdj, is_timefurture=is_timefurture, + df = yuedudatachuli_shiyoujiaolvyong(df_zhibiaoshuju, df_zhibiaoliebiao, y=global_config['y'], dataset=dataset, add_kdj=add_kdj, is_timefurture=is_timefurture, end_time=end_time) else: