From 3b0011ceebf5054bec001bdd1396f31ae2210ba5 Mon Sep 17 00:00:00 2001
From: jingboyitiji
Date: Tue, 11 Mar 2025 11:25:43 +0800
Subject: [PATCH] =?UTF-8?q?=E8=81=9A=E7=83=AF=E7=83=83=E6=97=A5=E5=BA=A6?=
 =?UTF-8?q?=E8=B0=83=E8=AF=95=E9=80=9A=E8=BF=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config_juxiting.py           |   4 +-
 main_juxiting.py             |   5 +-
 models/nerulforcastmodels.py | 315 +++++++++++++++++------------------
 3 files changed, 162 insertions(+), 162 deletions(-)

diff --git a/config_juxiting.py b/config_juxiting.py
index 2977718..26b8bd0 100644
--- a/config_juxiting.py
+++ b/config_juxiting.py
@@ -202,8 +202,8 @@ table_name = 'v_tbl_crude_oil_warning'

 # 开关
 is_train = True  # 是否训练
-is_debug = True  # 是否调试
-is_eta = False  # 是否使用eta接口
+is_debug = False  # 是否调试
+is_eta = True  # 是否使用eta接口
 is_market = False  # 是否通过市场信息平台获取特征 ,在is_eta 为true 的情况下生效
 is_timefurture = True  # 是否使用时间特征
 is_fivemodels = False  # 是否使用之前保存的最佳的5个模型
diff --git a/main_juxiting.py b/main_juxiting.py
index 1e4054b..9126724 100644
--- a/main_juxiting.py
+++ b/main_juxiting.py
@@ -39,6 +39,7 @@ global_config.update({
     'is_del_tow_month': is_del_tow_month,
     'is_eta': is_eta,
     'is_update_eta': is_update_eta,
+    'is_fivemodels': is_fivemodels,
     'early_stop_patience_steps': early_stop_patience_steps,

     # 时间参数
@@ -339,13 +340,13 @@ def predict_main():
     logger.info('模型训练完成')

     logger.info('训练数据绘图ing')
-    model_results3 = model_losss_juxiting(sqlitedb, end_time=end_time)
+    model_results3 = model_losss_juxiting(sqlitedb, end_time=global_config['end_time'], is_fivemodels=global_config['is_fivemodels'])
     logger.info('训练数据绘图end')

     # # 模型报告
     logger.info('制作报告ing')
     title = f'{settings}--{end_time}-预测报告'  # 报告标题
-    reportname = f'Brent原油大模型月度预测--{end_time}.pdf'  # 报告文件名
+    reportname = f'Brent原油大模型日度预测--{end_time}.pdf'  # 报告文件名
     reportname = reportname.replace(':', '-')  # 替换冒号
     pp_export_pdf(dataset=dataset, num_models=5 if is_fivemodels else 22, time=end_time,
                   reportname=reportname, sqlitedb=sqlitedb),
diff --git a/models/nerulforcastmodels.py b/models/nerulforcastmodels.py
index 5824b6e..2cd550c 100644
--- a/models/nerulforcastmodels.py
+++ b/models/nerulforcastmodels.py
@@ -173,9 +173,9 @@ def ex_Model(df, horizon, input_size, train_steps, val_check_steps, early_stop_p
     if is_fivemodels:
         # 获取之前存好的最好的五个模型
-        with open(os.path.join(dataset, 'best_modelnames.txt'), 'r', encoding='utf-8') as f:
+        with open(os.path.join(config.dataset, 'best_modelnames.txt'), 'r', encoding='utf-8') as f:
             best_modelnames = f.readlines()[0]
-            logger.info(f'获取本地最佳模型名称:{best_modelnames}')
+            config.logger.info(f'获取本地最佳模型名称:{best_modelnames}')

         # 重新拼接models
         all_models = models
@@ -194,7 +194,7 @@ def ex_Model(df, horizon, input_size, train_steps, val_check_steps, early_stop_p
         nf_preds = nf.cross_validation(
             df=df_train, val_size=val_size, test_size=test_size, n_windows=None)
         nf_preds.to_csv(os.path.join(
-            dataset, "cross_validation.csv"), index=False)
+            config.dataset, "cross_validation.csv"), index=False)
         nf_preds = nf_preds.reset_index()

         # 保存模型
@@ -202,12 +202,12 @@ def ex_Model(df, horizon, input_size, train_steps, val_check_steps, early_stop_p
         filename = f'{settings}--{now}.joblib'
         # 文件名去掉冒号
         filename = filename.replace(':', '-')  # 替换冒号
-        dump(nf, os.path.join(dataset, filename))
+        dump(nf, os.path.join(config.dataset, filename))
     else:
         # glob获取dataset下最新的joblib文件
         import glob
         filename = max(glob.glob(os.path.join(
-            dataset, '*.joblib')), key=os.path.getctime)
+            config.dataset, '*.joblib')), key=os.path.getctime)
         config.logger.info('读取模型:' + filename)
         nf = load(filename)
     # 测试集预测
@@ -215,7 +215,7 @@ def ex_Model(df, horizon, input_size, train_steps, val_check_steps, early_stop_p
         df=df_test, val_size=val_size, test_size=test_size, n_windows=None)
     # 测试集预测结果保存
     nf_test_preds.to_csv(os.path.join(
-        dataset, "cross_validation.csv"), index=False)
+        config.dataset, "cross_validation.csv"), index=False)

     df_test['ds'] = pd.to_datetime(df_test['ds'], errors='coerce')
@@ -231,7 +231,7 @@ def ex_Model(df, horizon, input_size, train_steps, val_check_steps, early_stop_p
     df_predict['created_dt'] = end_time

     # 保存预测值
-    df_predict.to_csv(os.path.join(dataset, "predict.csv"), index=False)
+    df_predict.to_csv(os.path.join(config.dataset, "predict.csv"), index=False)

     # 将预测结果保存到数据库
     save_to_database(config.sqlitedb, df_predict, 'predict', end_time)
@@ -330,11 +330,11 @@ def ex_Model_Juxiting(df, horizon, input_size, train_steps, val_check_steps, ear
     df_test['unique_id'] = 1

     # 显示划分后的数据集的前几行
-    logger.info("Training set head:")
-    logger.info(df_train.head())
+    config.logger.info("Training set head:")
+    config.logger.info(df_train.head())

-    logger.info("\nTesting set head:")
-    logger.info(df_test.head())
+    config.logger.info("\nTesting set head:")
+    config.logger.info(df_test.head())

     models = [
         NHITS(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps,
@@ -391,9 +391,9 @@ def ex_Model_Juxiting(df, horizon, input_size, train_steps, val_check_steps, ear
     if is_fivemodels:
         # 获取之前存好的最好的五个模型
-        with open(os.path.join(dataset, 'best_modelnames.txt'), 'r', encoding='utf-8') as f:
+        with open(os.path.join(config.dataset, 'best_modelnames.txt'), 'r', encoding='utf-8') as f:
             best_modelnames = f.readlines()[0]
-            logger.info(f'获取本地最佳模型名称:{best_modelnames}')
+            config.logger.info(f'获取本地最佳模型名称:{best_modelnames}')

         # 重新拼接models
         all_models = models
@@ -411,7 +411,7 @@ def ex_Model_Juxiting(df, horizon, input_size, train_steps, val_check_steps, ear
         nf_preds = nf.cross_validation(
             df=df_train, val_size=val_size, test_size=test_size, n_windows=None)
         nf_preds.to_csv(os.path.join(
-            dataset, "cross_validation.csv"), index=False)
+            config.dataset, "cross_validation.csv"), index=False)
         nf_preds = nf_preds.reset_index()

         # 保存模型
@@ -419,20 +419,20 @@ def ex_Model_Juxiting(df, horizon, input_size, train_steps, val_check_steps, ear
         filename = f'{settings}--{now}.joblib'
         # 文件名去掉冒号
         filename = filename.replace(':', '-')  # 替换冒号
-        dump(nf, os.path.join(dataset, filename))
+        dump(nf, os.path.join(config.dataset, filename))
     else:
         # glob获取dataset下最新的joblib文件
         import glob
         filename = max(glob.glob(os.path.join(
-            dataset, '*.joblib')), key=os.path.getctime)
-        logger.info('读取模型:' + filename)
+            config.dataset, '*.joblib')), key=os.path.getctime)
+        config.logger.info('读取模型:' + filename)
         nf = load(filename)
     # 测试集预测
     nf_test_preds = nf.cross_validation(
         df=df_test, val_size=val_size, test_size=test_size, n_windows=None)
     # 测试集预测结果保存
     nf_test_preds.to_csv(os.path.join(
-        dataset, "cross_validation.csv"), index=False)
+        config.dataset, "cross_validation.csv"), index=False)

     df_test['ds'] = pd.to_datetime(df_test['ds'], errors='coerce')
@@ -451,7 +451,7 @@ def ex_Model_Juxiting(df, horizon, input_size, train_steps, val_check_steps, ear
     df_predict['created_dt'] = end_time

     # 保存预测值
-    df_predict.to_csv(os.path.join(dataset, "predict.csv"), index=False)
+    df_predict.to_csv(os.path.join(config.dataset, "predict.csv"), index=False)

     # 将预测结果保存到数据库
     save_to_database(sqlitedb, df_predict, 'predict', end_time)
@@ -485,18 +485,18 @@ def model_losss_yongan(sqlitedb, end_time, table_name_prefix):
     most_model_name = most_model[0]

     # 预测数据处理 predict
-    # df_combined = loadcsv(os.path.join(dataset,"cross_validation.csv"))
+    # df_combined = loadcsv(os.path.join(config.dataset,"cross_validation.csv"))
     # df_combined = dateConvert(df_combined)
     df_combined = sqlitedb.select_data(
         'accuracy', where_condition=f"created_dt <= '{end_time}'")
     df_combined4 = df_combined.copy()  # 备份df_combined,后面画图需要
     # 删除缺失值大于80%的列
-    logger.info(df_combined.shape)
+    config.logger.info(df_combined.shape)
     df_combined = df_combined.loc[:, df_combined.isnull().mean() < 0.8]
-    logger.info(df_combined.shape)
+    config.logger.info(df_combined.shape)
     # 删除缺失值
     df_combined.dropna(inplace=True)
-    logger.info(df_combined.shape)
+    config.logger.info(df_combined.shape)
     # 其他列转为数值类型
     df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in [
         'CREAT_DATE', 'ds', 'created_dt']})
@@ -536,7 +536,7 @@ def model_losss_yongan(sqlitedb, end_time, table_name_prefix):
     model_results3 = model_results3.sort_values(
         by='平均平方误差(MSE)', ascending=True)
     model_results3.to_csv(os.path.join(
-        dataset, "model_evaluation.csv"), index=False)
+        config.dataset, "model_evaluation.csv"), index=False)
     modelnames = model_results3['模型(Model)'].tolist()
     allmodelnames = modelnames.copy()
     # 保存5个最佳模型的名称
@@ -545,7 +545,7 @@ def model_losss_yongan(sqlitedb, end_time, table_name_prefix):
     if is_fivemodels:
         pass
     else:
-        with open(os.path.join(dataset, "best_modelnames.txt"), 'w') as f:
+        with open(os.path.join(config.dataset, "best_modelnames.txt"), 'w') as f:
             f.write(','.join(modelnames) + '\n')

     # 预测值与真实值对比图
@@ -560,12 +560,12 @@ def model_losss_yongan(sqlitedb, end_time, table_name_prefix):
         plt.ylabel('价格')
         plt.title(model+'拟合')
         plt.subplots_adjust(hspace=0.5)
-    plt.savefig(os.path.join(dataset, '预测值与真实值对比图.png'), bbox_inches='tight')
+    plt.savefig(os.path.join(config.dataset, '预测值与真实值对比图.png'), bbox_inches='tight')
     plt.close()

     # # 历史数据+预测数据
     # # 拼接未来时间预测
-    df_predict = pd.read_csv(os.path.join(dataset, 'predict.csv'))
+    df_predict = pd.read_csv(os.path.join(config.dataset, 'predict.csv'))
     df_predict.drop('unique_id', inplace=True, axis=1)
     df_predict.dropna(axis=1, inplace=True)
@@ -713,7 +713,7 @@ def model_losss_yongan(sqlitedb, end_time, table_name_prefix):
             sqlitedb.update_data(
                 'accuracy', f"min_price = {row['min_price'].values[0]},max_price = {row['max_price'].values[0]},mean={row['mean'].values[0]}", f"id = {id}")
         except:
-            logger.error(f'更新accuracy表中的min_price,max_price,mean值失败,row={row}')
+            config.logger.error(f'更新accuracy表中的min_price,max_price,mean值失败,row={row}')

     df = accuracy_df.copy()
     df['ds'] = pd.to_datetime(df['ds'])
@@ -802,7 +802,7 @@ def model_losss_yongan(sqlitedb, end_time, table_name_prefix):
         except ValueError:
             pass
     df_combined3.to_csv(os.path.join(
-        dataset, "testandpredict_groupby.csv"), index=False)
+        config.dataset, "testandpredict_groupby.csv"), index=False)

     # 历史价格+预测价格
     sqlitedb.drop_table('testandpredict_groupby')
@@ -836,7 +836,7 @@ def model_losss_yongan(sqlitedb, end_time, table_name_prefix):
         plt.xlabel('日期')
         plt.ylabel('价格')

-        plt.savefig(os.path.join(dataset, '历史价格-预测值.png'), bbox_inches='tight')
+        plt.savefig(os.path.join(config.dataset, '历史价格-预测值.png'), bbox_inches='tight')
         plt.close()

     def _plt_modeltopten_predict_ture(df):
@@ -870,7 +870,7 @@ def model_losss_yongan(sqlitedb, end_time, table_name_prefix):
         plt.xlabel('日期')
         plt.ylabel('价格')

-        plt.savefig(os.path.join(dataset, '历史价格-预测值1.png'),
+        plt.savefig(os.path.join(config.dataset, '历史价格-预测值1.png'),
                     bbox_inches='tight')
         plt.close()
@@ -891,7 +891,7 @@ def model_losss_yongan(sqlitedb, end_time, table_name_prefix):
         table.set_fontsize(10)

         # 设置表格样式,列数据最小的用绿色标识
-        plt.savefig(os.path.join(dataset, '预测值表格.png'), bbox_inches='tight')
+        plt.savefig(os.path.join(config.dataset, '预测值表格.png'), bbox_inches='tight')
         plt.close()

     def _plt_model_results3():
@@ -906,7 +906,7 @@ def model_losss_yongan(sqlitedb, end_time, table_name_prefix):
         table.set_fontsize(10)

         # 设置表格样式,列数据最小的用绿色标识
-        plt.savefig(os.path.join(dataset, '模型评估.png'), bbox_inches='tight')
+        plt.savefig(os.path.join(config.dataset, '模型评估.png'), bbox_inches='tight')
         plt.close()

     _plt_predict_ture(df_combined3)
@@ -1405,19 +1405,19 @@ def model_losss_juxitingbak(sqlitedb, end_time):
     most_model_name = most_model[0]

     # 预测数据处理 predict
-    df_combined = loadcsv(os.path.join(dataset, "cross_validation.csv"))
+    df_combined = loadcsv(os.path.join(config.dataset, "cross_validation.csv"))
     df_combined.drop(columns=['cutoff'], inplace=True)
     df_combined['CREAT_DATE'] = end_time
     df_combined = dateConvert(df_combined)
     # df_combined = sqlitedb.select_data('accuracy',where_condition=f"created_dt <= '{end_time}'")
     df_combined4 = df_combined.copy()  # 备份df_combined,后面画图需要
     # 删除缺失值大于80%的列
-    logger.info(df_combined.shape)
+    config.logger.info(df_combined.shape)
     df_combined = df_combined.loc[:, df_combined.isnull().mean() < 0.8]
-    logger.info(df_combined.shape)
+    config.logger.info(df_combined.shape)
     # 删除缺失值
     df_combined.dropna(inplace=True)
-    logger.info(df_combined.shape)
+    config.logger.info(df_combined.shape)
     # 其他列转为数值类型
     df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in [
         'CREAT_DATE', 'ds', 'created_dt']})
@@ -1457,7 +1457,7 @@ def model_losss_juxitingbak(sqlitedb, end_time):
     model_results3 = model_results3.sort_values(
         by='平均平方误差(MSE)', ascending=True)
     model_results3.to_csv(os.path.join(
-        dataset, "model_evaluation.csv"), index=False)
+        config.dataset, "model_evaluation.csv"), index=False)
     modelnames = model_results3['模型(Model)'].tolist()
     allmodelnames = modelnames.copy()
     # 保存5个最佳模型的名称
@@ -1466,7 +1466,7 @@ def model_losss_juxitingbak(sqlitedb, end_time):
     if is_fivemodels:
         pass
     else:
-        with open(os.path.join(dataset, "best_modelnames.txt"), 'w') as f:
+        with open(os.path.join(config.dataset, "best_modelnames.txt"), 'w') as f:
             f.write(','.join(modelnames) + '\n')

     # 预测值与真实值对比图
@@ -1481,12 +1481,12 @@ def model_losss_juxitingbak(sqlitedb, end_time):
         plt.ylabel('价格')
         plt.title(model+'拟合')
         plt.subplots_adjust(hspace=0.5)
-    plt.savefig(os.path.join(dataset, '预测值与真实值对比图.png'), bbox_inches='tight')
+    plt.savefig(os.path.join(config.dataset, '预测值与真实值对比图.png'), bbox_inches='tight')
     plt.close()

     # # 历史数据+预测数据
     # # 拼接未来时间预测
-    df_predict = pd.read_csv(os.path.join(dataset, 'predict.csv'))
+    df_predict = pd.read_csv(os.path.join(config.dataset, 'predict.csv'))
     df_predict.drop('unique_id', inplace=True, axis=1)
     df_predict.dropna(axis=1, inplace=True)
@@ -1634,7 +1634,7 @@ def model_losss_juxitingbak(sqlitedb, end_time):
             sqlitedb.update_data(
                 'accuracy', f"min_price = {row['min_price'].values[0]},max_price = {row['max_price'].values[0]},mean={row['mean'].values[0]}", f"id = {id}")
         except:
-            logger.error(f'更新accuracy表中的min_price,max_price,mean值失败,row={row}')
+            config.logger.error(f'更新accuracy表中的min_price,max_price,mean值失败,row={row}')

     df = accuracy_df.copy()
     df['ds'] = pd.to_datetime(df['ds'])
@@ -1723,7 +1723,7 @@ def model_losss_juxitingbak(sqlitedb, end_time):
         except ValueError:
             pass
     df_combined3.to_csv(os.path.join(
-        dataset, "testandpredict_groupby.csv"), index=False)
+        config.dataset, "testandpredict_groupby.csv"), index=False)

     # 历史价格+预测价格
     sqlitedb.drop_table('testandpredict_groupby')
@@ -1767,7 +1767,7 @@ def model_losss_juxitingbak(sqlitedb, end_time):
         plt.xticks(rotation=45)  # 日期标签旋转45度,防止重叠
         plt.ylabel('价格')

-        plt.savefig(os.path.join(dataset, '历史价格-预测值.png'), bbox_inches='tight')
+        plt.savefig(os.path.join(config.dataset, '历史价格-预测值.png'), bbox_inches='tight')
         plt.close()

     def _plt_modeltopten_predict_ture(df):
@@ -1806,7 +1806,7 @@ def model_losss_juxitingbak(sqlitedb, end_time):
         plt.ylabel('价格')

-        plt.savefig(os.path.join(dataset, '历史价格-预测值1.png'),
+        plt.savefig(os.path.join(config.dataset, '历史价格-预测值1.png'),
                     bbox_inches='tight')
         plt.close()
@@ -1827,7 +1827,7 @@ def model_losss_juxitingbak(sqlitedb, end_time):
         table.set_fontsize(10)

         # 设置表格样式,列数据最小的用绿色标识
-        plt.savefig(os.path.join(dataset, '预测值表格.png'), bbox_inches='tight')
+        plt.savefig(os.path.join(config.dataset, '预测值表格.png'), bbox_inches='tight')
         plt.close()

     def _plt_model_results3():
@@ -1842,7 +1842,7 @@ def model_losss_juxitingbak(sqlitedb, end_time):
         table.set_fontsize(10)

         # 设置表格样式,列数据最小的用绿色标识
-        plt.savefig(os.path.join(dataset, '模型评估.png'), bbox_inches='tight')
+        plt.savefig(os.path.join(config.dataset, '模型评估.png'), bbox_inches='tight')
         plt.close()

     _plt_predict_ture(df_combined3)
@@ -1855,7 +1855,7 @@ def model_losss_juxitingbak(sqlitedb, end_time):

 # 聚烯烃计算预测评估指数
 @exception_logger
-def model_losss_juxiting(sqlitedb):
+def model_losss_juxiting(sqlitedb, end_time, is_fivemodels):
     global dataset
     global rote
     most_model = [sqlitedb.select_data('most_model', columns=[
@@ -1863,7 +1863,7 @@ def model_losss_juxiting(sqlitedb):
     most_model_name = most_model[0]

     # 预测数据处理 predict
-    df_combined = loadcsv(os.path.join(dataset, "cross_validation.csv"))
+    df_combined = loadcsv(os.path.join(config.dataset, "cross_validation.csv"))
     df_combined = dateConvert(df_combined)
     # 删除空列
     df_combined.dropna(axis=1, inplace=True)
@@ -1909,7 +1909,7 @@ def model_losss_juxiting(sqlitedb):
     model_results3 = model_results3.sort_values(
         by='平均平方误差(MSE)', ascending=True)
     model_results3.to_csv(os.path.join(
-        dataset, "model_evaluation.csv"), index=False)
+        config.dataset, "model_evaluation.csv"), index=False)
     modelnames = model_results3['模型(Model)'].tolist()
     allmodelnames = modelnames.copy()
     # 保存5个最佳模型的名称
@@ -1918,7 +1918,7 @@ def model_losss_juxiting(sqlitedb):
     if is_fivemodels:
         pass
     else:
-        with open(os.path.join(dataset, "best_modelnames.txt"), 'w') as f:
+        with open(os.path.join(config.dataset, "best_modelnames.txt"), 'w') as f:
             f.write(','.join(modelnames) + '\n')

     # 预测值与真实值对比图
@@ -1933,12 +1933,12 @@ def model_losss_juxiting(sqlitedb):
         plt.ylabel('价格')
         plt.title(model+'拟合')
         plt.subplots_adjust(hspace=0.5)
-    plt.savefig(os.path.join(dataset, '预测值与真实值对比图.png'), bbox_inches='tight')
+    plt.savefig(os.path.join(config.dataset, '预测值与真实值对比图.png'), bbox_inches='tight')
     plt.close()

     # # 历史数据+预测数据
     # # 拼接未来时间预测
-    df_predict = pd.read_csv(os.path.join(dataset, 'predict.csv'))
+    df_predict = pd.read_csv(os.path.join(config.dataset, 'predict.csv'))
     df_predict.drop('unique_id', inplace=True, axis=1)
     df_predict.dropna(axis=1, inplace=True)
@@ -1990,7 +1990,7 @@ def model_losss_juxiting(sqlitedb):
     def add_rote_column(row):
         columns = []
         for r in names_df.columns:
-            if row[r] <= rote:
+            if row[r] <= config.rote:
                 columns.append(r.split('-')[0])
         return pd.Series([columns], index=['columns'])
     names_df['columns'] = names_df.apply(add_rote_column, axis=1)
@@ -2061,7 +2061,7 @@ def model_losss_juxiting(sqlitedb):
         except ValueError:
             pass
     df_combined3.to_csv(os.path.join(
-        dataset, "testandpredict_groupby.csv"), index=False)
+        config.dataset, "testandpredict_groupby.csv"), index=False)

     # 历史价格+预测价格
     sqlitedb.drop_table('testandpredict_groupby')
@@ -2092,12 +2092,12 @@ def model_losss_juxiting(sqlitedb):
         for model in most_model:
             plt.plot(df['ds'], df[model], label=model, marker='o')
         # 当前日期画竖虚线
-        plt.axvline(x=df['ds'].iloc[-horizon], color='r', linestyle='--')
+        plt.axvline(x=df['ds'].iloc[-config.horizon], color='r', linestyle='--')
         plt.legend()
         plt.xlabel('日期')
         plt.ylabel('价格')

-        plt.savefig(os.path.join(dataset, '历史价格-预测值.png'), bbox_inches='tight')
+        plt.savefig(os.path.join(config.dataset, '历史价格-预测值.png'), bbox_inches='tight')
         plt.close()

     def _plt_predict_table(df):
@@ -2106,8 +2106,8 @@ def model_losss_juxiting(sqlitedb):
         ax.axis('off')  # 关闭坐标轴
         # 数值保留2位小数
         df = df.round(2)
-        df = df[-horizon:]
-        df['Day'] = [f'Day_{i}' for i in range(1, horizon+1)]
+        df = df[-config.horizon:]
+        df['Day'] = [f'Day_{i}' for i in range(1, config.horizon+1)]
         # Day列放到最前面
         df = df[['Day'] + list(df.columns[:-1])]
         table = ax.table(cellText=df.values,
@@ -2117,7 +2117,7 @@ def model_losss_juxiting(sqlitedb):
         table.set_fontsize(10)

         # 设置表格样式,列数据最小的用绿色标识
-        plt.savefig(os.path.join(dataset, '预测值表格.png'), bbox_inches='tight')
+        plt.savefig(os.path.join(config.dataset, '预测值表格.png'), bbox_inches='tight')
         plt.close()

     def _plt_model_results3():
@@ -2132,7 +2132,7 @@ def model_losss_juxiting(sqlitedb):
         table.set_fontsize(10)

         # 设置表格样式,列数据最小的用绿色标识
-        plt.savefig(os.path.join(dataset, '模型评估.png'), bbox_inches='tight')
+        plt.savefig(os.path.join(config.dataset, '模型评估.png'), bbox_inches='tight')
         plt.close()

     _plt_predict_ture(df_combined3)
@@ -2150,7 +2150,7 @@ def brent_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, in
     # 获取特征的近一月值
     import pandas as pd
     feature_data_df = pd.read_csv(os.path.join(
-        dataset, '指标数据添加时间特征.csv'), parse_dates=['ds']).tail(60)
+        config.dataset, '指标数据添加时间特征.csv'), parse_dates=['ds']).tail(60)

     def draw_feature_trend(feature_data_df, features):
         # 画特征近60天的趋势图
@@ -2198,9 +2198,9 @@ def brent_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, in
             col = col.replace('*', '-')
             col = col.replace(':', '-')
             col = col.replace(r'/', '-')
-            plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
+            plt.savefig(os.path.join(config.dataset, f'{col}与价格散点图.png'))
             content.append(Graphs.draw_img(
-                os.path.join(dataset, f'{col}与价格散点图.png')))
+                os.path.join(config.dataset, f'{col}与价格散点图.png')))
             plt.close()
             # except Exception as e:
             #     print(f'绘制第{i+1}个特征{col}与价格散点图时出错:{e}')
@@ -2211,26 +2211,26 @@ def brent_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, in
     # 预测结果
     content.append(Graphs.draw_little_title('一、预测结果:'))
     # 添加历史走势及预测价格的走势图片
-    content.append(Graphs.draw_img(os.path.join(dataset, '历史价格-预测值.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '历史价格-预测值.png')))
     # 波动率画图逻辑
     content.append(Graphs.draw_text('图示说明:'))
     content.append(Graphs.draw_text(
         ' 确定置信区间:设置残差置信阈值,以每周最佳模型为基准,选取在置信区间的预测值作为置信区间;'))
     # 添加历史走势及预测价格的走势图片
-    content.append(Graphs.draw_img(os.path.join(dataset, '历史价格-预测值1.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '历史价格-预测值1.png')))
     content.append(Graphs.draw_text('图示说明:'))
     content.append(Graphs.draw_text(
         ' 确定置信区间:使用模型评估指标MAE得到前十个模型,取平均值上下1.5作为价格波动置信区间;'))

     # 取df中y列为空的行
     import pandas as pd
-    df = pd.read_csv(os.path.join(dataset, 'predict.csv'), encoding='gbk')
+    df = pd.read_csv(os.path.join(config.dataset, 'predict.csv'), encoding='gbk')
     df_true = pd.read_csv(os.path.join(
-        dataset, '指标数据添加时间特征.csv'), encoding='utf-8')  # 获取预测日期对应的真实值
+        config.dataset, '指标数据添加时间特征.csv'), encoding='utf-8')  # 获取预测日期对应的真实值
     df_true = df_true[['ds', 'y']]
     eval_df = pd.read_csv(os.path.join(
-        dataset, 'model_evaluation.csv'), encoding='utf-8')
+        config.dataset, 'model_evaluation.csv'), encoding='utf-8')
     # 按评估指标排序,取前五
     fivemodels_list = eval_df['模型(Model)'].values  # 列表形式,后面当作列名索引使用
     # 取 fivemodels_list 和 ds 列
@@ -2258,7 +2258,7 @@ def brent_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, in
     content.append(Graphs.draw_table(col_width, *data))
     content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:'))
     df = pd.read_csv(os.path.join(
-        dataset, 'testandpredict_groupby.csv'), encoding='utf-8')
+        config.dataset, 'testandpredict_groupby.csv'), encoding='utf-8')
     df4 = df.copy()  # 计算偏差率使用
     # 去掉created_dt 列
     df4 = df4.drop(columns=['created_dt'])
@@ -2303,14 +2303,14 @@ def brent_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, in
         f'本次预测使用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型:'))
     content.append(Graphs.draw_text(f'使用40天的数据预测未来{inputsize}天的数据。'))
     content.append(Graphs.draw_little_title('指标情况:'))
-    with open(os.path.join(dataset, '特征频度统计.txt'), encoding='utf-8') as f:
+    with open(os.path.join(config.dataset, '特征频度统计.txt'), encoding='utf-8') as f:
         for line in f.readlines():
             content.append(Graphs.draw_text(line))

-    data = pd.read_csv(os.path.join(dataset, '指标数据添加时间特征.csv'),
+    data = pd.read_csv(os.path.join(config.dataset, '指标数据添加时间特征.csv'),
                        encoding='utf-8')  # 计算相关系数用
     df_zhibiaofenlei = loadcsv(os.path.join(
-        dataset, '特征处理后的指标名称及分类.csv'))  # 气泡图用
+        config.dataset, '特征处理后的指标名称及分类.csv'))  # 气泡图用
     df_zhibiaoshuju = data.copy()  # 气泡图用

     # 绘制特征相关气泡图
@@ -2342,10 +2342,10 @@ def brent_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, in
         plt.xlabel('相关系数')
         plt.ylabel('频数')
         plt.savefig(os.path.join(
-            dataset, f'{name}类指标相关性直方分布图.png'), bbox_inches='tight')
+            config.dataset, f'{name}类指标相关性直方分布图.png'), bbox_inches='tight')
         plt.close()
         content.append(Graphs.draw_img(
-            os.path.join(dataset, f'{name}类指标相关性直方分布图.png')))
+            os.path.join(config.dataset, f'{name}类指标相关性直方分布图.png')))
         content.append(Graphs.draw_text(
             f'{name}类指标(共{len(cols_subset)}个)的相关性直方分布图如上所示。'))
         # 相关性大于0的特征
@@ -2398,10 +2398,10 @@ def brent_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, in
         grouped_corr['相关性总和'].min()*5, grouped_corr['相关性总和'].max()*5), hue='指标分类', palette='viridis')
     plt.title('指标分类相关性总和的气泡图')
     plt.ylabel('数量')
-    plt.savefig(os.path.join(dataset, '指标分类相关性总和的气泡图.png'),
+    plt.savefig(os.path.join(config.dataset, '指标分类相关性总和的气泡图.png'),
                 bbox_inches='tight')
     plt.close()
-    content.append(Graphs.draw_img(os.path.join(dataset, '指标分类相关性总和的气泡图.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '指标分类相关性总和的气泡图.png')))
     content.append(Graphs.draw_text(
         '气泡图中,横轴为指标分类,纵轴为指标分类下的特征数量,气泡的面积越大表示该分类中特征的相关系数和越大。'))
     config.logger.info(f'绘制相关性总和的气泡图结束')
@@ -2409,7 +2409,7 @@ def brent_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, in
     content.append(Graphs.draw_text(
         f'预测使用了{num_models}个模型进行训练拟合,通过评估指标MAE从小到大排列,前5个模型的简介如下:'))
     # 读取模型简介
-    with open(os.path.join(dataset, 'model_introduction.txt'), 'r', encoding='utf-8') as f:
+    with open(os.path.join(config.dataset, 'model_introduction.txt'), 'r', encoding='utf-8') as f:
         for line in f:
             line_split = line.strip().split('--')
             if line_split[0] in fivemodels_list:
@@ -2417,7 +2417,7 @@ def brent_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, in
                 content.append(Graphs.draw_text(introduction))
     content.append(Graphs.draw_little_title('模型评估:'))
     df = pd.read_csv(os.path.join(
-        dataset, 'model_evaluation.csv'), encoding='utf-8')
+        config.dataset, 'model_evaluation.csv'), encoding='utf-8')
     # 判断 df 的数值列转为float
     for col in eval_df.columns:
         if col not in ['模型(Model)']:
@@ -2443,14 +2443,14 @@ def brent_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, in
         '3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
     content.append(Graphs.draw_text('模型拟合:'))
     # 添加图片
-    content.append(Graphs.draw_img(os.path.join(dataset, '预测值与真实值对比图.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '预测值与真实值对比图.png')))
     # 生成pdf文件
-    doc = SimpleDocTemplate(os.path.join(dataset, reportname), pagesize=letter)
+    doc = SimpleDocTemplate(os.path.join(config.dataset, reportname), pagesize=letter)
     doc.build(content)
     # pdf 上传到数字化信息平台
     try:
         if config.is_update_report:
-            with open(os.path.join(dataset, reportname), 'rb') as f:
+            with open(os.path.join(config.dataset, reportname), 'rb') as f:
                 base64_data = base64.b64encode(f.read()).decode('utf-8')
                 upload_data["data"]["fileBase64"] = base64_data
                 upload_data["data"]["fileName"] = reportname
@@ -2462,13 +2462,12 @@ def brent_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, in

 @exception_logger
 def pp_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, inputsize=5, dataset='dataset', time='2024-07-30', reportname='report.pdf', sqlitedb='jbsh_yuanyou.db'):
-    global y
     # 创建内容对应的空列表
     content = list()
     # 获取特征的近一月值
     import pandas as pd
     feature_data_df = pd.read_csv(os.path.join(
-        dataset, '指标数据添加时间特征.csv'), parse_dates=['ds']).tail(20)
+        config.dataset, '指标数据添加时间特征.csv'), parse_dates=['ds']).tail(20)

     def draw_feature_trend(feature_data_df, features):
         # 画特征近一周的趋势图
@@ -2516,28 +2515,28 @@ def pp_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, input
             col = col.replace('*', '-')
             col = col.replace(':', '-')
             col = col.replace(r'/', '-')
-            plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
+            plt.savefig(os.path.join(config.dataset, f'{col}与价格散点图.png'))
             content.append(Graphs.draw_img(
-                os.path.join(dataset, f'{col}与价格散点图.png')))
+                os.path.join(config.dataset, f'{col}与价格散点图.png')))
             plt.close()
             # except Exception as e:
             #     print(f'绘制第{i+1}个特征{col}与价格散点图时出错:{e}')

     # 添加标题
-    content.append(Graphs.draw_title(f'{y}{time}预测报告'))
+    content.append(Graphs.draw_title(f'{config.y}{time}预测报告'))
     # 预测结果
     content.append(Graphs.draw_little_title('一、预测结果:'))
     # 添加历史走势及预测价格的走势图片
-    content.append(Graphs.draw_img(os.path.join(dataset, '历史价格-预测值.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '历史价格-预测值.png')))

     # 取df中y列为空的行
     import pandas as pd
-    df = pd.read_csv(os.path.join(dataset, 'predict.csv'), encoding='gbk')
+    df = pd.read_csv(os.path.join(config.dataset, 'predict.csv'), encoding='gbk')
     df_true = pd.read_csv(os.path.join(
-        dataset, '指标数据添加时间特征.csv'), encoding='utf-8')  # 获取预测日期对应的真实值
+        config.dataset, '指标数据添加时间特征.csv'), encoding='utf-8')  # 获取预测日期对应的真实值
     df_true = df_true[['ds', 'y']]
     eval_df = pd.read_csv(os.path.join(
-        dataset, 'model_evaluation.csv'), encoding='utf-8')
+        config.dataset, 'model_evaluation.csv'), encoding='utf-8')
     # 按评估指标排序,取前五
     fivemodels_list = eval_df['模型(Model)'].values  # 列表形式,后面当作列名索引使用
     # 取 fivemodels_list 和 ds 列
@@ -2565,7 +2564,7 @@ def pp_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, input
     content.append(Graphs.draw_table(col_width, *data))
     content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:'))
     df = pd.read_csv(os.path.join(
-        dataset, 'testandpredict_groupby.csv'), encoding='utf-8')
+        config.dataset, 'testandpredict_groupby.csv'), encoding='utf-8')
     df4 = df.copy()  # 计算偏差率使用
     # 计算模型偏差率
     # 计算各列对于y列的差值百分比
@@ -2601,14 +2600,14 @@ def pp_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, input
         f'本次预测使用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型:'))
     content.append(Graphs.draw_text(f'使用40天的数据预测未来{inputsize}天的数据。'))
     content.append(Graphs.draw_little_title('指标情况:'))
-    with open(os.path.join(dataset, '特征频度统计.txt'), encoding='utf-8') as f:
+    with open(os.path.join(config.dataset, '特征频度统计.txt'), encoding='utf-8') as f:
         for line in f.readlines():
             content.append(Graphs.draw_text(line))

-    data = pd.read_csv(os.path.join(dataset, '指标数据添加时间特征.csv'),
+    data = pd.read_csv(os.path.join(config.dataset, '指标数据添加时间特征.csv'),
                        encoding='utf-8')  # 计算相关系数用
     df_zhibiaofenlei = loadcsv(os.path.join(
-        dataset, '特征处理后的指标名称及分类.csv'))  # 气泡图用
+        config.dataset, '特征处理后的指标名称及分类.csv'))  # 气泡图用
     df_zhibiaoshuju = data.copy()  # 气泡图用

     # 绘制特征相关气泡图
@@ -2627,7 +2626,7 @@ def pp_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, input
         '''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。'''))
     for name, group in grouped:
         cols = group['指标名称'].tolist()
-        logger.info(f'开始绘制{name}类指标的相关性直方图')
+        config.logger.info(f'开始绘制{name}类指标的相关性直方图')
         cols_subset = cols
         feature_names = ['y'] + cols_subset
         correlation_matrix = df_zhibiaoshuju[feature_names].corr()['y']
@@ -2640,10 +2639,10 @@ def pp_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, input
         plt.xlabel('相关系数')
         plt.ylabel('频数')
         plt.savefig(os.path.join(
-            dataset, f'{name}类指标相关性直方分布图.png'), bbox_inches='tight')
+            config.dataset, f'{name}类指标相关性直方分布图.png'), bbox_inches='tight')
         plt.close()
         content.append(Graphs.draw_img(
-            os.path.join(dataset, f'{name}类指标相关性直方分布图.png')))
+            os.path.join(config.dataset, f'{name}类指标相关性直方分布图.png')))
         content.append(Graphs.draw_text(
             f'{name}类指标(共{len(cols_subset)}个)的相关性直方分布图如上所示。'))
         # 相关性大于0的特征
@@ -2683,7 +2682,7 @@ def pp_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, input

         # 计算correlation_sum 第一行的相关性的绝对值的总和
         correlation_sum = correlation_matrix.abs().sum()
-        logger.info(f'{name}类指标的相关性总和为:{correlation_sum}')
+        config.logger.info(f'{name}类指标的相关性总和为:{correlation_sum}')
         # 分组的相关性总和拼接到grouped_corr
         goup_corr = pd.DataFrame(
             {'指标分类': [name], '指标数量': [len(cols_subset)], '相关性总和': [correlation_sum]})
         grouped_corr = pd.concat(
             [grouped_corr, goup_corr], axis=0, ignore_index=True)
@@ -2691,26 +2690,26 @@ def pp_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, input

     # 绘制相关性总和的气泡图
-    logger.info(f'开始绘制相关性总和的气泡图')
+    config.logger.info(f'开始绘制相关性总和的气泡图')
     plt.figure(figsize=(10, 10))
     sns.scatterplot(data=grouped_corr, x='相关性总和', y='指标数量', size='相关性总和', sizes=(
         grouped_corr['相关性总和'].min()*5, grouped_corr['相关性总和'].max()*5), hue='指标分类', palette='viridis')
     plt.title('指标分类相关性总和的气泡图')
     plt.ylabel('数量')
-    plt.savefig(os.path.join(dataset, '指标分类相关性总和的气泡图.png'),
+    plt.savefig(os.path.join(config.dataset, '指标分类相关性总和的气泡图.png'),
                 bbox_inches='tight')
     plt.close()
-    content.append(Graphs.draw_img(os.path.join(dataset, '指标分类相关性总和的气泡图.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '指标分类相关性总和的气泡图.png')))
     content.append(Graphs.draw_text(
         '气泡图中,横轴为指标分类,纵轴为指标分类下的特征数量,气泡的面积越大表示该分类中特征的相关系数和越大。'))
-    logger.info(f'绘制相关性总和的气泡图结束')
+    config.logger.info(f'绘制相关性总和的气泡图结束')
     content.append(Graphs.draw_little_title('模型选择:'))
     content.append(Graphs.draw_text(
         f'预测使用了{num_models}个模型进行训练拟合,通过评估指标MAE从小到大排列,前5个模型的简介如下:'))
     # 读取模型简介
-    with open(os.path.join(dataset, 'model_introduction.txt'), 'r', encoding='utf-8') as f:
+    with open(os.path.join(config.dataset, 'model_introduction.txt'), 'r', encoding='utf-8') as f:
         for line in f:
             line_split = line.strip().split('--')
             if line_split[0] in fivemodels_list:
@@ -2720,7 +2719,7 @@ def pp_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, input
     content.append(Graphs.draw_little_title('模型评估:'))
     df = pd.read_csv(os.path.join(
-        dataset, 'model_evaluation.csv'), encoding='utf-8')
+        config.dataset, 'model_evaluation.csv'), encoding='utf-8')
     # 判断 df 的数值列转为float
     for col in eval_df.columns:
         if col not in ['模型(Model)']:
@@ -2742,12 +2741,12 @@ def pp_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, input
     content.append(Graphs.draw_text(
         '3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
     content.append(Graphs.draw_text('模型拟合:'))
-    content.append(Graphs.draw_img(os.path.join(dataset, '预测值与真实值对比图.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '预测值与真实值对比图.png')))

     # 附1,特征列表
     content.append(Graphs.draw_little_title('附1、特征列表:'))
     df_fuyi = pd.read_csv(os.path.join(
-        dataset, '特征频度统计.csv'), encoding='utf-8')
+        config.dataset, '特征频度统计.csv'), encoding='utf-8')
     for col in df_fuyi.columns:
         fuyi = df_fuyi[col]
         fuyi = fuyi.dropna()
@@ -2756,18 +2755,18 @@ def pp_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, input
             content.append(Graphs.draw_text(f'{i+1}、{fuyi[i]}'))

     # 生成pdf文件
-    doc = SimpleDocTemplate(os.path.join(dataset, reportname), pagesize=letter)
-    # doc = SimpleDocTemplate(os.path.join(dataset,'reportname.pdf'), pagesize=letter)
+    doc = SimpleDocTemplate(os.path.join(config.dataset, reportname), pagesize=letter)
+    # doc = SimpleDocTemplate(os.path.join(config.dataset,'reportname.pdf'), pagesize=letter)
     doc.build(content)
     # pdf 上传到数字化信息平台
     try:
-        if is_update_report:
-            with open(os.path.join(dataset, reportname), 'rb') as f:
+        if config.is_update_report:
+            with open(os.path.join(config.dataset, reportname), 'rb') as f:
                 base64_data = base64.b64encode(f.read()).decode('utf-8')
-                upload_data["data"]["fileBase64"] = base64_data
-                upload_data["data"]["fileName"] = reportname
+                global_config['upload_data']["data"]["fileBase64"] = base64_data
+                global_config['upload_data']["data"]["fileName"] = reportname
                 token = get_head_auth_report()
-                upload_report_data(token, upload_data)
+                upload_report_data(token, config.upload_data)
     except TimeoutError as e:
         print(f"请求超时: {e}")
@@ -2785,19 +2784,19 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
     # 添加图片
     # 找出后缀是历史价格-预测值.png的图片
     # import glob
-    # imgs = glob.glob(os.path.join(dataset,'*历史价格-预测值.png'))
+    # imgs = glob.glob(os.path.join(config.dataset,'*历史价格-预测值.png'))
     # for img in imgs:
     #     content.append(Graphs.draw_img(img))
-    content.append(Graphs.draw_img(os.path.join(dataset, '历史价格-预测值.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '历史价格-预测值.png')))
     # 取df中y列为空的行
     import pandas as pd
-    df = pd.read_csv(os.path.join(dataset, 'predict.csv'), encoding='gbk')
+    df = pd.read_csv(os.path.join(config.dataset, 'predict.csv'), encoding='gbk')
     df_true = pd.read_csv(os.path.join(
-        dataset, '指标数据添加时间特征.csv'), encoding='utf-8')  # 获取预测日期对应的真实值
+        config.dataset, '指标数据添加时间特征.csv'), encoding='utf-8')  # 获取预测日期对应的真实值
     df_true = df_true[['ds', 'y']]
     eval_df = pd.read_csv(os.path.join(
-        dataset, 'model_evaluation.csv'), encoding='utf-8')
+        config.dataset, 'model_evaluation.csv'), encoding='utf-8')
     # 按评估指标排序,取前五
     fivemodels_list = eval_df['模型(Model)'].values  # 列表形式,后面当作列名索引使用
     # 取 fivemodels_list 和 ds 列
@@ -2825,7 +2824,7 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
     content.append(Graphs.draw_table(col_width, *data))
     content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:'))
     df = pd.read_csv(os.path.join(
-        dataset, 'testandpredict_groupby.csv'), encoding='utf-8')
+        config.dataset, 'testandpredict_groupby.csv'), encoding='utf-8')
     df4 = df.copy()  # 计算偏差率使用
     # 计算模型偏差率
     # 计算各列对于y列的差值百分比
@@ -2860,7 +2859,7 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
         f'本次预测使用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型:'))
     content.append(Graphs.draw_text(f'使用40天的数据预测未来{inputsize}天的数据。'))
     content.append(Graphs.draw_little_title('指标情况:'))
-    with open(os.path.join(dataset, '特征频度统计.txt'), encoding='utf-8') as f:
+    with open(os.path.join(config.dataset, '特征频度统计.txt'), encoding='utf-8') as f:
         for line in f.readlines():
             content.append(Graphs.draw_text(line))
@@ -2869,7 +2868,7 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
     # 读取数据
     from scipy.stats import spearmanr
     data = pd.read_csv(os.path.join(
-        dataset, '指标数据添加时间特征.csv'), encoding='utf-8')
+        config.dataset, '指标数据添加时间特征.csv'), encoding='utf-8')
     # 重命名预测列
     data.rename(columns={y: 'y'}, inplace=True)  # 修改
     data['ds'] = pd.to_datetime(data['ds'])  # 修改
@@ -2889,7 +2888,7 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
     # 删除空列
     correlation_df.drop('Correlation', axis=1, inplace=True)
     correlation_df.dropna(inplace=True)
-    correlation_df.to_csv(os.path.join(dataset, '指标相关性分析.csv'), index=False)
+    correlation_df.to_csv(os.path.join(config.dataset, '指标相关性分析.csv'), index=False)

     data = correlation_df['Pearson_Correlation'].values.tolist()
     # 生成 -1 到 1 的 20 个区间
@@ -2907,7 +2906,7 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
     plt.title('皮尔逊相关系数分布图')
     plt.xlabel('区间')
     plt.ylabel('统计数')
-    plt.savefig(os.path.join(dataset, '皮尔逊相关性系数.png'))
+    plt.savefig(os.path.join(config.dataset, '皮尔逊相关性系数.png'))
     plt.close()

     # 设置画布大小
@@ -2924,11 +2923,11 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
     plt.title('斯皮尔曼相关系数分布图')
     plt.xlabel('区间')
     plt.ylabel('统计数')
-    plt.savefig(os.path.join(dataset, '斯皮尔曼相关性系数.png'))
+    plt.savefig(os.path.join(config.dataset, '斯皮尔曼相关性系数.png'))
     plt.close()
     content.append(Graphs.draw_text(f'指标相关性分析--皮尔逊相关系数:'))
     # 皮尔逊正相关 不相关 负相关 的表格
-    content.append(Graphs.draw_img(os.path.join(dataset, '皮尔逊相关性系数.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '皮尔逊相关性系数.png')))
     content.append(Graphs.draw_text('''皮尔逊相关系数说明:'''))
     content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。'''))
     content.append(Graphs.draw_text('''
@@ -2940,7 +2939,7 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
     content.append(Graphs.draw_text(f'''{top10}'''))
     # 获取特征的近一月值
     feature_data_df = pd.read_csv(os.path.join(
-        dataset, '填充后的特征数据.csv'), parse_dates=['ds']).tail(20)
+        config.dataset, '填充后的特征数据.csv'), parse_dates=['ds']).tail(20)
     feature_df = feature_data_df[['ds', 'y']+top10_columns]
     # feature_df['ds'] = pd.to_datetime(df['ds'], format = '%Y-%m-%d' )
     # 遍历X每一列,和yy画散点图 ,
@@ -2983,9 +2982,9 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
         # 文件名特殊字符处理
         col = col.replace('*', '-')
         col = col.replace(':', '-')
-        plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
+        plt.savefig(os.path.join(config.dataset, f'{col}与价格散点图.png'))
         content.append(Graphs.draw_img(
-            os.path.join(dataset, f'{col}与价格散点图.png')))
+            os.path.join(config.dataset, f'{col}与价格散点图.png')))
         plt.close()

     content.append(Graphs.draw_text(
@@ -3038,15 +3037,15 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
         # 文件名特殊字符处理
         col = col.replace('*', '-')
         col = col.replace(':', '-')
-        plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
+        plt.savefig(os.path.join(config.dataset, f'{col}与价格散点图.png'))
         content.append(Graphs.draw_img(
-            os.path.join(dataset, f'{col}与价格散点图.png')))
+            os.path.join(config.dataset, f'{col}与价格散点图.png')))
         plt.close()

     content.append(Graphs.draw_text(
         '''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。'''))
     content.append(Graphs.draw_text(f'指标相关性分析--斯皮尔曼相关系数:'))
     # 皮尔逊正相关 不相关 负相关 的表格
-    content.append(Graphs.draw_img(os.path.join(dataset, '斯皮尔曼相关性系数.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '斯皮尔曼相关性系数.png')))
     content.append(Graphs.draw_text(
         '斯皮尔曼相关系数(Spearmans rank correlation coefficient)是一种用于衡量两个变量之间的单调关系(不一定是线性关系)的统计指标。'))
     content.append(Graphs.draw_text('它的计算基于变量的秩次(即变量值的排序位置)而非变量的原始值。'))
@@ -3069,7 +3068,7 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
         f'预测使用了{num_models}个模型进行训练拟合,通过评估指标MAE从小到大排列,前5个模型的简介如下:'))

     # 读取模型简介
-    with open(os.path.join(dataset, 'model_introduction.txt'), 'r', encoding='utf-8') as f:
+    with open(os.path.join(config.dataset, 'model_introduction.txt'), 'r', encoding='utf-8') as f:
         for line in f:
             line_split = line.strip().split('--')
             if line_split[0] in fivemodels_list:
@@ -3079,7 +3078,7 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
     content.append(Graphs.draw_little_title('模型评估:'))
     df = pd.read_csv(os.path.join(
-        dataset, 'model_evaluation.csv'), encoding='utf-8')
+        config.dataset, 'model_evaluation.csv'), encoding='utf-8')
     # 判断 df 的数值列转为float
     for col in eval_df.columns:
         if col not in ['模型(Model)']:
@@ -3105,12 +3104,12 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
         '3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
     content.append(Graphs.draw_text('模型拟合:'))
     # 添加图片
-    content.append(Graphs.draw_img(os.path.join(dataset, '预测值与真实值对比图.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '预测值与真实值对比图.png')))

     # 附1,特征列表
     content.append(Graphs.draw_little_title('附1、特征列表:'))
     df_fuyi = pd.read_csv(os.path.join(
-        dataset, '特征频度统计.csv'), encoding='utf-8')
+        config.dataset, '特征频度统计.csv'), encoding='utf-8')
     for col in df_fuyi.columns:
         fuyi = df_fuyi[col]
         fuyi = fuyi.dropna()
@@ -3119,14 +3118,14 @@ def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, in
             content.append(Graphs.draw_text(f'{i+1}、{fuyi[i]}'))

     # 生成pdf文件
-    doc = SimpleDocTemplate(os.path.join(dataset, reportname), pagesize=letter)
-    # doc = SimpleDocTemplate(os.path.join(dataset,'reportname.pdf'), pagesize=letter)
+    doc = SimpleDocTemplate(os.path.join(config.dataset, reportname), pagesize=letter)
+    # doc = SimpleDocTemplate(os.path.join(config.dataset,'reportname.pdf'), pagesize=letter)
     doc.build(content)
     # pdf 上传到数字化信息平台
     # 读取pdf并转为base64
     try:
         if is_update_report:
-            with open(os.path.join(dataset, reportname), 'rb') as f:
+            with open(os.path.join(config.dataset, reportname), 'rb') as f:
                 base64_data = base64.b64encode(f.read()).decode('utf-8')
                 upload_data["data"]["fileBase64"] = base64_data
                 upload_data["data"]["fileName"] = reportname
@@ -3144,13 +3143,13 @@ def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202
     content.append(Graphs.draw_title(f'{y}{end_time}预测报告'))
     # 预测结果
     content.append(Graphs.draw_little_title('一、预测结果:'))
-    content.append(Graphs.draw_img(os.path.join(dataset, '历史价格-预测值.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '历史价格-预测值.png')))
     # 取df中y列为空的行
     from lib.dataread import loadcsv
-    df = loadcsv(os.path.join(dataset, 'predict.csv'))
-    df_true = loadcsv(os.path.join(dataset, '指标数据添加时间特征.csv'))  # 获取预测日期对应的真实值
+    df = loadcsv(os.path.join(config.dataset, 'predict.csv'))
+    df_true = loadcsv(os.path.join(config.dataset, '指标数据添加时间特征.csv'))  # 获取预测日期对应的真实值
     df_true = df_true[['ds', 'y']]
-    eval_df = loadcsv(os.path.join(dataset, 'model_evaluation.csv'))
+    eval_df = loadcsv(os.path.join(config.dataset, 'model_evaluation.csv'))
     # 按评估指标排序,取前五
     fivemodels_list = eval_df['模型(Model)'].values[:5]  # 列表形式,后面当作列名索引使用
     # 取 fivemodels_list 和 ds 列
@@ -3190,7 +3189,7 @@ def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202
     col_width = 500/len(df.columns)
     content.append(Graphs.draw_table(col_width, *data))
     content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:'))
-    df = loadcsv(os.path.join(dataset, 'testandpredict_groupby.csv'))
+    df = loadcsv(os.path.join(config.dataset, 'testandpredict_groupby.csv'))
     df4 = df.copy()  # 计算偏差率使用
     # 计算模型偏差率
     # 计算各列对于y列的差值百分比
@@ -3226,7 +3225,7 @@ def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202
     content.append(Graphs.draw_little_title('指标情况:'))
     content.append(Graphs.draw_text(' 指标频度包括'))
     # 添加频度统计表格
-    pindu_df = loadcsv(os.path.join(dataset, '特征频度统计.csv'))
+    pindu_df = loadcsv(os.path.join(config.dataset, '特征频度统计.csv'))
     pindu_df.fillna('-', inplace=True)
     pindu_df = pindu_df.T
     pindu_df = pindu_df.reset_index()
@@ -3251,8 +3250,8 @@ def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202
     # 特征工程
     # 预测列分析
     content.append(Graphs.draw_text(' 电碳价格自相关ACF和偏自相关PACF分析:'))
-    content.append(Graphs.draw_img(os.path.join(dataset, '指标数据自相关图.png')))
-    content.append(Graphs.draw_img(os.path.join(dataset, '指标数据偏自相关图.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '指标数据自相关图.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '指标数据偏自相关图.png')))
     content.append(Graphs.draw_text(' 解读:'))
     content.append(Graphs.draw_text(
         ' 自相关函数的取值范围为 [-1, 1]。正值表示信号在不同时间点之间具有正相关性,负值表示信号具有负相关性,而 0 表示信号在不同时间点之间不相关。 '))
@@ -3263,7 +3262,7 @@ def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202
     content.append(Graphs.draw_text(' 数据特征可视化分析:'))
     # 找出所有后缀为散点图.png的文件
     import glob
-    scatter_files = glob.glob(os.path.join(dataset, '*散点图.png'))
+    scatter_files = glob.glob(os.path.join(config.dataset, '*散点图.png'))
     for file in scatter_files:
         content.append(Graphs.draw_img(file))
     content.append(Graphs.draw_text(' 解读:'))
@@ -3273,7 +3272,7 @@ def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202
     # 计算特征相关性
     # 读取数据
     from scipy.stats import spearmanr
-    data = loadcsv(os.path.join(dataset, '指标数据添加时间特征.csv'))
+    data = loadcsv(os.path.join(config.dataset, '指标数据添加时间特征.csv'))
     # 重命名预测列
     data.rename(columns={y: 'y'}, inplace=True)  # 修改
     from lib.tools import dateConvert
@@ -3294,7 +3293,7 @@ def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202
     # 删除空列
     correlation_df.drop('Correlation', axis=1, inplace=True)
     correlation_df.dropna(inplace=True)
-    correlation_df.to_csv(os.path.join(dataset, '指标相关性分析.csv'), index=False)
+    correlation_df.to_csv(os.path.join(config.dataset, '指标相关性分析.csv'), index=False)
     data = correlation_df['Pearson_Correlation'].values.tolist()
     # 生成 -1 到 1 的 20 个区间
     bins = np.linspace(-1, 1, 21)
@@ -3309,7 +3308,7 @@ def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202
     plt.title('皮尔逊相关系数分布图')
     plt.xlabel('区间')
     plt.ylabel('统计数')
-    plt.savefig(os.path.join(dataset, '皮尔逊相关性系数.png'))
+    plt.savefig(os.path.join(config.dataset, '皮尔逊相关性系数.png'))
     plt.close()
     # 设置画布大小
     plt.figure(figsize=(10, 6))
@@ -3323,11 +3322,11 @@ def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202
     plt.title('斯皮尔曼相关系数分布图')
    plt.xlabel('区间')
     plt.ylabel('统计数')
-    plt.savefig(os.path.join(dataset, '斯皮尔曼相关性系数.png'))
+    plt.savefig(os.path.join(config.dataset, '斯皮尔曼相关性系数.png'))
     plt.close()
     content.append(Graphs.draw_text(f'指标相关性分析--皮尔逊相关系数:'))
     # 皮尔逊正相关 不相关 负相关 的表格
-    content.append(Graphs.draw_img(os.path.join(dataset, '皮尔逊相关性系数.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '皮尔逊相关性系数.png')))
     content.append(Graphs.draw_text('''皮尔逊相关系数说明:'''))
     content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。'''))
     content.append(Graphs.draw_text('''
@@ -3346,7 +3345,7 @@ def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202
         '''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。'''))
     content.append(Graphs.draw_text(f'指标相关性分析--斯皮尔曼相关系数:'))
     # 皮尔逊正相关 不相关 负相关 的表格
-    content.append(Graphs.draw_img(os.path.join(dataset, '斯皮尔曼相关性系数.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '斯皮尔曼相关性系数.png')))
     content.append(Graphs.draw_text(
         '斯皮尔曼相关系数(Spearmans rank correlation coefficient)是一种用于衡量两个变量之间的单调关系(不一定是线性关系)的统计指标。'))
     content.append(Graphs.draw_text('它的计算基于变量的秩次(即变量值的排序位置)而非变量的原始值。'))
@@ -3368,7 +3367,7 @@ def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202
     content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练拟合,模型的简介如下:'))

     # 读取模型简介
-    with open(os.path.join(dataset, 'model_introduction.txt'), 'r', encoding='utf-8') as f:
+    with open(os.path.join(config.dataset, 'model_introduction.txt'), 'r', encoding='utf-8') as f:
         for line in f:
             line_split = line.strip().split('--')
             # if line_split[0] in fivemodels_list:
@@ -3377,7 +3376,7 @@ def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202
     content.append(Graphs.draw_little_title('模型评估:'))
     content.append(Graphs.draw_text(f'通过评估指标MAE从小到大排列,前5个模型的评估详情如下:'))
-    df = loadcsv(os.path.join(dataset, 'model_evaluation.csv'))
+    df = loadcsv(os.path.join(config.dataset, 'model_evaluation.csv'))
     # 判断 df 的数值列转为float
     for col in eval_df.columns:
         if col not in ['模型(Model)']:
@@ -3403,7 +3402,7 @@ def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202
         '3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,先计算预测值与真实值之差的平方,然后对这些平方差求平均值。取值越小,误差越小,预测效果越好。'))
     content.append(Graphs.draw_text('模型拟合:'))
     # 添加图片
-    content.append(Graphs.draw_img(os.path.join(dataset, '预测值与真实值对比图.png')))
+    content.append(Graphs.draw_img(os.path.join(config.dataset, '预测值与真实值对比图.png')))
     # 生成pdf文件
-    doc = SimpleDocTemplate(os.path.join(dataset, reportname), pagesize=letter)
+    doc = SimpleDocTemplate(os.path.join(config.dataset, reportname), pagesize=letter)
     doc.build(content)
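
Editor's note: the changes above consistently replace module-level globals (dataset, logger, rote, horizon, y, is_update_report, upload_data) with attributes read from a shared config module, and main_juxiting.py now passes end_time and is_fivemodels into model_losss_juxiting via global_config. The config module itself is not part of this patch; the following is only a minimal sketch of how such a module could expose the entries of global_config as attributes, so that config.dataset, config.horizon, etc. stay in sync after global_config.update(...). All names and values are placeholders, not the project's actual config.py.

    # config module -- illustrative sketch only; not included in this patch.
    import logging

    global_config = {
        'dataset': 'dataset',          # output directory used by os.path.join(config.dataset, ...)
        'y': 'y',                      # placeholder name of the target price column (config.y)
        'horizon': 5,                  # placeholder forecast horizon
        'rote': 0.04,                  # placeholder residual threshold used in add_rote_column
        'is_fivemodels': False,
        'is_update_report': False,
        'end_time': '2025-03-11',      # placeholder run date
        'upload_data': {'data': {}},   # payload filled before upload_report_data(...)
    }

    logger = logging.getLogger('juxiting')


    def __getattr__(name):
        # PEP 562 module-level __getattr__: config.dataset, config.horizon, etc.
        # are looked up in global_config, so attribute access reflects any
        # later global_config.update(...) done by main_juxiting.py.
        try:
            return global_config[name]
        except KeyError:
            raise AttributeError(name)

With a module like this in place, the updated call site in predict_main() reads exactly as in the diff above:

    model_results3 = model_losss_juxiting(
        sqlitedb, end_time=global_config['end_time'], is_fivemodels=global_config['is_fivemodels'])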