PriceForecast/原油预测值绘图调试 copy.ipynb
2024-11-20 14:14:27 +08:00


{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "7fadc60c-d710-4b8c-89cd-1d889ece1eaf",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"从eta获取数据...\n",
"跳过指标 美国:东海岸地区:炼油厂的投入与使用情况:开工率:四周均值\n",
"跳过指标 美国:炼油厂的投入与使用情况:开工率:四周均值\n",
"跳过指标 美国:洛基山地区:炼油厂的投入与使用情况:开工率:四周均值\n",
"跳过指标 美国:墨西哥湾沿岸:炼油厂的投入与使用情况:开工率:四周均值\n",
"跳过指标 美国:西海岸地区:炼油厂的投入与使用情况:开工率:四周均值\n",
"跳过指标 美国:中西部地区:炼油厂的投入与使用情况:开工率:四周均值\n",
"跳过指标 中国航班执行数/7DMA\n",
"跳过指标 美国汽油表需周度1周环差负值\n",
"跳过指标 美国汽油表需周度1周环差\n",
"跳过指标 美国汽油产量周度1周环差\n",
"跳过指标 美国原油周度表需/4WMA\n",
"跳过指标 美国油品表需/4WMA\n",
"跳过指标 美国油品表需4周环差\n",
"跳过指标 道琼斯旅游与休闲/标普500\n",
"跳过指标 DOE-柴油产量1周环差\n",
"跳过指标 DOE-美国汽油产量1周环差\n",
"跳过指标 美国航煤表需一周环差负值\n",
"跳过指标 美国馏分油表需一周环差负值\n",
"跳过指标 美国汽油表需一周环差负值\n",
"跳过指标 欧洲汽油裂差\n",
"跳过指标 欧洲柴油裂差\n",
"跳过指标 中国主营炼厂产能利用率1周环差\n",
"跳过指标 美国零售拥堵指数(周环比)/3WMA\n",
"跳过指标 美国炼厂原油输入量1周环差\n",
"跳过指标 美国炼厂原油输入量4周均值\n",
"跳过指标 美国柴油期货裂差\n",
"跳过指标 美国墨西哥湾柴油裂差\n",
"跳过指标 美国纽约港柴油裂差\n",
"跳过指标 欧洲柴油期货裂差\n",
"跳过指标 西北欧轻柴油裂差\n",
"跳过指标 新加坡柴油裂差\n",
"跳过指标 美国汽油期货裂差\n",
"跳过指标 美国墨西哥湾汽油裂差\n",
"跳过指标 美国纽约港汽油裂差\n",
"跳过指标 欧洲鹿特丹汽油裂差\n",
"跳过指标 新加坡汽油裂差\n"
]
}
],
"source": [
"# 读取配置\n",
"from lib.dataread import *\n",
"from lib.tools import *\n",
"from models.nerulforcastmodels import ex_Model,model_losss,brent_export_pdf,tansuanli_export_pdf,pp_export_pdf,model_losss_juxiting\n",
"\n",
"import glob\n",
"import torch\n",
"torch.set_float32_matmul_precision(\"high\")\n",
"\n",
"sqlitedb = SQLiteHandler(db_name) \n",
"sqlitedb.connect()\n",
"\n",
"signature = BinanceAPI(APPID, SECRET)\n",
"etadata = EtaReader(signature=signature,\n",
" classifylisturl = classifylisturl,\n",
" classifyidlisturl=classifyidlisturl,\n",
" edbcodedataurl=edbcodedataurl,\n",
" edbcodelist=edbcodelist,\n",
" edbdatapushurl=edbdatapushurl,\n",
" edbdeleteurl=edbdeleteurl,\n",
" edbbusinessurl=edbbusinessurl\n",
" )\n",
"# 获取数据\n",
"if is_eta:\n",
" # eta数据\n",
" logger.info('从eta获取数据...')\n",
" signature = BinanceAPI(APPID, SECRET)\n",
" etadata = EtaReader(signature=signature,\n",
" classifylisturl = classifylisturl,\n",
" classifyidlisturl=classifyidlisturl,\n",
" edbcodedataurl=edbcodedataurl,\n",
" edbcodelist=edbcodelist,\n",
" edbdatapushurl=edbdatapushurl,\n",
" edbdeleteurl=edbdeleteurl,\n",
" edbbusinessurl=edbbusinessurl,\n",
" )\n",
" df_zhibiaoshuju,df_zhibiaoliebiao = etadata.get_eta_api_yuanyou_data(data_set=data_set,dataset=dataset) # 原始数据,未处理\n",
"\n",
" # 数据处理\n",
" df = datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,y = y,dataset=dataset,add_kdj=add_kdj,is_timefurture=is_timefurture,end_time=end_time) \n",
"\n",
"else:\n",
" logger.info('读取本地数据:'+os.path.join(dataset,data_set))\n",
" df = getdata(filename=os.path.join(dataset,data_set),y=y,dataset=dataset,add_kdj=add_kdj,is_timefurture=is_timefurture,end_time=end_time) # 原始数据,未处理\n",
"\n",
"# 更改预测列名称\n",
"df.rename(columns={y:'y'},inplace=True)\n",
" \n",
"if is_edbnamelist:\n",
" df = df[edbnamelist] \n",
"df.to_csv(os.path.join(dataset,'指标数据.csv'), index=False)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae059224-976c-4839-b455-f81da7f25179",
"metadata": {},
"outputs": [],
"source": [
"# 保存最新日期的y值到数据库\n",
"# 取第一行数据存储到数据库中\n",
"first_row = df[['ds','y']].tail(1)\n",
"# 将最新真实值保存到数据库\n",
"if not sqlitedb.check_table_exists('trueandpredict'):\n",
" first_row.to_sql('trueandpredict',sqlitedb.connection,index=False)\n",
"else:\n",
" for row in first_row.itertuples(index=False):\n",
" row_dict = row._asdict()\n",
" row_dict['ds'] = row_dict['ds'].strftime('%Y-%m-%d %H:%M:%S')\n",
" check_query = sqlitedb.select_data('trueandpredict',where_condition = f\"ds = '{row.ds}'\")\n",
" if len(check_query) > 0:\n",
" set_clause = \", \".join([f\"{key} = '{value}'\" for key, value in row_dict.items()])\n",
" sqlitedb.update_data('trueandpredict',set_clause,where_condition = f\"ds = '{row.ds}'\")\n",
" continue\n",
" sqlitedb.insert_data('trueandpredict',tuple(row_dict.values()),columns=row_dict.keys())\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abb597fc-c5f3-4d76-8099-5eff358cb634",
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"# 判断当前日期是不是周一\n",
"is_weekday = datetime.datetime.now().weekday() == 1\n",
"if is_weekday:\n",
" logger.info('今天是周一,更新预测模型')\n",
" # 计算最近20天预测残差最低的模型名称\n",
"\n",
" model_results = sqlitedb.select_data('trueandpredict',order_by = \"ds DESC\",limit = \"20\")\n",
" # 删除空值率为40%以上的列\n",
" print(model_results.shape)\n",
" model_results = model_results.dropna(thresh=len(model_results)*0.6,axis=1)\n",
" model_results = model_results.dropna()\n",
" print(model_results.shape)\n",
" modelnames = model_results.columns.to_list()[2:] \n",
" for col in model_results[modelnames].select_dtypes(include=['object']).columns:\n",
" model_results[col] = model_results[col].astype(np.float32)\n",
" # 计算每个预测值与真实值之间的偏差率\n",
" for model in modelnames:\n",
" model_results[f'{model}_abs_error_rate'] = abs(model_results['y'] - model_results[model]) / model_results['y']\n",
"\n",
" # 获取每行对应的最小偏差率值\n",
" min_abs_error_rate_values = model_results.apply(lambda row: row[[f'{model}_abs_error_rate' for model in modelnames]].min(), axis=1)\n",
" # 获取每行对应的最小偏差率值对应的列名\n",
" min_abs_error_rate_column_name = model_results.apply(lambda row: row[[f'{model}_abs_error_rate' for model in modelnames]].idxmin(), axis=1)\n",
" print(min_abs_error_rate_column_name)\n",
" # 将列名索引转换为列名\n",
" min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0])\n",
" # 取出现次数最多的模型名称\n",
" most_common_model = min_abs_error_rate_column_name.value_counts().idxmax()\n",
" logger.info(f\"最近20天预测残差最低的模型名称{most_common_model}\")\n",
"\n",
" # 保存结果到数据库\n",
" \n",
" if not sqlitedb.check_table_exists('most_model'):\n",
" sqlitedb.create_table('most_model',columns=\"ds datetime, most_common_model TEXT\")\n",
" sqlitedb.insert_data('most_model',(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),most_common_model,),columns=('ds','most_common_model',))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ade7026e-8cf2-405f-a2da-9e90f364adab",
"metadata": {},
"outputs": [],
"source": [
"if is_corr:\n",
" df = corr_feature(df=df)\n",
"\n",
"df1 = df.copy() # 备份一下后面特征筛选完之后加入ds y 列用\n",
"logger.info(f\"开始训练模型...\")\n",
"row,col = df.shape\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dfef57d8-36da-423b-bbe7-05a13e15f71b",
"metadata": {},
"outputs": [],
"source": [
"now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')\n",
"ex_Model(df,\n",
" horizon=horizon,\n",
" input_size=input_size,\n",
" train_steps=train_steps,\n",
" val_check_steps=val_check_steps,\n",
" early_stop_patience_steps=early_stop_patience_steps,\n",
" is_debug=is_debug,\n",
" dataset=dataset,\n",
" is_train=is_train,\n",
" is_fivemodels=is_fivemodels,\n",
" val_size=val_size,\n",
" test_size=test_size,\n",
" settings=settings,\n",
" now=now,\n",
" etadata = etadata,\n",
" modelsindex = modelsindex,\n",
" data = data,\n",
" is_eta=is_eta,\n",
" )\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e5b6f30-b7ca-4718-97a3-48b54156e07f",
"metadata": {},
"outputs": [],
"source": [
"logger.info('模型训练完成')\n",
"# # 模型评估\n",
"\n",
"pd.set_option('display.max_columns', 100)\n",
"# 计算预测评估指数\n",
"def model_losss_juxiting(sqlitedb):\n",
" global dataset\n",
" # 数据库查询最佳模型名称\n",
" most_model = [sqlitedb.select_data('most_model',columns=['most_common_model'],order_by='ds desc',limit=1).values[0][0]]\n",
" most_model_name = most_model[0]\n",
"\n",
" # 预测数据处理 predict\n",
" df_combined = loadcsv(os.path.join(dataset,\"cross_validation.csv\")) \n",
" df_combined = dateConvert(df_combined)\n",
" # 删除空列\n",
" df_combined.dropna(axis=1,inplace=True)\n",
" # 删除缺失值,预测过程不能有缺失值\n",
" df_combined.dropna(inplace=True) \n",
" # 其他列转为数值类型\n",
" df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in ['cutoff','ds'] })\n",
" # 使用 groupby 和 transform 结合 lambda 函数来获取每个分组中 cutoff 的最小值,并创建一个新的列来存储这个最大值\n",
" df_combined['max_cutoff'] = df_combined.groupby('ds')['cutoff'].transform('max')\n",
"\n",
" # 然后筛选出那些 cutoff 等于 max_cutoff 的行,这样就得到了每个分组中 cutoff 最大的行,并保留了其他列\n",
" df_combined = df_combined[df_combined['cutoff'] == df_combined['max_cutoff']]\n",
" # 删除模型生成的cutoff列\n",
" df_combined.drop(columns=['cutoff', 'max_cutoff'], inplace=True)\n",
" # 获取模型名称\n",
" modelnames = df_combined.columns.to_list()[1:] \n",
" if 'y' in modelnames:\n",
" modelnames.remove('y')\n",
" df_combined3 = df_combined.copy() # 备份df_combined,后面画图需要\n",
"\n",
"\n",
" # 空的列表存储每个模型的MSE、RMSE、MAE、MAPE、SMAPE\n",
" cellText = []\n",
"\n",
" # 遍历模型名称,计算模型评估指标 \n",
" for model in modelnames:\n",
" modelmse = mse(df_combined['y'], df_combined[model])\n",
" modelrmse = rmse(df_combined['y'], df_combined[model])\n",
" modelmae = mae(df_combined['y'], df_combined[model])\n",
" # modelmape = mape(df_combined['y'], df_combined[model])\n",
" # modelsmape = smape(df_combined['y'], df_combined[model])\n",
" # modelr2 = r2_score(df_combined['y'], df_combined[model])\n",
" cellText.append([model,round(modelmse, 3), round(modelrmse, 3), round(modelmae, 3)])\n",
" \n",
" model_results3 = pd.DataFrame(cellText,columns=['模型(Model)','平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)'])\n",
" # 按MSE降序排列\n",
" model_results3 = model_results3.sort_values(by='平均平方误差(MSE)', ascending=True)\n",
" model_results3.to_csv(os.path.join(dataset,\"model_evaluation.csv\"),index=False)\n",
" modelnames = model_results3['模型(Model)'].tolist()\n",
" allmodelnames = modelnames.copy()\n",
" # 保存5个最佳模型的名称\n",
" if len(modelnames) > 5:\n",
" modelnames = modelnames[0:5]\n",
" with open(os.path.join(dataset,\"best_modelnames.txt\"), 'w') as f:\n",
" f.write(','.join(modelnames) + '\\n')\n",
"\n",
"\n",
" # 去掉方差最大的模型,其余模型预测最大最小值确定通道边界\n",
" best_models = pd.read_csv(os.path.join(dataset,'best_modelnames.txt'),header=None).values.flatten().tolist()\n",
" \n",
"\n",
" # 预测值与真实值对比图\n",
" plt.rcParams['font.sans-serif'] = ['SimHei']\n",
" plt.figure(figsize=(15, 10))\n",
" # 设置有5个子图的画布\n",
" for n,model in enumerate(modelnames[:5]):\n",
" plt.subplot(3, 2, n+1)\n",
" plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值')\n",
" plt.plot(df_combined3['ds'], df_combined3[model], label=model)\n",
" plt.legend()\n",
" plt.xlabel('日期')\n",
" plt.ylabel('价格')\n",
" plt.title(model+'拟合')\n",
" plt.subplots_adjust(hspace=0.5)\n",
" plt.savefig(os.path.join(dataset,'预测值与真实值对比图.png'), bbox_inches='tight')\n",
" plt.close()\n",
" \n",
" # 历史数据+预测数据\n",
" # 拼接未来时间预测\n",
" df_predict = loadcsv(os.path.join(dataset,'predict.csv'))\n",
" df_predict.drop('unique_id',inplace=True,axis=1)\n",
" df_predict.dropna(axis=1,inplace=True)\n",
" df_predict2 = df_predict.copy()\n",
" try:\n",
" df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y-%m-%d')\n",
" except ValueError :\n",
" df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y/%m/%d')\n",
"\n",
" # 取第一行数据存储到数据库中\n",
" first_row = df_predict.head(1)\n",
" first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00')\n",
"\n",
" # # 将预测结果保存到数据库\n",
" # # 判断表存在\n",
" if not sqlitedb.check_table_exists('testandpredict_groupby'):\n",
" df_predict2.to_sql('testandpredict_groupby',sqlitedb.connection,index=False)\n",
" else:\n",
" for row in df_predict2.itertuples(index=False):\n",
" row_dict = row._asdict()\n",
" check_query = sqlitedb.select_data('testandpredict_groupby',where_condition = f\"ds = '{row.ds}'\")\n",
" if len(check_query) > 0:\n",
" set_clause = \", \".join([f\"{key} = '{value}'\" for key, value in row_dict.items()])\n",
" sqlitedb.update_data('testandpredict_groupby',set_clause,where_condition = f\"ds = '{row.ds}'\")\n",
" continue\n",
" sqlitedb.insert_data('testandpredict_groupby',tuple(row_dict.values()),columns=row_dict.keys())\n",
"\n",
" df_combined3 = pd.concat([df_combined3, df_predict]).reset_index(drop=True)\n",
"\n",
" # # 判断 df 的数值列转为float\n",
" for col in df_combined3.columns:\n",
" try:\n",
" if col != 'ds':\n",
" df_combined3[col] = df_combined3[col].astype(float)\n",
" df_combined3[col] = df_combined3[col].round(2)\n",
" except ValueError:\n",
" pass\n",
" df_combined3.to_csv(os.path.join(dataset,\"df_combined3.csv\"),index=False) \n",
" df_combined3.to_sql('testandpredict_groupby', sqlitedb.connection, if_exists='replace', index=False)\n",
" df_combined3.to_csv(os.path.join(dataset,\"testandpredict_groupby.csv\"),index=False)\n",
" \n",
" \n",
" ten_models = allmodelnames\n",
" # 计算每个模型的方差\n",
" variances = df_combined3[ten_models].var()\n",
" # 找到方差最大的模型\n",
" max_variance_model = variances.idxmax()\n",
" # 打印方差最大的模型\n",
" print(\"方差最大的模型是:\", max_variance_model)\n",
" # 去掉方差最大的模型\n",
" df_combined3 = df_combined3.drop(columns=[max_variance_model])\n",
" if max_variance_model in allmodelnames:\n",
" allmodelnames.remove(max_variance_model)\n",
" df_combined3['min'] = df_combined3[allmodelnames].min(axis=1)\n",
" df_combined3['max'] = df_combined3[allmodelnames].max(axis=1)\n",
" print(df_combined3[['min','max']])\n",
" # 历史价格+预测价格\n",
" df_combined3 = df_combined3[-50:] # 取50个数据点画图\n",
" plt.figure(figsize=(20, 10))\n",
" plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值',marker='o')\n",
" plt.plot(df_combined3['ds'], df_combined3[most_model], label=most_model_name)\n",
" plt.fill_between(df_combined3['ds'], df_combined3['min'], df_combined3['max'], alpha=0.2)\n",
" plt.grid(True)\n",
" # 当前日期画竖虚线\n",
" plt.axvline(x=df_combined3['ds'].iloc[-horizon], color='r', linestyle='--')\n",
" plt.legend()\n",
" plt.xlabel('日期')\n",
" plt.ylabel('价格')\n",
"\n",
" # # 显示历史值\n",
" for i, j in zip(df_combined3['ds'][:-5], df_combined3['y'][:-5]):\n",
" plt.text(i, j, str(j), ha='center', va='bottom')\n",
" plt.savefig(os.path.join(dataset,'历史价格-预测值.png'), bbox_inches='tight')\n",
" plt.show()\n",
" plt.close()\n",
" \n",
" # 预测值表格\n",
" fig, ax = plt.subplots(figsize=(20, 6))\n",
" ax.axis('off') # 关闭坐标轴\n",
" # 数值保留2位小数\n",
" df_combined3 = df_combined3.round(2)\n",
" df_combined3 = df_combined3[-horizon:]\n",
" df_combined3['Day'] = [f'Day_{i}' for i in range(1,horizon+1)]\n",
" # Day列放到最前面\n",
" df_combined3 = df_combined3[['Day'] + list(df_combined3.columns[:-1])]\n",
" table = ax.table(cellText=df_combined3.values, colLabels=df_combined3.columns, loc='center')\n",
" #加宽表格\n",
" table.auto_set_font_size(False)\n",
" table.set_fontsize(10)\n",
"\n",
" # 设置表格样式,列数据最小的用绿色标识\n",
" plt.savefig(os.path.join(dataset,'预测值表格.png'), bbox_inches='tight')\n",
" plt.close()\n",
" # plt.show()\n",
" \n",
" # 可视化评估结果\n",
" plt.rcParams['font.sans-serif'] = ['SimHei']\n",
" fig, ax = plt.subplots(figsize=(20, 10))\n",
" ax.axis('off') # 关闭坐标轴\n",
" table = ax.table(cellText=model_results3.values, colLabels=model_results3.columns, loc='center')\n",
" # 加宽表格\n",
" table.auto_set_font_size(False)\n",
" table.set_fontsize(10)\n",
"\n",
" # 设置表格样式,列数据最小的用绿色标识\n",
" plt.savefig(os.path.join(dataset,'模型评估.png'), bbox_inches='tight')\n",
" plt.close()\n",
" return model_results3\n",
"\n",
"\n",
"\n",
"\n",
"logger.info('训练数据绘图ing')\n",
"model_results3 = model_losss_juxiting(sqlitedb)\n",
"\n",
"logger.info('训练数据绘图end')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85b557de-8235-4e27-b5b8-58b36dfe6724",
"metadata": {},
"outputs": [],
"source": [
"# 模型报告\n",
"\n",
"logger.info('制作报告ing')\n",
"title = f'{settings}--{now}-预测报告' # 报告标题\n",
"\n",
"pp_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,\n",
" reportname=reportname,sqlitedb=sqlitedb),\n",
"\n",
"logger.info('制作报告end')\n",
"logger.info('模型训练完成')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4129e71-ee2c-4af1-81ed-fadf14efa206",
"metadata": {},
"outputs": [],
"source": [
"# 发送邮件\n",
"m = SendMail(\n",
" username=username,\n",
" passwd=passwd,\n",
" recv=recv,\n",
" title=title,\n",
" content=content,\n",
" file=max(glob.glob(os.path.join(dataset,'*.pdf')), key=os.path.getctime),\n",
" ssl=ssl,\n",
")\n",
"# m.send_mail() \n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}