PriceForecast/原油预测值绘图调试 copy.ipynb

496 lines
22 KiB
Plaintext
Raw Normal View History

2024-11-20 14:14:27 +08:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "7fadc60c-d710-4b8c-89cd-1d889ece1eaf",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"从eta获取数据...\n",
"跳过指标 美国:东海岸地区:炼油厂的投入与使用情况:开工率:四周均值\n",
"跳过指标 美国:炼油厂的投入与使用情况:开工率:四周均值\n",
"跳过指标 美国:洛基山地区:炼油厂的投入与使用情况:开工率:四周均值\n",
"跳过指标 美国:墨西哥湾沿岸:炼油厂的投入与使用情况:开工率:四周均值\n",
"跳过指标 美国:西海岸地区:炼油厂的投入与使用情况:开工率:四周均值\n",
"跳过指标 美国:中西部地区:炼油厂的投入与使用情况:开工率:四周均值\n",
"跳过指标 中国航班执行数/7DMA\n",
"跳过指标 美国汽油表需周度1周环差负值\n",
"跳过指标 美国汽油表需周度1周环差\n",
"跳过指标 美国汽油产量周度1周环差\n",
"跳过指标 美国原油周度表需/4WMA\n",
"跳过指标 美国油品表需/4WMA\n",
"跳过指标 美国油品表需4周环差\n",
"跳过指标 道琼斯旅游与休闲/标普500\n",
"跳过指标 DOE-柴油产量1周环差\n",
"跳过指标 DOE-美国汽油产量1周环差\n",
"跳过指标 美国航煤表需一周环差负值\n",
"跳过指标 美国馏分油表需一周环差负值\n",
"跳过指标 美国汽油表需一周环差负值\n",
"跳过指标 欧洲汽油裂差\n",
"跳过指标 欧洲柴油裂差\n",
"跳过指标 中国主营炼厂产能利用率1周环差\n",
"跳过指标 美国零售拥堵指数(周环比)/3WMA\n",
"跳过指标 美国炼厂原油输入量1周环差\n",
"跳过指标 美国炼厂原油输入量4周均值\n",
"跳过指标 美国柴油期货裂差\n",
"跳过指标 美国墨西哥湾柴油裂差\n",
"跳过指标 美国纽约港柴油裂差\n",
"跳过指标 欧洲柴油期货裂差\n",
"跳过指标 西北欧轻柴油裂差\n",
"跳过指标 新加坡柴油裂差\n",
"跳过指标 美国汽油期货裂差\n",
"跳过指标 美国墨西哥湾汽油裂差\n",
"跳过指标 美国纽约港汽油裂差\n",
"跳过指标 欧洲鹿特丹汽油裂差\n",
"跳过指标 新加坡汽油裂差\n"
]
}
],
"source": [
"# 读取配置\n",
"from lib.dataread import *\n",
"from lib.tools import *\n",
"from models.nerulforcastmodels import ex_Model,model_losss,brent_export_pdf,tansuanli_export_pdf,pp_export_pdf,model_losss_juxiting\n",
"\n",
"import glob\n",
"import torch\n",
"torch.set_float32_matmul_precision(\"high\")\n",
"\n",
"sqlitedb = SQLiteHandler(db_name) \n",
"sqlitedb.connect()\n",
"\n",
"signature = BinanceAPI(APPID, SECRET)\n",
"etadata = EtaReader(signature=signature,\n",
" classifylisturl = classifylisturl,\n",
" classifyidlisturl=classifyidlisturl,\n",
" edbcodedataurl=edbcodedataurl,\n",
" edbcodelist=edbcodelist,\n",
" edbdatapushurl=edbdatapushurl,\n",
" edbdeleteurl=edbdeleteurl,\n",
" edbbusinessurl=edbbusinessurl\n",
" )\n",
"# 获取数据\n",
"if is_eta:\n",
" # eta数据\n",
" logger.info('从eta获取数据...')\n",
" signature = BinanceAPI(APPID, SECRET)\n",
" etadata = EtaReader(signature=signature,\n",
" classifylisturl = classifylisturl,\n",
" classifyidlisturl=classifyidlisturl,\n",
" edbcodedataurl=edbcodedataurl,\n",
" edbcodelist=edbcodelist,\n",
" edbdatapushurl=edbdatapushurl,\n",
" edbdeleteurl=edbdeleteurl,\n",
" edbbusinessurl=edbbusinessurl,\n",
" )\n",
" df_zhibiaoshuju,df_zhibiaoliebiao = etadata.get_eta_api_yuanyou_data(data_set=data_set,dataset=dataset) # 原始数据,未处理\n",
"\n",
" # 数据处理\n",
" df = datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,y = y,dataset=dataset,add_kdj=add_kdj,is_timefurture=is_timefurture,end_time=end_time) \n",
"\n",
"else:\n",
" logger.info('读取本地数据:'+os.path.join(dataset,data_set))\n",
" df = getdata(filename=os.path.join(dataset,data_set),y=y,dataset=dataset,add_kdj=add_kdj,is_timefurture=is_timefurture,end_time=end_time) # 原始数据,未处理\n",
"\n",
"# 更改预测列名称\n",
"df.rename(columns={y:'y'},inplace=True)\n",
" \n",
"if is_edbnamelist:\n",
" df = df[edbnamelist] \n",
"df.to_csv(os.path.join(dataset,'指标数据.csv'), index=False)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae059224-976c-4839-b455-f81da7f25179",
"metadata": {},
"outputs": [],
"source": [
"# 保存最新日期的y值到数据库\n",
"# 取第一行数据存储到数据库中\n",
"first_row = df[['ds','y']].tail(1)\n",
"# 将最新真实值保存到数据库\n",
"if not sqlitedb.check_table_exists('trueandpredict'):\n",
" first_row.to_sql('trueandpredict',sqlitedb.connection,index=False)\n",
"else:\n",
" for row in first_row.itertuples(index=False):\n",
" row_dict = row._asdict()\n",
" row_dict['ds'] = row_dict['ds'].strftime('%Y-%m-%d %H:%M:%S')\n",
" check_query = sqlitedb.select_data('trueandpredict',where_condition = f\"ds = '{row.ds}'\")\n",
" if len(check_query) > 0:\n",
" set_clause = \", \".join([f\"{key} = '{value}'\" for key, value in row_dict.items()])\n",
" sqlitedb.update_data('trueandpredict',set_clause,where_condition = f\"ds = '{row.ds}'\")\n",
" continue\n",
" sqlitedb.insert_data('trueandpredict',tuple(row_dict.values()),columns=row_dict.keys())\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abb597fc-c5f3-4d76-8099-5eff358cb634",
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"# 判断当前日期是不是周一\n",
"is_weekday = datetime.datetime.now().weekday() == 1\n",
"if is_weekday:\n",
" logger.info('今天是周一,更新预测模型')\n",
" # 计算最近20天预测残差最低的模型名称\n",
"\n",
" model_results = sqlitedb.select_data('trueandpredict',order_by = \"ds DESC\",limit = \"20\")\n",
" # 删除空值率为40%以上的列\n",
" print(model_results.shape)\n",
" model_results = model_results.dropna(thresh=len(model_results)*0.6,axis=1)\n",
" model_results = model_results.dropna()\n",
" print(model_results.shape)\n",
" modelnames = model_results.columns.to_list()[2:] \n",
" for col in model_results[modelnames].select_dtypes(include=['object']).columns:\n",
" model_results[col] = model_results[col].astype(np.float32)\n",
" # 计算每个预测值与真实值之间的偏差率\n",
" for model in modelnames:\n",
" model_results[f'{model}_abs_error_rate'] = abs(model_results['y'] - model_results[model]) / model_results['y']\n",
"\n",
" # 获取每行对应的最小偏差率值\n",
" min_abs_error_rate_values = model_results.apply(lambda row: row[[f'{model}_abs_error_rate' for model in modelnames]].min(), axis=1)\n",
" # 获取每行对应的最小偏差率值对应的列名\n",
" min_abs_error_rate_column_name = model_results.apply(lambda row: row[[f'{model}_abs_error_rate' for model in modelnames]].idxmin(), axis=1)\n",
" print(min_abs_error_rate_column_name)\n",
" # 将列名索引转换为列名\n",
" min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0])\n",
" # 取出现次数最多的模型名称\n",
" most_common_model = min_abs_error_rate_column_name.value_counts().idxmax()\n",
" logger.info(f\"最近20天预测残差最低的模型名称{most_common_model}\")\n",
"\n",
" # 保存结果到数据库\n",
" \n",
" if not sqlitedb.check_table_exists('most_model'):\n",
" sqlitedb.create_table('most_model',columns=\"ds datetime, most_common_model TEXT\")\n",
" sqlitedb.insert_data('most_model',(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),most_common_model,),columns=('ds','most_common_model',))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ade7026e-8cf2-405f-a2da-9e90f364adab",
"metadata": {},
"outputs": [],
"source": [
"if is_corr:\n",
" df = corr_feature(df=df)\n",
"\n",
"df1 = df.copy() # 备份一下后面特征筛选完之后加入ds y 列用\n",
"logger.info(f\"开始训练模型...\")\n",
"row,col = df.shape\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dfef57d8-36da-423b-bbe7-05a13e15f71b",
"metadata": {},
"outputs": [],
"source": [
"now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')\n",
"ex_Model(df,\n",
" horizon=horizon,\n",
" input_size=input_size,\n",
" train_steps=train_steps,\n",
" val_check_steps=val_check_steps,\n",
" early_stop_patience_steps=early_stop_patience_steps,\n",
" is_debug=is_debug,\n",
" dataset=dataset,\n",
" is_train=is_train,\n",
" is_fivemodels=is_fivemodels,\n",
" val_size=val_size,\n",
" test_size=test_size,\n",
" settings=settings,\n",
" now=now,\n",
" etadata = etadata,\n",
" modelsindex = modelsindex,\n",
" data = data,\n",
" is_eta=is_eta,\n",
" )\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e5b6f30-b7ca-4718-97a3-48b54156e07f",
"metadata": {},
"outputs": [],
"source": [
"logger.info('模型训练完成')\n",
"# # 模型评估\n",
"\n",
"pd.set_option('display.max_columns', 100)\n",
"# 计算预测评估指数\n",
"def model_losss_juxiting(sqlitedb):\n",
" global dataset\n",
" # 数据库查询最佳模型名称\n",
" most_model = [sqlitedb.select_data('most_model',columns=['most_common_model'],order_by='ds desc',limit=1).values[0][0]]\n",
" most_model_name = most_model[0]\n",
"\n",
" # 预测数据处理 predict\n",
" df_combined = loadcsv(os.path.join(dataset,\"cross_validation.csv\")) \n",
" df_combined = dateConvert(df_combined)\n",
" # 删除空列\n",
" df_combined.dropna(axis=1,inplace=True)\n",
" # 删除缺失值,预测过程不能有缺失值\n",
" df_combined.dropna(inplace=True) \n",
" # 其他列转为数值类型\n",
" df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in ['cutoff','ds'] })\n",
" # 使用 groupby 和 transform 结合 lambda 函数来获取每个分组中 cutoff 的最小值,并创建一个新的列来存储这个最大值\n",
" df_combined['max_cutoff'] = df_combined.groupby('ds')['cutoff'].transform('max')\n",
"\n",
" # 然后筛选出那些 cutoff 等于 max_cutoff 的行,这样就得到了每个分组中 cutoff 最大的行,并保留了其他列\n",
" df_combined = df_combined[df_combined['cutoff'] == df_combined['max_cutoff']]\n",
" # 删除模型生成的cutoff列\n",
" df_combined.drop(columns=['cutoff', 'max_cutoff'], inplace=True)\n",
" # 获取模型名称\n",
" modelnames = df_combined.columns.to_list()[1:] \n",
" if 'y' in modelnames:\n",
" modelnames.remove('y')\n",
" df_combined3 = df_combined.copy() # 备份df_combined,后面画图需要\n",
"\n",
"\n",
" # 空的列表存储每个模型的MSE、RMSE、MAE、MAPE、SMAPE\n",
" cellText = []\n",
"\n",
" # 遍历模型名称,计算模型评估指标 \n",
" for model in modelnames:\n",
" modelmse = mse(df_combined['y'], df_combined[model])\n",
" modelrmse = rmse(df_combined['y'], df_combined[model])\n",
" modelmae = mae(df_combined['y'], df_combined[model])\n",
" # modelmape = mape(df_combined['y'], df_combined[model])\n",
" # modelsmape = smape(df_combined['y'], df_combined[model])\n",
" # modelr2 = r2_score(df_combined['y'], df_combined[model])\n",
" cellText.append([model,round(modelmse, 3), round(modelrmse, 3), round(modelmae, 3)])\n",
" \n",
" model_results3 = pd.DataFrame(cellText,columns=['模型(Model)','平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)'])\n",
" # 按MSE降序排列\n",
" model_results3 = model_results3.sort_values(by='平均平方误差(MSE)', ascending=True)\n",
" model_results3.to_csv(os.path.join(dataset,\"model_evaluation.csv\"),index=False)\n",
" modelnames = model_results3['模型(Model)'].tolist()\n",
" allmodelnames = modelnames.copy()\n",
" # 保存5个最佳模型的名称\n",
" if len(modelnames) > 5:\n",
" modelnames = modelnames[0:5]\n",
" with open(os.path.join(dataset,\"best_modelnames.txt\"), 'w') as f:\n",
" f.write(','.join(modelnames) + '\\n')\n",
"\n",
"\n",
" # 去掉方差最大的模型,其余模型预测最大最小值确定通道边界\n",
" best_models = pd.read_csv(os.path.join(dataset,'best_modelnames.txt'),header=None).values.flatten().tolist()\n",
" \n",
"\n",
" # 预测值与真实值对比图\n",
" plt.rcParams['font.sans-serif'] = ['SimHei']\n",
" plt.figure(figsize=(15, 10))\n",
" # 设置有5个子图的画布\n",
" for n,model in enumerate(modelnames[:5]):\n",
" plt.subplot(3, 2, n+1)\n",
" plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值')\n",
" plt.plot(df_combined3['ds'], df_combined3[model], label=model)\n",
" plt.legend()\n",
" plt.xlabel('日期')\n",
" plt.ylabel('价格')\n",
" plt.title(model+'拟合')\n",
" plt.subplots_adjust(hspace=0.5)\n",
" plt.savefig(os.path.join(dataset,'预测值与真实值对比图.png'), bbox_inches='tight')\n",
" plt.close()\n",
" \n",
" # 历史数据+预测数据\n",
" # 拼接未来时间预测\n",
" df_predict = loadcsv(os.path.join(dataset,'predict.csv'))\n",
" df_predict.drop('unique_id',inplace=True,axis=1)\n",
" df_predict.dropna(axis=1,inplace=True)\n",
" df_predict2 = df_predict.copy()\n",
" try:\n",
" df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y-%m-%d')\n",
" except ValueError :\n",
" df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y/%m/%d')\n",
"\n",
" # 取第一行数据存储到数据库中\n",
" first_row = df_predict.head(1)\n",
" first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00')\n",
"\n",
" # # 将预测结果保存到数据库\n",
" # # 判断表存在\n",
" if not sqlitedb.check_table_exists('testandpredict_groupby'):\n",
" df_predict2.to_sql('testandpredict_groupby',sqlitedb.connection,index=False)\n",
" else:\n",
" for row in df_predict2.itertuples(index=False):\n",
" row_dict = row._asdict()\n",
" check_query = sqlitedb.select_data('testandpredict_groupby',where_condition = f\"ds = '{row.ds}'\")\n",
" if len(check_query) > 0:\n",
" set_clause = \", \".join([f\"{key} = '{value}'\" for key, value in row_dict.items()])\n",
" sqlitedb.update_data('testandpredict_groupby',set_clause,where_condition = f\"ds = '{row.ds}'\")\n",
" continue\n",
" sqlitedb.insert_data('testandpredict_groupby',tuple(row_dict.values()),columns=row_dict.keys())\n",
"\n",
" df_combined3 = pd.concat([df_combined3, df_predict]).reset_index(drop=True)\n",
"\n",
" # # 判断 df 的数值列转为float\n",
" for col in df_combined3.columns:\n",
" try:\n",
" if col != 'ds':\n",
" df_combined3[col] = df_combined3[col].astype(float)\n",
" df_combined3[col] = df_combined3[col].round(2)\n",
" except ValueError:\n",
" pass\n",
" df_combined3.to_csv(os.path.join(dataset,\"df_combined3.csv\"),index=False) \n",
" df_combined3.to_sql('testandpredict_groupby', sqlitedb.connection, if_exists='replace', index=False)\n",
" df_combined3.to_csv(os.path.join(dataset,\"testandpredict_groupby.csv\"),index=False)\n",
" \n",
" \n",
" ten_models = allmodelnames\n",
" # 计算每个模型的方差\n",
" variances = df_combined3[ten_models].var()\n",
" # 找到方差最大的模型\n",
" max_variance_model = variances.idxmax()\n",
" # 打印方差最大的模型\n",
" print(\"方差最大的模型是:\", max_variance_model)\n",
" # 去掉方差最大的模型\n",
" df_combined3 = df_combined3.drop(columns=[max_variance_model])\n",
" if max_variance_model in allmodelnames:\n",
" allmodelnames.remove(max_variance_model)\n",
" df_combined3['min'] = df_combined3[allmodelnames].min(axis=1)\n",
" df_combined3['max'] = df_combined3[allmodelnames].max(axis=1)\n",
" print(df_combined3[['min','max']])\n",
" # 历史价格+预测价格\n",
" df_combined3 = df_combined3[-50:] # 取50个数据点画图\n",
" plt.figure(figsize=(20, 10))\n",
" plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值',marker='o')\n",
" plt.plot(df_combined3['ds'], df_combined3[most_model], label=most_model_name)\n",
" plt.fill_between(df_combined3['ds'], df_combined3['min'], df_combined3['max'], alpha=0.2)\n",
" plt.grid(True)\n",
" # 当前日期画竖虚线\n",
" plt.axvline(x=df_combined3['ds'].iloc[-horizon], color='r', linestyle='--')\n",
" plt.legend()\n",
" plt.xlabel('日期')\n",
" plt.ylabel('价格')\n",
"\n",
" # # 显示历史值\n",
" for i, j in zip(df_combined3['ds'][:-5], df_combined3['y'][:-5]):\n",
" plt.text(i, j, str(j), ha='center', va='bottom')\n",
" plt.savefig(os.path.join(dataset,'历史价格-预测值.png'), bbox_inches='tight')\n",
" plt.show()\n",
" plt.close()\n",
" \n",
" # 预测值表格\n",
" fig, ax = plt.subplots(figsize=(20, 6))\n",
" ax.axis('off') # 关闭坐标轴\n",
" # 数值保留2位小数\n",
" df_combined3 = df_combined3.round(2)\n",
" df_combined3 = df_combined3[-horizon:]\n",
" df_combined3['Day'] = [f'Day_{i}' for i in range(1,horizon+1)]\n",
" # Day列放到最前面\n",
" df_combined3 = df_combined3[['Day'] + list(df_combined3.columns[:-1])]\n",
" table = ax.table(cellText=df_combined3.values, colLabels=df_combined3.columns, loc='center')\n",
" #加宽表格\n",
" table.auto_set_font_size(False)\n",
" table.set_fontsize(10)\n",
"\n",
" # 设置表格样式,列数据最小的用绿色标识\n",
" plt.savefig(os.path.join(dataset,'预测值表格.png'), bbox_inches='tight')\n",
" plt.close()\n",
" # plt.show()\n",
" \n",
" # 可视化评估结果\n",
" plt.rcParams['font.sans-serif'] = ['SimHei']\n",
" fig, ax = plt.subplots(figsize=(20, 10))\n",
" ax.axis('off') # 关闭坐标轴\n",
" table = ax.table(cellText=model_results3.values, colLabels=model_results3.columns, loc='center')\n",
" # 加宽表格\n",
" table.auto_set_font_size(False)\n",
" table.set_fontsize(10)\n",
"\n",
" # 设置表格样式,列数据最小的用绿色标识\n",
" plt.savefig(os.path.join(dataset,'模型评估.png'), bbox_inches='tight')\n",
" plt.close()\n",
" return model_results3\n",
"\n",
"\n",
"\n",
"\n",
"logger.info('训练数据绘图ing')\n",
"model_results3 = model_losss_juxiting(sqlitedb)\n",
"\n",
"logger.info('训练数据绘图end')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85b557de-8235-4e27-b5b8-58b36dfe6724",
"metadata": {},
"outputs": [],
"source": [
"# 模型报告\n",
"\n",
"logger.info('制作报告ing')\n",
"title = f'{settings}--{now}-预测报告' # 报告标题\n",
"\n",
"pp_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,\n",
" reportname=reportname,sqlitedb=sqlitedb),\n",
"\n",
"logger.info('制作报告end')\n",
"logger.info('模型训练完成')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4129e71-ee2c-4af1-81ed-fadf14efa206",
"metadata": {},
"outputs": [],
"source": [
"# 发送邮件\n",
"m = SendMail(\n",
" username=username,\n",
" passwd=passwd,\n",
" recv=recv,\n",
" title=title,\n",
" content=content,\n",
" file=max(glob.glob(os.path.join(dataset,'*.pdf')), key=os.path.getctime),\n",
" ssl=ssl,\n",
")\n",
"# m.send_mail() \n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}