聚烯烃预测图使用所有模型预测结果80%的置信度

This commit is contained in:
workpc 2024-11-06 14:42:54 +08:00
parent c7d0444c4a
commit ba237cb657
5 changed files with 2138 additions and 1972 deletions

View File

@ -210,8 +210,8 @@ upload_data = {
### 开关
is_train = True # 是否训练
is_debug = False # 是否调试
is_eta = True # 是否使用eta接口
is_debug = True # 是否调试
is_eta = False # 是否使用eta接口
is_timefurture = True # 是否使用时间特征
is_fivemodels = False # 是否使用之前保存的最佳的5个模型
is_edbcode = False # 特征使用edbcoding列表中的

File diff suppressed because it is too large Load Diff

View File

@ -20,8 +20,8 @@ plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
from datetime import timedelta
from config_jingbo import *
# from config_juxiting import *
# from config_jingbo import *
from config_juxiting import *
from sklearn import metrics
from reportlab.pdfbase import pdfmetrics # 注册字体
from reportlab.pdfbase.ttfonts import TTFont # 字体类
@ -225,17 +225,17 @@ def featureAnalysis(df,dataset,y):
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(10, 10))
# 遍历X每一列和yy画散点图
for i, col in enumerate(X.columns):
plt.subplot(2, 2, i%4+1)
plt.scatter(X[col], yy)
plt.xlabel(col)
plt.ylabel(y)
plt.title(col)
if i % 4 == 3 or i == len(X.columns)-1:
plt.tight_layout()
plt.savefig(os.path.join(dataset,f'{i}指标数据特征与价格散点图.png'))
plt.close()
# # 遍历X每一列和yy画散点图
# for i, col in enumerate(X.columns):
# plt.subplot(2, 2, i%4+1)
# plt.scatter(X[col], yy)
# plt.xlabel(col)
# plt.ylabel(y)
# plt.title(col)
# if i % 4 == 3 or i == len(X.columns)-1:
# plt.tight_layout()
# plt.savefig(os.path.join(dataset,f'{i}指标数据特征与价格散点图.png'))
# plt.close()

58
main.py
View File

@ -1,10 +1,10 @@
# 读取配置
from config_jingbo import *
# from config_jingbo import *
# from config_tansuanli import *
# from config_juxiting import *
from config_juxiting import *
from lib.dataread import *
from lib.tools import *
from models.nerulforcastmodels import ex_Model,model_losss,brent_export_pdf,tansuanli_export_pdf,pp_export_pdf
from models.nerulforcastmodels import ex_Model,model_losss,brent_export_pdf,tansuanli_export_pdf,pp_export_pdf,model_losss_juxiting
import glob
import torch
@ -118,36 +118,36 @@ def predict_main():
row,col = df.shape
now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
ex_Model(df,
horizon=horizon,
input_size=input_size,
train_steps=train_steps,
val_check_steps=val_check_steps,
early_stop_patience_steps=early_stop_patience_steps,
is_debug=is_debug,
dataset=dataset,
is_train=is_train,
is_fivemodels=is_fivemodels,
val_size=val_size,
test_size=test_size,
settings=settings,
now=now,
etadata = etadata,
modelsindex = modelsindex,
data = data,
is_eta=is_eta,
)
# ex_Model(df,
# horizon=horizon,
# input_size=input_size,
# train_steps=train_steps,
# val_check_steps=val_check_steps,
# early_stop_patience_steps=early_stop_patience_steps,
# is_debug=is_debug,
# dataset=dataset,
# is_train=is_train,
# is_fivemodels=is_fivemodels,
# val_size=val_size,
# test_size=test_size,
# settings=settings,
# now=now,
# etadata = etadata,
# modelsindex = modelsindex,
# data = data,
# is_eta=is_eta,
# )
# # 模型评估
model_results3 = model_losss(sqlitedb)
model_results3 = model_losss_juxiting(sqlitedb)
# 模型报告
title = f'{settings}--{now}-预测报告' # 报告标题
brent_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,
reportname=reportname,sqlitedb=sqlitedb),
# pp_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,
# reportname=reportname),
logger.info('模型训练完成')
# title = f'{settings}--{now}-预测报告' # 报告标题
# brent_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,
# reportname=reportname,sqlitedb=sqlitedb),
# # pp_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,
# # reportname=reportname),
# logger.info('模型训练完成')
# tansuanli_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,end_time=end_time,reportname=reportname)

View File

@ -396,6 +396,301 @@ def model_losss(sqlitedb):
# 计算每个预测值与真实值之间的偏差率
for model in allmodelnames:
df_combined3[f'{model}_abs_error_rate'] = abs(df_combined3['y'] - df_combined3[model]) / df_combined3['y']
# 获取每行对应的最小偏差率值
min_abs_error_rate_values = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1)
# 获取每行对应的最小偏差率值对应的列名
min_abs_error_rate_column_name = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1)
# 将列名索引转换为列名
min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0])
# 获取最小偏差率对应的模型的预测值
min_abs_error_rate_predictions = df_combined3.apply(lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1)
# 将最小偏差率对应的模型的预测值添加到DataFrame中
df_combined3['min_abs_error_rate_prediction'] = min_abs_error_rate_predictions
df_combined3['min_abs_error_rate_column_name'] = min_abs_error_rate_column_name
df_combined3 = pd.concat([df_combined3, df_predict]).reset_index(drop=True)
# 判断 df 的数值列转为float
for col in df_combined3.columns:
try:
if col != 'ds':
df_combined3[col] = df_combined3[col].astype(float)
df_combined3[col] = df_combined3[col].round(2)
except ValueError:
pass
df_combined3.to_csv(os.path.join(dataset,"df_combined3.csv"),index=False)
# 历史价格+预测价格
df_combined3 = df_combined3[-50:] # 取50个数据点画图
# 历史价格
plt.figure(figsize=(20, 10))
plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值')
# 颜色填充
plt.fill_between(df_combined3['ds'], df_combined3['min_within_quantile'], df_combined3['max_within_quantile'], alpha=0.2)
# plt.plot(df_combined3['ds'], df_combined3['min_abs_error_rate_prediction'], label='最小绝对误差', linestyle='--', color='orange')
# 网格
plt.grid(True)
# 显示历史值
for i, j in zip(df_combined3['ds'], df_combined3['y']):
plt.text(i, j, str(j), ha='center', va='bottom')
# 数据库查询最佳模型名称
most_model = [sqlitedb.select_data('most_model',columns=['most_common_model'],order_by='ds desc',limit=1).values[0][0]]
for model in most_model:
plt.plot(df_combined3['ds'], df_combined3[model], label=model,marker='o')
# 当前日期画竖虚线
plt.axvline(x=df_combined3['ds'].iloc[-horizon], color='r', linestyle='--')
plt.legend()
plt.xlabel('日期')
plt.ylabel('价格')
plt.savefig(os.path.join(dataset,'历史价格-预测值.png'), bbox_inches='tight')
plt.close()
# 预测值表格
fig, ax = plt.subplots(figsize=(20, 6))
ax.axis('off') # 关闭坐标轴
# 数值保留2位小数
df_combined3 = df_combined3.round(2)
df_combined3 = df_combined3[-horizon:]
df_combined3['Day'] = [f'Day_{i}' for i in range(1,horizon+1)]
# Day列放到最前面
df_combined3 = df_combined3[['Day'] + list(df_combined3.columns[:-1])]
table = ax.table(cellText=df_combined3.values, colLabels=df_combined3.columns, loc='center')
#加宽表格
table.auto_set_font_size(False)
table.set_fontsize(10)
# 设置表格样式,列数据最小的用绿色标识
plt.savefig(os.path.join(dataset,'预测值表格.png'), bbox_inches='tight')
plt.close()
# plt.show()
# 可视化评估结果
plt.rcParams['font.sans-serif'] = ['SimHei']
fig, ax = plt.subplots(figsize=(20, 10))
ax.axis('off') # 关闭坐标轴
table = ax.table(cellText=model_results3.values, colLabels=model_results3.columns, loc='center')
# 加宽表格
table.auto_set_font_size(False)
table.set_fontsize(10)
# 设置表格样式,列数据最小的用绿色标识
plt.savefig(os.path.join(dataset,'模型评估.png'), bbox_inches='tight')
plt.close()
return model_results3
# 计算预测评估指数
def model_losss_juxiting(sqlitedb):
global dataset
# 预测数据处理 predict
df_combined = loadcsv(os.path.join(dataset,"cross_validation.csv"))
df_combined = dateConvert(df_combined)
# 删除空列
df_combined.dropna(axis=1,inplace=True)
# 删除缺失值,预测过程不能有缺失值
df_combined.dropna(inplace=True)
# 其他列转为数值类型
df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in ['cutoff','ds'] })
# 使用 groupby 和 transform 获取每个 ds 分组中 cutoff 的最大值,并创建一个新的列来存储这个最大值
df_combined['max_cutoff'] = df_combined.groupby('ds')['cutoff'].transform('max')
# 然后筛选出那些 cutoff 等于 max_cutoff 的行,这样就得到了每个分组中 cutoff 最大的行,并保留了其他列
df_combined = df_combined[df_combined['cutoff'] == df_combined['max_cutoff']]
# 删除模型生成的cutoff列
df_combined.drop(columns=['cutoff', 'max_cutoff'], inplace=True)
# 获取模型名称
modelnames = df_combined.columns.to_list()[2:]
if 'y' in modelnames:
modelnames.remove('y')
df_combined3 = df_combined.copy() # 备份df_combined,后面画图需要
# 每行预测值找到10%分位数和90%分位数
def find_min_max_within_quantile(row):
    """Compute the 10%-90% quantile band over one row of model predictions.

    Args:
        row: A pandas Series for a single row. The 'ds' and 'y' entries are
            excluded; every remaining entry is treated as a model prediction
            and must be convertible to float.

    Returns:
        A pandas Series indexed by 'min_within_quantile',
        'max_within_quantile', 'min_model' and 'max_model': the 10% and 90%
        quantiles of the predictions and the labels of the predictions
        closest to each quantile.
    """
    # Work on a numeric copy: dropping in place would mutate the row object
    # handed in by DataFrame.apply, and a row that mixes 'ds' with floats is
    # object-typed.
    predictions = row.drop(labels=['ds', 'y']).astype(float)
    # 10% and 90% quantiles of the predictions
    q10 = predictions.quantile(0.1)
    q90 = predictions.quantile(0.9)
    # Quantiles are interpolated, so they usually do not equal any single
    # prediction; an exact-equality lookup would then be empty and idxmin
    # would raise. Pick the model whose prediction is nearest instead (this
    # returns the same label as before whenever an exact match exists).
    min_model = (predictions - q10).abs().idxmin()
    max_model = (predictions - q90).abs().idxmin()
    return pd.Series([q10, q90, min_model, max_model], index=['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model'])
# 遍历行
df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
df_combined = df_combined.round(4)
print(df_combined3)
# # 计算波动率
# df_combined3['volatility'] = df_combined3['y'].pct_change().round(4)
# # 计算近60日的波动率 10% 90%分位数
# df_combined3['quantile_10'] = df_combined3['volatility'].rolling(60).quantile(0.1)
# df_combined3['quantile_90'] = df_combined3['volatility'].rolling(60).quantile(0.9)
# df_combined3 = df_combined3.round(4)
# # 计算分位数对应的价格
# df_combined3['quantile_10_price'] = df_combined3['y'] * (1 + df_combined3['quantile_10'])
# df_combined3['quantile_90_price'] = df_combined3['y'] * (1 + df_combined3['quantile_90'])
# # 遍历行
# def find_min_max_within_quantile(row):
# # 获取分位数10%和90%的值
# q10 = row['quantile_10_price']
# q90 = row['quantile_90_price']
# # 判断float值是否为空值
# if pd.isna(q10) or pd.isna(q90):
# return pd.Series([None, None, None, None], index=['min_within_quantile','max_within_quantile','min_model','max_model'])
# # 初始化最小和最大值为None
# min_value = None
# max_value = None
# min_value_model = ''
# max_value_model = ''
# # 遍历指定列,找出在分位数范围内的最大最小值
# for model in modelnames:
# value = row[model]
# if value >= q10 and value <= q90:
# if min_value is None or value < min_value:
# min_value = value
# min_value_model = model
# if max_value is None or value > max_value:
# max_value = value
# max_value_model = model
# # 返回最大最小值
# return pd.Series([min_value, max_value,min_value_model,max_value_model], index=['min_within_quantile', 'max_within_quantile','min_model','max_model'])
# # 应用函数到每一行
# df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# 去除有空值的行
df_combined3.dropna(inplace=True)
# 保存到数据库
df_combined3.to_sql('testandpredict_groupby', sqlitedb.connection, if_exists='replace', index=False)
df_combined3.to_csv(os.path.join(dataset,"testandpredict_groupby.csv"),index=False)
# 空的列表存储每个模型的MSE、RMSE、MAE、MAPE、SMAPE
cellText = []
# 遍历模型名称,计算模型评估指标
for model in modelnames:
modelmse = mse(df_combined['y'], df_combined[model])
modelrmse = rmse(df_combined['y'], df_combined[model])
modelmae = mae(df_combined['y'], df_combined[model])
# modelmape = mape(df_combined['y'], df_combined[model])
# modelsmape = smape(df_combined['y'], df_combined[model])
# modelr2 = r2_score(df_combined['y'], df_combined[model])
cellText.append([model,round(modelmse, 3), round(modelrmse, 3), round(modelmae, 3)])
model_results3 = pd.DataFrame(cellText,columns=['模型(Model)','平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)'])
# 按MSE升序排列(误差最小的模型排在最前)
model_results3 = model_results3.sort_values(by='平均平方误差(MSE)', ascending=True)
model_results3.to_csv(os.path.join(dataset,"model_evaluation.csv"),index=False)
modelnames = model_results3['模型(Model)'].tolist()
allmodelnames = modelnames.copy()
# 保存5个最佳模型的名称
if len(modelnames) > 5:
modelnames = modelnames[0:5]
with open(os.path.join(dataset,"best_modelnames.txt"), 'w') as f:
f.write(','.join(modelnames) + '\n')
# 预测值与真实值对比图
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(15, 10))
# 设置有5个子图的画布
for n,model in enumerate(modelnames):
plt.subplot(3, 2, n+1)
plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值')
plt.plot(df_combined3['ds'], df_combined3[model], label=model)
plt.legend()
plt.xlabel('日期')
plt.ylabel('价格')
plt.title(model+'拟合')
plt.subplots_adjust(hspace=0.5)
plt.savefig(os.path.join(dataset,'预测值与真实值对比图.png'), bbox_inches='tight')
plt.close()
# 历史数据+预测数据
# 拼接未来时间预测
df_predict = loadcsv(os.path.join(dataset,'predict.csv'))
df_predict.drop('unique_id',inplace=True,axis=1)
df_predict.dropna(axis=1,inplace=True)
try:
df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y-%m-%d')
except ValueError :
df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y/%m/%d')
# 取第一行数据存储到数据库中
first_row = df_predict.head(1)
first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00')
# 将预测结果保存到数据库
if not sqlitedb.check_table_exists('trueandpredict'):
first_row.to_sql('trueandpredict',sqlitedb.connection,index=False)
else:
for row in first_row.itertuples(index=False):
row_dict = row._asdict()
columns=row_dict.keys()
for col in columns:
sqlitedb.add_column_if_not_exists('trueandpredict',col,'TEXT')
check_query = sqlitedb.select_data('trueandpredict',where_condition = f"ds = '{row.ds}'")
if len(check_query) > 0:
set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()])
sqlitedb.update_data('trueandpredict',set_clause,where_condition = f"ds = '{row.ds}'")
continue
sqlitedb.insert_data('trueandpredict',tuple(row_dict.values()),columns=columns)
# 最多频率的模型名称
min_model_max_frequency_model = df_combined3['min_model'].value_counts().idxmax()
max_model_max_frequency_model = df_combined3['max_model'].value_counts().idxmax()
df_predict['min_model'] = min_model_max_frequency_model
df_predict['max_model'] = max_model_max_frequency_model
df_predict['min_within_quantile'] = df_predict[min_model_max_frequency_model]
df_predict['max_within_quantile'] = df_predict[max_model_max_frequency_model]
df_predict2 = df_predict.copy()
df_predict2['ds'] = df_predict2['ds'].dt.strftime('%Y-%m-%d 00:00:00')
# 将预测结果保存到数据库
# 判断表存在
if not sqlitedb.check_table_exists('testandpredict_groupby'):
df_predict2.to_sql('testandpredict_groupby',sqlitedb.connection,index=False)
else:
for row in df_predict2.itertuples(index=False):
row_dict = row._asdict()
check_query = sqlitedb.select_data('testandpredict_groupby',where_condition = f"ds = '{row.ds}'")
if len(check_query) > 0:
set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()])
sqlitedb.update_data('testandpredict_groupby',set_clause,where_condition = f"ds = '{row.ds}'")
continue
sqlitedb.insert_data('testandpredict_groupby',tuple(row_dict.values()),columns=row_dict.keys())
# 计算每个预测值与真实值之间的偏差率
for model in allmodelnames:
df_combined3[f'{model}_abs_error_rate'] = abs(df_combined3['y'] - df_combined3[model]) / df_combined3['y']