聚烯烃预测图使用所有模型预测结果80%的置信度
This commit is contained in:
parent
c7d0444c4a
commit
ba237cb657
@ -210,8 +210,8 @@ upload_data = {
|
||||
|
||||
### 开关
|
||||
is_train = True # 是否训练
|
||||
is_debug = False # 是否调试
|
||||
is_eta = True # 是否使用eta接口
|
||||
is_debug = True # 是否调试
|
||||
is_eta = False # 是否使用eta接口
|
||||
is_timefurture = True # 是否使用时间特征
|
||||
is_fivemodels = False # 是否使用之前保存的最佳的5个模型
|
||||
is_edbcode = False # 特征使用edbcoding列表中的
|
||||
|
3727
debugdemo.ipynb
3727
debugdemo.ipynb
File diff suppressed because it is too large
Load Diff
@ -20,8 +20,8 @@ plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
|
||||
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
|
||||
|
||||
from datetime import timedelta
|
||||
from config_jingbo import *
|
||||
# from config_juxiting import *
|
||||
# from config_jingbo import *
|
||||
from config_juxiting import *
|
||||
from sklearn import metrics
|
||||
from reportlab.pdfbase import pdfmetrics # 注册字体
|
||||
from reportlab.pdfbase.ttfonts import TTFont # 字体类
|
||||
@ -225,17 +225,17 @@ def featureAnalysis(df,dataset,y):
|
||||
plt.rcParams['font.sans-serif'] = ['SimHei']
|
||||
plt.rcParams['axes.unicode_minus'] = False
|
||||
plt.figure(figsize=(10, 10))
|
||||
# 遍历X每一列,和yy画散点图 ,
|
||||
for i, col in enumerate(X.columns):
|
||||
plt.subplot(2, 2, i%4+1)
|
||||
plt.scatter(X[col], yy)
|
||||
plt.xlabel(col)
|
||||
plt.ylabel(y)
|
||||
plt.title(col)
|
||||
if i % 4 == 3 or i == len(X.columns)-1:
|
||||
plt.tight_layout()
|
||||
plt.savefig(os.path.join(dataset,f'{i}指标数据特征与价格散点图.png'))
|
||||
plt.close()
|
||||
# # 遍历X每一列,和yy画散点图 ,
|
||||
# for i, col in enumerate(X.columns):
|
||||
# plt.subplot(2, 2, i%4+1)
|
||||
# plt.scatter(X[col], yy)
|
||||
# plt.xlabel(col)
|
||||
# plt.ylabel(y)
|
||||
# plt.title(col)
|
||||
# if i % 4 == 3 or i == len(X.columns)-1:
|
||||
# plt.tight_layout()
|
||||
# plt.savefig(os.path.join(dataset,f'{i}指标数据特征与价格散点图.png'))
|
||||
# plt.close()
|
||||
|
||||
|
||||
|
||||
|
58
main.py
58
main.py
@ -1,10 +1,10 @@
|
||||
# 读取配置
|
||||
from config_jingbo import *
|
||||
# from config_jingbo import *
|
||||
# from config_tansuanli import *
|
||||
# from config_juxiting import *
|
||||
from config_juxiting import *
|
||||
from lib.dataread import *
|
||||
from lib.tools import *
|
||||
from models.nerulforcastmodels import ex_Model,model_losss,brent_export_pdf,tansuanli_export_pdf,pp_export_pdf
|
||||
from models.nerulforcastmodels import ex_Model,model_losss,brent_export_pdf,tansuanli_export_pdf,pp_export_pdf,model_losss_juxiting
|
||||
|
||||
import glob
|
||||
import torch
|
||||
@ -118,36 +118,36 @@ def predict_main():
|
||||
row,col = df.shape
|
||||
|
||||
now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
|
||||
ex_Model(df,
|
||||
horizon=horizon,
|
||||
input_size=input_size,
|
||||
train_steps=train_steps,
|
||||
val_check_steps=val_check_steps,
|
||||
early_stop_patience_steps=early_stop_patience_steps,
|
||||
is_debug=is_debug,
|
||||
dataset=dataset,
|
||||
is_train=is_train,
|
||||
is_fivemodels=is_fivemodels,
|
||||
val_size=val_size,
|
||||
test_size=test_size,
|
||||
settings=settings,
|
||||
now=now,
|
||||
etadata = etadata,
|
||||
modelsindex = modelsindex,
|
||||
data = data,
|
||||
is_eta=is_eta,
|
||||
)
|
||||
# ex_Model(df,
|
||||
# horizon=horizon,
|
||||
# input_size=input_size,
|
||||
# train_steps=train_steps,
|
||||
# val_check_steps=val_check_steps,
|
||||
# early_stop_patience_steps=early_stop_patience_steps,
|
||||
# is_debug=is_debug,
|
||||
# dataset=dataset,
|
||||
# is_train=is_train,
|
||||
# is_fivemodels=is_fivemodels,
|
||||
# val_size=val_size,
|
||||
# test_size=test_size,
|
||||
# settings=settings,
|
||||
# now=now,
|
||||
# etadata = etadata,
|
||||
# modelsindex = modelsindex,
|
||||
# data = data,
|
||||
# is_eta=is_eta,
|
||||
# )
|
||||
|
||||
# # 模型评估
|
||||
model_results3 = model_losss(sqlitedb)
|
||||
model_results3 = model_losss_juxiting(sqlitedb)
|
||||
# 模型报告
|
||||
|
||||
title = f'{settings}--{now}-预测报告' # 报告标题
|
||||
brent_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,
|
||||
reportname=reportname,sqlitedb=sqlitedb),
|
||||
# pp_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,
|
||||
# reportname=reportname),
|
||||
logger.info('模型训练完成')
|
||||
# title = f'{settings}--{now}-预测报告' # 报告标题
|
||||
# brent_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,
|
||||
# reportname=reportname,sqlitedb=sqlitedb),
|
||||
# # pp_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,
|
||||
# # reportname=reportname),
|
||||
# logger.info('模型训练完成')
|
||||
|
||||
# tansuanli_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,end_time=end_time,reportname=reportname)
|
||||
|
||||
|
@ -396,6 +396,301 @@ def model_losss(sqlitedb):
|
||||
|
||||
|
||||
|
||||
# 计算每个预测值与真实值之间的偏差率
|
||||
for model in allmodelnames:
|
||||
df_combined3[f'{model}_abs_error_rate'] = abs(df_combined3['y'] - df_combined3[model]) / df_combined3['y']
|
||||
|
||||
# 获取每行对应的最小偏差率值
|
||||
min_abs_error_rate_values = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1)
|
||||
# 获取每行对应的最小偏差率值对应的列名
|
||||
min_abs_error_rate_column_name = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1)
|
||||
# 将列名索引转换为列名
|
||||
min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0])
|
||||
# 获取最小偏差率对应的模型的预测值
|
||||
min_abs_error_rate_predictions = df_combined3.apply(lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1)
|
||||
# 将最小偏差率对应的模型的预测值添加到DataFrame中
|
||||
df_combined3['min_abs_error_rate_prediction'] = min_abs_error_rate_predictions
|
||||
df_combined3['min_abs_error_rate_column_name'] = min_abs_error_rate_column_name
|
||||
df_combined3 = pd.concat([df_combined3, df_predict]).reset_index(drop=True)
|
||||
# 判断 df 的数值列转为float
|
||||
for col in df_combined3.columns:
|
||||
try:
|
||||
if col != 'ds':
|
||||
df_combined3[col] = df_combined3[col].astype(float)
|
||||
df_combined3[col] = df_combined3[col].round(2)
|
||||
except ValueError:
|
||||
pass
|
||||
df_combined3.to_csv(os.path.join(dataset,"df_combined3.csv"),index=False)
|
||||
|
||||
# 历史价格+预测价格
|
||||
df_combined3 = df_combined3[-50:] # 取50个数据点画图
|
||||
# 历史价格
|
||||
plt.figure(figsize=(20, 10))
|
||||
plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值')
|
||||
# 颜色填充
|
||||
plt.fill_between(df_combined3['ds'], df_combined3['min_within_quantile'], df_combined3['max_within_quantile'], alpha=0.2)
|
||||
# plt.plot(df_combined3['ds'], df_combined3['min_abs_error_rate_prediction'], label='最小绝对误差', linestyle='--', color='orange')
|
||||
# 网格
|
||||
plt.grid(True)
|
||||
# 显示历史值
|
||||
for i, j in zip(df_combined3['ds'], df_combined3['y']):
|
||||
plt.text(i, j, str(j), ha='center', va='bottom')
|
||||
|
||||
# 数据库查询最佳模型名称
|
||||
most_model = [sqlitedb.select_data('most_model',columns=['most_common_model'],order_by='ds desc',limit=1).values[0][0]]
|
||||
|
||||
for model in most_model:
|
||||
plt.plot(df_combined3['ds'], df_combined3[model], label=model,marker='o')
|
||||
# 当前日期画竖虚线
|
||||
plt.axvline(x=df_combined3['ds'].iloc[-horizon], color='r', linestyle='--')
|
||||
plt.legend()
|
||||
plt.xlabel('日期')
|
||||
plt.ylabel('价格')
|
||||
|
||||
plt.savefig(os.path.join(dataset,'历史价格-预测值.png'), bbox_inches='tight')
|
||||
plt.close()
|
||||
|
||||
# 预测值表格
|
||||
fig, ax = plt.subplots(figsize=(20, 6))
|
||||
ax.axis('off') # 关闭坐标轴
|
||||
# 数值保留2位小数
|
||||
df_combined3 = df_combined3.round(2)
|
||||
df_combined3 = df_combined3[-horizon:]
|
||||
df_combined3['Day'] = [f'Day_{i}' for i in range(1,horizon+1)]
|
||||
# Day列放到最前面
|
||||
df_combined3 = df_combined3[['Day'] + list(df_combined3.columns[:-1])]
|
||||
table = ax.table(cellText=df_combined3.values, colLabels=df_combined3.columns, loc='center')
|
||||
#加宽表格
|
||||
table.auto_set_font_size(False)
|
||||
table.set_fontsize(10)
|
||||
|
||||
# 设置表格样式,列数据最小的用绿色标识
|
||||
plt.savefig(os.path.join(dataset,'预测值表格.png'), bbox_inches='tight')
|
||||
plt.close()
|
||||
# plt.show()
|
||||
|
||||
# 可视化评估结果
|
||||
plt.rcParams['font.sans-serif'] = ['SimHei']
|
||||
fig, ax = plt.subplots(figsize=(20, 10))
|
||||
ax.axis('off') # 关闭坐标轴
|
||||
table = ax.table(cellText=model_results3.values, colLabels=model_results3.columns, loc='center')
|
||||
# 加宽表格
|
||||
table.auto_set_font_size(False)
|
||||
table.set_fontsize(10)
|
||||
|
||||
# 设置表格样式,列数据最小的用绿色标识
|
||||
plt.savefig(os.path.join(dataset,'模型评估.png'), bbox_inches='tight')
|
||||
plt.close()
|
||||
return model_results3
|
||||
|
||||
# 计算预测评估指数
|
||||
def model_losss_juxiting(sqlitedb):
|
||||
global dataset
|
||||
# 预测数据处理 predict
|
||||
df_combined = loadcsv(os.path.join(dataset,"cross_validation.csv"))
|
||||
df_combined = dateConvert(df_combined)
|
||||
# 删除空列
|
||||
df_combined.dropna(axis=1,inplace=True)
|
||||
# 删除缺失值,预测过程不能有缺失值
|
||||
df_combined.dropna(inplace=True)
|
||||
# 其他列转为数值类型
|
||||
df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in ['cutoff','ds'] })
|
||||
# Use groupby + transform to get the maximum cutoff within each ds group and store it in a new column (max_cutoff)
|
||||
df_combined['max_cutoff'] = df_combined.groupby('ds')['cutoff'].transform('max')
|
||||
|
||||
# 然后筛选出那些 cutoff 等于 max_cutoff 的行,这样就得到了每个分组中 cutoff 最大的行,并保留了其他列
|
||||
df_combined = df_combined[df_combined['cutoff'] == df_combined['max_cutoff']]
|
||||
# 删除模型生成的cutoff列
|
||||
df_combined.drop(columns=['cutoff', 'max_cutoff'], inplace=True)
|
||||
# 获取模型名称
|
||||
modelnames = df_combined.columns.to_list()[2:]
|
||||
if 'y' in modelnames:
|
||||
modelnames.remove('y')
|
||||
df_combined3 = df_combined.copy() # 备份df_combined,后面画图需要
|
||||
|
||||
# 每行预测值找到10%分位数和90%分位数
|
||||
def find_min_max_within_quantile(row):
    """Return the 10% / 90% quantiles of one row's model predictions.

    Intended to be used with ``DataFrame.apply(..., axis=1)``: every entry of
    ``row`` other than ``ds`` and ``y`` is treated as one model's prediction.

    Args:
        row: A pandas Series holding ``ds``, ``y`` and one value per model.

    Returns:
        A pandas Series indexed by ``min_within_quantile``,
        ``max_within_quantile``, ``min_model`` and ``max_model``: the two
        quantile values and the names of the models whose predictions lie
        closest to them. All four entries are None when the row carries no
        usable prediction.
    """
    labels = ['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model']

    # Work on a copy: the row handed over by apply() must not be mutated.
    preds = row.drop(labels=['ds', 'y'], errors='ignore')
    # Rows of a mixed-dtype frame arrive as object dtype; make them numeric
    # and ignore missing predictions.
    preds = pd.to_numeric(preds).dropna()

    if preds.empty:
        return pd.Series([None, None, None, None], index=labels)

    # 10% and 90% quantiles across all model predictions of this row.
    q10 = preds.quantile(0.1)
    q90 = preds.quantile(0.9)

    # The quantile is linearly interpolated, so it usually equals none of the
    # predictions exactly. Pick the model whose prediction is nearest to it
    # (on an exact match this is the same model an equality test would find).
    min_model = (preds - q10).abs().idxmin()
    max_model = (preds - q90).abs().idxmin()

    return pd.Series([q10, q90, min_model, max_model], index=labels)
|
||||
|
||||
# 遍历行
|
||||
df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
|
||||
df_combined = df_combined.round(4)
|
||||
print(df_combined3)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# # 计算波动率
|
||||
# df_combined3['volatility'] = df_combined3['y'].pct_change().round(4)
|
||||
# # 计算近60日的波动率 10% 90%分位数
|
||||
# df_combined3['quantile_10'] = df_combined3['volatility'].rolling(60).quantile(0.1)
|
||||
# df_combined3['quantile_90'] = df_combined3['volatility'].rolling(60).quantile(0.9)
|
||||
# df_combined3 = df_combined3.round(4)
|
||||
# # 计算分位数对应的价格
|
||||
# df_combined3['quantile_10_price'] = df_combined3['y'] * (1 + df_combined3['quantile_10'])
|
||||
# df_combined3['quantile_90_price'] = df_combined3['y'] * (1 + df_combined3['quantile_90'])
|
||||
|
||||
# # 遍历行
|
||||
# def find_min_max_within_quantile(row):
|
||||
# # 获取分位数10%和90%的值
|
||||
# q10 = row['quantile_10_price']
|
||||
# q90 = row['quantile_90_price']
|
||||
|
||||
# # 判断flot值是否为空值
|
||||
# if pd.isna(q10) or pd.isna(q90):
|
||||
# return pd.Series([None, None, None, None], index=['min_within_quantile','max_within_quantile','min_model','max_model'])
|
||||
|
||||
# # 初始化最小和最大值为None
|
||||
# min_value = None
|
||||
# max_value = None
|
||||
# min_value_model = ''
|
||||
# max_value_model = ''
|
||||
|
||||
|
||||
# # 遍历指定列,找出在分位数范围内的最大最小值
|
||||
# for model in modelnames:
|
||||
# value = row[model]
|
||||
# if value >= q10 and value <= q90:
|
||||
# if min_value is None or value < min_value:
|
||||
# min_value = value
|
||||
# min_value_model = model
|
||||
|
||||
# if max_value is None or value > max_value:
|
||||
# max_value = value
|
||||
# max_value_model = model
|
||||
|
||||
# # 返回最大最小值
|
||||
# return pd.Series([min_value, max_value,min_value_model,max_value_model], index=['min_within_quantile', 'max_within_quantile','min_model','max_model'])
|
||||
|
||||
# # 应用函数到每一行
|
||||
# df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
|
||||
|
||||
# 去除有空值的行
|
||||
df_combined3.dropna(inplace=True)
|
||||
# 保存到数据库
|
||||
df_combined3.to_sql('testandpredict_groupby', sqlitedb.connection, if_exists='replace', index=False)
|
||||
df_combined3.to_csv(os.path.join(dataset,"testandpredict_groupby.csv"),index=False)
|
||||
|
||||
|
||||
# 空的列表存储每个模型的MSE、RMSE、MAE、MAPE、SMAPE
|
||||
cellText = []
|
||||
|
||||
# 遍历模型名称,计算模型评估指标
|
||||
for model in modelnames:
|
||||
modelmse = mse(df_combined['y'], df_combined[model])
|
||||
modelrmse = rmse(df_combined['y'], df_combined[model])
|
||||
modelmae = mae(df_combined['y'], df_combined[model])
|
||||
# modelmape = mape(df_combined['y'], df_combined[model])
|
||||
# modelsmape = smape(df_combined['y'], df_combined[model])
|
||||
# modelr2 = r2_score(df_combined['y'], df_combined[model])
|
||||
cellText.append([model,round(modelmse, 3), round(modelrmse, 3), round(modelmae, 3)])
|
||||
|
||||
model_results3 = pd.DataFrame(cellText,columns=['模型(Model)','平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)'])
|
||||
# Sort by MSE in ascending order (best model first)
|
||||
model_results3 = model_results3.sort_values(by='平均平方误差(MSE)', ascending=True)
|
||||
model_results3.to_csv(os.path.join(dataset,"model_evaluation.csv"),index=False)
|
||||
modelnames = model_results3['模型(Model)'].tolist()
|
||||
allmodelnames = modelnames.copy()
|
||||
# 保存5个最佳模型的名称
|
||||
if len(modelnames) > 5:
|
||||
modelnames = modelnames[0:5]
|
||||
with open(os.path.join(dataset,"best_modelnames.txt"), 'w') as f:
|
||||
f.write(','.join(modelnames) + '\n')
|
||||
|
||||
# 预测值与真实值对比图
|
||||
plt.rcParams['font.sans-serif'] = ['SimHei']
|
||||
plt.figure(figsize=(15, 10))
|
||||
# 设置有5个子图的画布
|
||||
for n,model in enumerate(modelnames):
|
||||
plt.subplot(3, 2, n+1)
|
||||
plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值')
|
||||
plt.plot(df_combined3['ds'], df_combined3[model], label=model)
|
||||
plt.legend()
|
||||
plt.xlabel('日期')
|
||||
plt.ylabel('价格')
|
||||
plt.title(model+'拟合')
|
||||
plt.subplots_adjust(hspace=0.5)
|
||||
plt.savefig(os.path.join(dataset,'预测值与真实值对比图.png'), bbox_inches='tight')
|
||||
plt.close()
|
||||
|
||||
# 历史数据+预测数据
|
||||
# 拼接未来时间预测
|
||||
df_predict = loadcsv(os.path.join(dataset,'predict.csv'))
|
||||
df_predict.drop('unique_id',inplace=True,axis=1)
|
||||
df_predict.dropna(axis=1,inplace=True)
|
||||
|
||||
try:
|
||||
df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y-%m-%d')
|
||||
except ValueError :
|
||||
df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y/%m/%d')
|
||||
|
||||
# 取第一行数据存储到数据库中
|
||||
first_row = df_predict.head(1)
|
||||
first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00')
|
||||
# 将预测结果保存到数据库
|
||||
if not sqlitedb.check_table_exists('trueandpredict'):
|
||||
first_row.to_sql('trueandpredict',sqlitedb.connection,index=False)
|
||||
else:
|
||||
for row in first_row.itertuples(index=False):
|
||||
row_dict = row._asdict()
|
||||
columns=row_dict.keys()
|
||||
for col in columns:
|
||||
sqlitedb.add_column_if_not_exists('trueandpredict',col,'TEXT')
|
||||
check_query = sqlitedb.select_data('trueandpredict',where_condition = f"ds = '{row.ds}'")
|
||||
if len(check_query) > 0:
|
||||
set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()])
|
||||
sqlitedb.update_data('trueandpredict',set_clause,where_condition = f"ds = '{row.ds}'")
|
||||
continue
|
||||
sqlitedb.insert_data('trueandpredict',tuple(row_dict.values()),columns=columns)
|
||||
|
||||
|
||||
|
||||
# 最多频率的模型名称
|
||||
min_model_max_frequency_model = df_combined3['min_model'].value_counts().idxmax()
|
||||
max_model_max_frequency_model = df_combined3['max_model'].value_counts().idxmax()
|
||||
df_predict['min_model'] = min_model_max_frequency_model
|
||||
df_predict['max_model'] = max_model_max_frequency_model
|
||||
df_predict['min_within_quantile'] = df_predict[min_model_max_frequency_model]
|
||||
df_predict['max_within_quantile'] = df_predict[max_model_max_frequency_model]
|
||||
|
||||
df_predict2 = df_predict.copy()
|
||||
df_predict2['ds'] = df_predict2['ds'].dt.strftime('%Y-%m-%d 00:00:00')
|
||||
|
||||
|
||||
# 将预测结果保存到数据库
|
||||
# 判断表存在
|
||||
if not sqlitedb.check_table_exists('testandpredict_groupby'):
|
||||
df_predict2.to_sql('testandpredict_groupby',sqlitedb.connection,index=False)
|
||||
else:
|
||||
for row in df_predict2.itertuples(index=False):
|
||||
row_dict = row._asdict()
|
||||
check_query = sqlitedb.select_data('testandpredict_groupby',where_condition = f"ds = '{row.ds}'")
|
||||
if len(check_query) > 0:
|
||||
set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()])
|
||||
sqlitedb.update_data('testandpredict_groupby',set_clause,where_condition = f"ds = '{row.ds}'")
|
||||
continue
|
||||
sqlitedb.insert_data('testandpredict_groupby',tuple(row_dict.values()),columns=row_dict.keys())
|
||||
|
||||
|
||||
|
||||
# 计算每个预测值与真实值之间的偏差率
|
||||
for model in allmodelnames:
|
||||
df_combined3[f'{model}_abs_error_rate'] = abs(df_combined3['y'] - df_combined3[model]) / df_combined3['y']
|
||||
|
Loading…
Reference in New Issue
Block a user