聚烯烃预测图使用所有模型预测结果80%的置信度

2024-11-06 14:42:54 +08:00 · 2024-11-06 14:42:54 +08:00 · ba237cb657
commit ba237cb657
parent c7d0444c4a
5 changed files with 2138 additions and 1972 deletions
--- a/config_juxiting.py
+++ b/config_juxiting.py
@ -210,8 +210,8 @@ upload_data = {

 ### 开关
 is_train = True # 是否训练
-is_debug = False # 是否调试
-is_eta = True # 是否使用eta接口
+is_debug = True # 是否调试
+is_eta = False # 是否使用eta接口
 is_timefurture = True # 是否使用时间特征
 is_fivemodels = False # 是否使用之前保存的最佳的5个模型
 is_edbcode = False # 特征使用edbcoding列表中的
--- a/debugdemo.ipynb
+++ b/debugdemo.ipynb
--- a/lib/dataread.py
+++ b/lib/dataread.py
@ -20,8 +20,8 @@ plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
 plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号

 from datetime import timedelta
-from config_jingbo import *
-# from config_juxiting import *
+# from config_jingbo import *
+from config_juxiting import *
 from sklearn import metrics
 from reportlab.pdfbase import pdfmetrics   # 注册字体
 from reportlab.pdfbase.ttfonts import TTFont # 字体类
@ -225,17 +225,17 @@ def featureAnalysis(df,dataset,y):
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False  
    plt.figure(figsize=(10, 10))
-    # 遍历X每一列，和yy画散点图 ，
-    for i, col in enumerate(X.columns):
-        plt.subplot(2, 2, i%4+1)
-        plt.scatter(X[col], yy)
-        plt.xlabel(col)
-        plt.ylabel(y)
-        plt.title(col)
-        if i % 4 == 3  or i == len(X.columns)-1:
-            plt.tight_layout()
-            plt.savefig(os.path.join(dataset,f'{i}指标数据特征与价格散点图.png'))
-            plt.close()
+    # # 遍历X每一列，和yy画散点图 ，
+    # for i, col in enumerate(X.columns):
+    #     plt.subplot(2, 2, i%4+1)
+    #     plt.scatter(X[col], yy)
+    #     plt.xlabel(col)
+    #     plt.ylabel(y)
+    #     plt.title(col)
+    #     if i % 4 == 3  or i == len(X.columns)-1:
+    #         plt.tight_layout()
+    #         plt.savefig(os.path.join(dataset,f'{i}指标数据特征与价格散点图.png'))
+    #         plt.close()

    

--- a/main.py
+++ b/main.py
@ -1,10 +1,10 @@
 # 读取配置
-from config_jingbo import *
+# from config_jingbo import *
 # from config_tansuanli import *
-# from config_juxiting import *
+from config_juxiting import *
 from lib.dataread import *
 from lib.tools import *
-from models.nerulforcastmodels import ex_Model,model_losss,brent_export_pdf,tansuanli_export_pdf,pp_export_pdf
+from models.nerulforcastmodels import ex_Model,model_losss,brent_export_pdf,tansuanli_export_pdf,pp_export_pdf,model_losss_juxiting

 import glob
 import torch
@ -118,36 +118,36 @@ def predict_main():
    row,col = df.shape
    
    now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
-    ex_Model(df,
-            horizon=horizon,
-            input_size=input_size,
-            train_steps=train_steps,
-            val_check_steps=val_check_steps,
-            early_stop_patience_steps=early_stop_patience_steps,
-            is_debug=is_debug,
-            dataset=dataset,
-            is_train=is_train,
-            is_fivemodels=is_fivemodels,
-            val_size=val_size,
-            test_size=test_size,
-            settings=settings,
-            now=now,
-            etadata = etadata,
-            modelsindex = modelsindex,
-            data = data,
-            is_eta=is_eta,
-            )
+    # ex_Model(df,
+    #         horizon=horizon,
+    #         input_size=input_size,
+    #         train_steps=train_steps,
+    #         val_check_steps=val_check_steps,
+    #         early_stop_patience_steps=early_stop_patience_steps,
+    #         is_debug=is_debug,
+    #         dataset=dataset,
+    #         is_train=is_train,
+    #         is_fivemodels=is_fivemodels,
+    #         val_size=val_size,
+    #         test_size=test_size,
+    #         settings=settings,
+    #         now=now,
+    #         etadata = etadata,
+    #         modelsindex = modelsindex,
+    #         data = data,
+    #         is_eta=is_eta,
+    #         )

    # # 模型评估
-    model_results3 = model_losss(sqlitedb)
+    model_results3 = model_losss_juxiting(sqlitedb)
    # 模型报告
    
-    title = f'{settings}--{now}-预测报告' # 报告标题
-    brent_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,
-                reportname=reportname,sqlitedb=sqlitedb),
-    # pp_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,
-                # reportname=reportname),
-    logger.info('模型训练完成')
+    # title = f'{settings}--{now}-预测报告' # 报告标题
+    # brent_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,
+    #             reportname=reportname,sqlitedb=sqlitedb),
+    # # pp_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time,
+    #             # reportname=reportname),
+    # logger.info('模型训练完成')
    
    # tansuanli_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,end_time=end_time,reportname=reportname)
    
--- a/models/nerulforcastmodels.py
+++ b/models/nerulforcastmodels.py
@ -396,6 +396,301 @@ def model_losss(sqlitedb):

    

+    # 计算每个预测值与真实值之间的偏差率
+    for model in allmodelnames:
+        df_combined3[f'{model}_abs_error_rate'] = abs(df_combined3['y'] - df_combined3[model]) / df_combined3['y']
+
+    # 获取每行对应的最小偏差率值
+    min_abs_error_rate_values = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1)
+    # 获取每行对应的最小偏差率值对应的列名
+    min_abs_error_rate_column_name = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1)
+    # 将列名索引转换为列名
+    min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0])
+    # 获取最小偏差率对应的模型的预测值
+    min_abs_error_rate_predictions = df_combined3.apply(lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1)
+    # 将最小偏差率对应的模型的预测值添加到DataFrame中
+    df_combined3['min_abs_error_rate_prediction'] = min_abs_error_rate_predictions
+    df_combined3['min_abs_error_rate_column_name'] = min_abs_error_rate_column_name
+    df_combined3 = pd.concat([df_combined3, df_predict]).reset_index(drop=True)
+    # 判断 df 的数值列转为float
+    for col in df_combined3.columns:
+        try:
+            if col != 'ds':
+                df_combined3[col] = df_combined3[col].astype(float)
+                df_combined3[col] = df_combined3[col].round(2)
+        except ValueError:
+            pass
+    df_combined3.to_csv(os.path.join(dataset,"df_combined3.csv"),index=False) 
+    
+     # 历史价格+预测价格
+    df_combined3 = df_combined3[-50:] # 取50个数据点画图
+    # 历史价格
+    plt.figure(figsize=(20, 10))
+    plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值')
+    # 颜色填充
+    plt.fill_between(df_combined3['ds'], df_combined3['min_within_quantile'], df_combined3['max_within_quantile'], alpha=0.2)
+    # plt.plot(df_combined3['ds'], df_combined3['min_abs_error_rate_prediction'], label='最小绝对误差', linestyle='--', color='orange')
+    # 网格
+    plt.grid(True)
+    # 显示历史值
+    for i, j in zip(df_combined3['ds'], df_combined3['y']):
+        plt.text(i, j, str(j), ha='center', va='bottom')
+
+    # 数据库查询最佳模型名称
+    most_model = [sqlitedb.select_data('most_model',columns=['most_common_model'],order_by='ds desc',limit=1).values[0][0]]
+
+    for model in most_model:
+        plt.plot(df_combined3['ds'], df_combined3[model], label=model,marker='o')
+    # 当前日期画竖虚线
+    plt.axvline(x=df_combined3['ds'].iloc[-horizon], color='r', linestyle='--')
+    plt.legend()
+    plt.xlabel('日期')
+    plt.ylabel('价格')
+    
+    plt.savefig(os.path.join(dataset,'历史价格-预测值.png'), bbox_inches='tight')
+    plt.close()
+       
+    # 预测值表格
+    fig, ax = plt.subplots(figsize=(20, 6))
+    ax.axis('off')  # 关闭坐标轴
+    # 数值保留2位小数
+    df_combined3 = df_combined3.round(2)
+    df_combined3 = df_combined3[-horizon:]
+    df_combined3['Day'] = [f'Day_{i}' for i in range(1,horizon+1)]
+    # Day列放到最前面
+    df_combined3 = df_combined3[['Day'] + list(df_combined3.columns[:-1])]
+    table = ax.table(cellText=df_combined3.values, colLabels=df_combined3.columns, loc='center')
+    #加宽表格
+    table.auto_set_font_size(False)
+    table.set_fontsize(10)
+
+    # 设置表格样式，列数据最小的用绿色标识
+    plt.savefig(os.path.join(dataset,'预测值表格.png'), bbox_inches='tight')
+    plt.close()
+    # plt.show()
+       
+    # 可视化评估结果
+    plt.rcParams['font.sans-serif'] = ['SimHei']
+    fig, ax = plt.subplots(figsize=(20, 10))
+    ax.axis('off')  # 关闭坐标轴
+    table = ax.table(cellText=model_results3.values, colLabels=model_results3.columns, loc='center')
+    # 加宽表格
+    table.auto_set_font_size(False)
+    table.set_fontsize(10)
+
+    # 设置表格样式，列数据最小的用绿色标识
+    plt.savefig(os.path.join(dataset,'模型评估.png'), bbox_inches='tight')
+    plt.close()
+    return model_results3
+
+# 计算预测评估指数
+def model_losss_juxiting(sqlitedb):
+    global dataset
+    # 预测数据处理 predict
+    df_combined = loadcsv(os.path.join(dataset,"cross_validation.csv"))  
+    df_combined = dateConvert(df_combined)
+    # 删除空列
+    df_combined.dropna(axis=1,inplace=True)
+     # 删除缺失值,预测过程不能有缺失值
+    df_combined.dropna(inplace=True) 
+    # 其他列转为数值类型
+    df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in ['cutoff','ds'] })
+    # For each ds group, take the maximum cutoff via groupby + transform('max') and store it in a new max_cutoff column
+    df_combined['max_cutoff'] = df_combined.groupby('ds')['cutoff'].transform('max')
+
+    # 然后筛选出那些 cutoff 等于 max_cutoff 的行，这样就得到了每个分组中 cutoff 最大的行，并保留了其他列
+    df_combined = df_combined[df_combined['cutoff'] == df_combined['max_cutoff']]
+    # 删除模型生成的cutoff列
+    df_combined.drop(columns=['cutoff', 'max_cutoff'], inplace=True)
+    # 获取模型名称
+    modelnames  = df_combined.columns.to_list()[2:] 
+    if 'y' in modelnames:
+        modelnames.remove('y')
+    df_combined3 = df_combined.copy()  # 备份df_combined,后面画图需要
+
+    # 每行预测值找到10%分位数和90%分位数
+    def find_min_max_within_quantile(row):
+        """Return the 10%/90% quantiles of one row's model predictions.
+
+        Args:
+            row: One row of ``df_combined3`` as passed by
+                ``DataFrame.apply(axis=1)``; every label other than
+                ``ds`` and ``y`` is treated as a model prediction.
+
+        Returns:
+            pd.Series labelled ``min_within_quantile``,
+            ``max_within_quantile``, ``min_model`` and ``max_model``: the
+            two quantile values and the names of the models whose
+            predictions lie closest to them. All four are ``None`` when
+            the row holds no usable prediction.
+
+        Raises:
+            KeyError: If ``ds`` or ``y`` is missing from the row.
+        """
+        labels = ['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model']
+        # Work on a copy: mutating the row handed over by apply() is unsafe.
+        preds = row.drop(['ds', 'y']).astype(float).dropna()
+        if preds.empty:
+            # The caller drops rows containing nulls right after the apply.
+            return pd.Series([None, None, None, None], index=labels)
+        q10 = preds.quantile(0.1)
+        q90 = preds.quantile(0.9)
+        # quantile() interpolates linearly by default, so q10/q90 usually
+        # equal no single prediction and an exact-match filter would be
+        # empty (idxmin then raises ValueError). Pick the closest model
+        # instead; on an exact match this is still the first matching model.
+        min_model = (preds - q10).abs().idxmin()
+        max_model = (preds - q90).abs().idxmin()
+        return pd.Series([q10, q90, min_model, max_model], index=labels)
+
+    # 遍历行
+    df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
+    df_combined = df_combined.round(4)
+    print(df_combined3)
+    
+    
+
+
+
+
+
+
+
+
+
+
+    # # 计算波动率
+    # df_combined3['volatility'] = df_combined3['y'].pct_change().round(4)
+    # # 计算近60日的波动率 10% 90%分位数
+    # df_combined3['quantile_10'] = df_combined3['volatility'].rolling(60).quantile(0.1)
+    # df_combined3['quantile_90'] = df_combined3['volatility'].rolling(60).quantile(0.9)
+    # df_combined3 = df_combined3.round(4)
+    # # 计算分位数对应的价格
+    # df_combined3['quantile_10_price'] = df_combined3['y'] * (1 + df_combined3['quantile_10'])
+    # df_combined3['quantile_90_price'] = df_combined3['y'] * (1 + df_combined3['quantile_90'])
+
+    # # 遍历行
+    # def find_min_max_within_quantile(row):
+    #     # 获取分位数10%和90%的值
+    #     q10 = row['quantile_10_price']
+    #     q90 = row['quantile_90_price']
+        
+    #     # 判断flot值是否为空值
+    #     if pd.isna(q10) or pd.isna(q90):
+    #         return pd.Series([None, None, None, None], index=['min_within_quantile','max_within_quantile','min_model','max_model'])
+        
+    #     # 初始化最小和最大值为None
+    #     min_value = None
+    #     max_value = None
+    #     min_value_model = ''
+    #     max_value_model = ''
+
+        
+    #     # 遍历指定列，找出在分位数范围内的最大最小值
+    #     for model in modelnames:
+    #         value = row[model]
+    #         if value >= q10 and value <= q90:
+    #             if min_value is None or value < min_value:
+    #                 min_value = value
+    #                 min_value_model = model
+
+    #             if max_value is None or value > max_value:
+    #                 max_value = value
+    #                 max_value_model = model
+        
+    #     # 返回最大最小值
+    #     return pd.Series([min_value, max_value,min_value_model,max_value_model], index=['min_within_quantile', 'max_within_quantile','min_model','max_model'])
+
+    # # 应用函数到每一行
+    # df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
+
+    # 去除有空值的行
+    df_combined3.dropna(inplace=True)
+    # 保存到数据库
+    df_combined3.to_sql('testandpredict_groupby', sqlitedb.connection, if_exists='replace', index=False)
+    df_combined3.to_csv(os.path.join(dataset,"testandpredict_groupby.csv"),index=False)
+
+
+    # 空的列表存储每个模型的MSE、RMSE、MAE、MAPE、SMAPE
+    cellText = []
+
+    # 遍历模型名称，计算模型评估指标  
+    for model in modelnames:
+        modelmse = mse(df_combined['y'], df_combined[model])
+        modelrmse = rmse(df_combined['y'], df_combined[model])
+        modelmae = mae(df_combined['y'], df_combined[model])
+        # modelmape = mape(df_combined['y'], df_combined[model])
+        # modelsmape = smape(df_combined['y'], df_combined[model])
+        # modelr2 = r2_score(df_combined['y'], df_combined[model])
+        cellText.append([model,round(modelmse, 3), round(modelrmse, 3), round(modelmae, 3)])
+        
+    model_results3 = pd.DataFrame(cellText,columns=['模型(Model)','平均平方误差(MSE)', '均方根误差(RMSE)',  '平均绝对误差(MAE)'])
+    # Sort by MSE in ascending order (lowest error first)
+    model_results3 = model_results3.sort_values(by='平均平方误差(MSE)', ascending=True)
+    model_results3.to_csv(os.path.join(dataset,"model_evaluation.csv"),index=False)
+    modelnames = model_results3['模型(Model)'].tolist()
+    allmodelnames = modelnames.copy()
+    # 保存5个最佳模型的名称
+    if len(modelnames) > 5:
+        modelnames = modelnames[0:5]
+    with open(os.path.join(dataset,"best_modelnames.txt"), 'w') as f:
+        f.write(','.join(modelnames) + '\n')
+    
+    # 预测值与真实值对比图
+    plt.rcParams['font.sans-serif'] = ['SimHei']
+    plt.figure(figsize=(15, 10))
+    # 设置有5个子图的画布
+    for n,model in enumerate(modelnames):
+        plt.subplot(3, 2, n+1)
+        plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值')
+        plt.plot(df_combined3['ds'], df_combined3[model], label=model)
+        plt.legend()
+        plt.xlabel('日期')
+        plt.ylabel('价格')
+        plt.title(model+'拟合')
+    plt.subplots_adjust(hspace=0.5)
+    plt.savefig(os.path.join(dataset,'预测值与真实值对比图.png'), bbox_inches='tight')
+    plt.close()
+    
+    # 历史数据+预测数据
+    # 拼接未来时间预测
+    df_predict  = loadcsv(os.path.join(dataset,'predict.csv'))
+    df_predict.drop('unique_id',inplace=True,axis=1)
+    df_predict.dropna(axis=1,inplace=True)
+
+    try:
+        df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y-%m-%d')
+    except ValueError :
+        df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y/%m/%d')
+
+    # 取第一行数据存储到数据库中
+    first_row = df_predict.head(1)
+    first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00')
+    # 将预测结果保存到数据库
+    if not sqlitedb.check_table_exists('trueandpredict'):
+        first_row.to_sql('trueandpredict',sqlitedb.connection,index=False)
+    else:
+        for row in first_row.itertuples(index=False):
+            row_dict = row._asdict()
+            columns=row_dict.keys()
+            for col in columns:
+                sqlitedb.add_column_if_not_exists('trueandpredict',col,'TEXT')
+            check_query = sqlitedb.select_data('trueandpredict',where_condition = f"ds = '{row.ds}'")
+            if len(check_query) > 0:
+                set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()])
+                sqlitedb.update_data('trueandpredict',set_clause,where_condition = f"ds = '{row.ds}'")
+                continue
+            sqlitedb.insert_data('trueandpredict',tuple(row_dict.values()),columns=columns)
+
+    
+
+    # 最多频率的模型名称
+    min_model_max_frequency_model = df_combined3['min_model'].value_counts().idxmax()
+    max_model_max_frequency_model = df_combined3['max_model'].value_counts().idxmax()
+    df_predict['min_model'] = min_model_max_frequency_model
+    df_predict['max_model'] = max_model_max_frequency_model
+    df_predict['min_within_quantile'] = df_predict[min_model_max_frequency_model]
+    df_predict['max_within_quantile'] = df_predict[max_model_max_frequency_model]
+    
+    df_predict2 = df_predict.copy()
+    df_predict2['ds'] = df_predict2['ds'].dt.strftime('%Y-%m-%d 00:00:00')
+
+
+    # 将预测结果保存到数据库
+    # 判断表存在
+    if not sqlitedb.check_table_exists('testandpredict_groupby'):
+        df_predict2.to_sql('testandpredict_groupby',sqlitedb.connection,index=False)
+    else:
+        for row in df_predict2.itertuples(index=False):
+            row_dict = row._asdict()
+            check_query = sqlitedb.select_data('testandpredict_groupby',where_condition = f"ds = '{row.ds}'")
+            if len(check_query) > 0:
+                set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()])
+                sqlitedb.update_data('testandpredict_groupby',set_clause,where_condition = f"ds = '{row.ds}'")
+                continue
+            sqlitedb.insert_data('testandpredict_groupby',tuple(row_dict.values()),columns=row_dict.keys())
+
+    
+
    # 计算每个预测值与真实值之间的偏差率
    for model in allmodelnames:
        df_combined3[f'{model}_abs_error_rate'] = abs(df_combined3['y'] - df_combined3[model]) / df_combined3['y']