聚烯烃月度预测数据处理

2025-08-11 16:16:52 +08:00 · 2025-08-11 16:16:52 +08:00 · 8a2cf77639
commit 8a2cf77639
parent b8ca24e07b
5 changed files with 112 additions and 80 deletions
--- a/config_juxiting.py
+++ b/config_juxiting.py
@@ -434,7 +434,7 @@ DEFAULT_CONFIG = {
 # 开关
 is_train = True  # 是否训练
 is_debug = False  # 是否调试
-is_eta = True  # 是否使用eta接口
+is_eta = False  # 是否使用eta接口
 is_market = True  # 是否通过市场信息平台获取特征 ,在is_eta 为true 的情况下生效
 is_timefurture = True  # 是否使用时间特征
 is_fivemodels = False  # 是否使用之前保存的最佳的5个模型
--- a/config_juxiting_yuedu.py
+++ b/config_juxiting_yuedu.py
@@ -460,7 +460,7 @@ DEFAULT_CONFIG = {
 # 开关
 is_train = True  # 是否训练
 is_debug = False  # 是否调试
-is_eta = True  # 是否使用eta接口
+is_eta = False  # 是否使用eta接口
 is_market = True  # 是否通过市场信息平台获取特征 ,在is_eta 为true 的情况下生效
 is_timefurture = True  # 是否使用时间特征
 is_fivemodels = False  # 是否使用之前保存的最佳的5个模型
--- a/lib/dataread.py
+++ b/lib/dataread.py
@@ -568,44 +568,66 @@ def feature_importance(X_train, y_train):
    temp = XGBRegressor()
    temp.fit(X_train, y_train)

-    ax = plot_importance(temp)
-    fig = ax.figure
-    fig.set_size_inches(8, 7)
+    # 获取特征重要性
+    importances = temp.feature_importances_
+    indices = np.argsort(importances)[::-1]  # 按重要性降序排列

-    # 修改图的标题，添加模型名称
-    title = '特征重要度1'  # 替换为你的模型名称
-    ax.set_title(title)
+    # 获取前10个特征的索引和重要性
+    top_indices = indices[:10]
+    top_importances = importances[top_indices]
+    top_features = [X_train.columns[i] for i in top_indices]

+    # 计算百分比（相对于所有特征的总和）
+    total_all_importance = sum(importances)  # 所有特征的重要性总和
+    percentages = (top_importances / total_all_importance) * 100
+
+    # 绘制特征重要性图
+    plt.figure(figsize=(10, 8))
+    ax = plt.gca()
+
+    # 绘制条形图
+    for i, (importance, percentage) in enumerate(zip(top_importances, percentages)):
+        ax.barh(i, importance, color='skyblue')
+        ax.text(importance + 0.01, i, f'{percentage:.1f}%', va='center')
+
+    # 设置y轴标签和标题
+    ax.set_yticks(range(len(top_features)))
+    ax.set_yticklabels(top_features)
+    ax.set_xlabel('特征重要性')
+    ax.set_title('特征重要性排序（前10位）')
+
+    # 调整布局并显示
+    plt.tight_layout()
    # plt.show()
    # 保存图片
    plt.savefig(os.path.join(config.dataset, '特征重要度1.png'))
    plt.close()
    config.logger.info('特征重要度1.png 已保存')

-    # 创建一个 LGBMRegressor 对象并训练模型
-    regressor = lgb.LGBMRegressor()
-    regressor.fit(X_train, y_train)
+    # # 创建一个 LGBMRegressor 对象并训练模型
+    # regressor = lgb.LGBMRegressor()
+    # regressor.fit(X_train, y_train)

-    # 设置图形大小（可选）
-    plt.figure(figsize=(30, 40))
-    # 使用 plot_importance 函数来绘制特征重要性
-    # 注意：在一些版本的 LightGBM 中，你可以直接传入模型对象
-    ax = importance_plot = lgb.plot_importance(
-        regressor, importance_type='gain')  # 或者 'split'
-    # 设置标题和字体大小
-    ax.set_title('Feature Importance - LGBMRegressor', fontsize=12)
-    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
-                 ax.get_xticklabels() + ax.get_yticklabels()):
-        item.set_fontsize(9)  # 设置 x 轴、y 轴标签以及刻度标签的字体大小
-    # 修改图的标题，添加模型名称
-    # title = 'Feature Importance - LGBMRegressor '  # 替换为你的模型名称
-    # ax.set_title(title)
-    # 保存图片
-    plt.savefig(os.path.join(config.dataset, '特征重要度2.png'))
+    # # 设置图形大小（可选）
+    # plt.figure(figsize=(30, 40))
+    # # 使用 plot_importance 函数来绘制特征重要性
+    # # 注意：在一些版本的 LightGBM 中，你可以直接传入模型对象
+    # ax = importance_plot = lgb.plot_importance(
+    #     regressor, importance_type='gain')  # 或者 'split'
+    # # 设置标题和字体大小
+    # ax.set_title('Feature Importance - LGBMRegressor', fontsize=12)
+    # for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
+    #              ax.get_xticklabels() + ax.get_yticklabels()):
+    #     item.set_fontsize(9)  # 设置 x 轴、y 轴标签以及刻度标签的字体大小
+    # # 修改图的标题，添加模型名称
+    # # title = 'Feature Importance - LGBMRegressor '  # 替换为你的模型名称
+    # # ax.set_title(title)
+    # # 保存图片
+    # plt.savefig(os.path.join(config.dataset, '特征重要度2.png'))

-    # 显示图形
-    plt.close()
-    config.logger.info('特征重要度2.png 已保存')
+    # # 显示图形
+    # plt.close()
+    # config.logger.info('特征重要度2.png 已保存')


 def corr_feature(df):
@@ -807,7 +829,7 @@ def calculate_kdj(data, n=9):

 def calculate_correlation(df):
    try:
-        yy = df['y']
+        yy = df['y'][-30:]
        # 去掉ds y
        df = df.drop(columns=['ds', 'y'])
        # 计算相关系数
@@ -1088,25 +1110,26 @@ def datachuli_juxiting(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_t
    df = df[df['ds'] <= end_time]
    config.logger.info(f'删除两月不更新特征前数据量：{df.shape}')
    # 去掉近最后数据对应的日期在两月以前的列，删除近2月的数据是常数的列
-    # current_date = datetime.datetime.now()
-    # two_months_ago = current_date - timedelta(days=40)
+    current_date = datetime.datetime.strptime(
+        global_config['end_time'], '%Y-%m-%d')
+    two_months_ago = current_date - timedelta(days=40)
    # 检查两月不更新的特征

-    # def check_column(col_name):
-    #     if 'ds' in col_name or 'y' in col_name:
-    #         return False
-    #     df_check_column = df[['ds', col_name]]
-    #     df_check_column = df_check_column.dropna()
-    #     if len(df_check_column) == 0:
-    #         return True
-    #     if df_check_column[(df_check_column['ds'] >= two_months_ago)].groupby(col_name).ngroups < 2:
-    #         return True
-    #     corresponding_date = df_check_column.iloc[-1]['ds']
-    #     return corresponding_date < two_months_ago
-    # columns_to_drop = df.columns[df.columns.map(check_column)].tolist()
-    # df = df.drop(columns=columns_to_drop)
+    def check_column(col_name):
+        if 'ds' in col_name or 'y' in col_name:
+            return False
+        df_check_column = df[['ds', col_name]]
+        df_check_column = df_check_column.dropna()
+        if len(df_check_column) == 0:
+            return True
+        if df_check_column[(df_check_column['ds'] >= two_months_ago)].groupby(col_name).ngroups < 2:
+            return True
+        corresponding_date = df_check_column.iloc[-1]['ds']
+        return corresponding_date < two_months_ago
+    columns_to_drop = df.columns[df.columns.map(check_column)].tolist()
+    df = df.drop(columns=columns_to_drop)

-    # config.logger.info(f'删除两月不更新特征后数据量：{df.shape}')
+    config.logger.info(f'删除两月不更新特征后数据量：{df.shape}')

    # 衍生时间特征
    if is_timefurture:
--- a/main_juxiting_yuedu.py
+++ b/main_juxiting_yuedu.py
@@ -94,26 +94,32 @@ global_config.update({
 def push_market_value():
    logger.info('发送预测结果到市场信息平台')
    current_end_time = global_config['end_time']
-    previous_trading_day = (pd.Timestamp(current_end_time) - 
-                           pd.tseries.offsets.BusinessDay(1)).strftime('%Y-%m-%d')
+    previous_trading_day = (pd.Timestamp(current_end_time) -
+                            pd.tseries.offsets.BusinessDay(1)).strftime('%Y-%m-%d')

    # 读取预测数据和模型评估数据
    best_bdwd_price = find_best_models(
        date=previous_trading_day, global_config=global_config)
-    
-     # 获取本月最佳模型的预测价格
-    four_month_predict_price = pd.read_csv(os.path.join(global_config['dataset'], 'predict.csv'))
-    four_month_predict_price['ds'] = pd.to_datetime(four_month_predict_price['ds'])
+
+    # 获取本月最佳模型的预测价格
+    four_month_predict_price = pd.read_csv(
+        os.path.join(global_config['dataset'], 'predict.csv'))
+    four_month_predict_price['ds'] = pd.to_datetime(
+        four_month_predict_price['ds'])
    # 设置索引 次月 次二月 次三月 次四月
    index_labels = ["次月", "次二月", "次三月", "次四月"]
-    four_month_predict_price.index = index_labels 
+    four_month_predict_price.index = index_labels
    global_config['logger'].info(f"best_bdwd_price: {best_bdwd_price}")

    # 准备要推送的数据
-    ciyue_mean = four_month_predict_price[best_bdwd_price['next_month_price']['model_name']].iloc[0]
-    cieryue_mean = four_month_predict_price[best_bdwd_price['next_february_price']['model_name']].iloc[1]
-    cisanyue_mean = four_month_predict_price[best_bdwd_price['next_march_price']['model_name']].iloc[2]
-    cisieryue_mean = four_month_predict_price[best_bdwd_price['next_april_price']['model_name']].iloc[3]
+    ciyue_mean = four_month_predict_price[best_bdwd_price['next_month_price']
+                                          ['model_name']].iloc[0]
+    cieryue_mean = four_month_predict_price[best_bdwd_price['next_february_price']
+                                            ['model_name']].iloc[1]
+    cisanyue_mean = four_month_predict_price[best_bdwd_price['next_march_price']
+                                             ['model_name']].iloc[2]
+    cisieryue_mean = four_month_predict_price[best_bdwd_price['next_april_price']
+                                              ['model_name']].iloc[3]
    # # 保留两位小数
    ciyue_mean = round(ciyue_mean, 2)
    cieryue_mean = round(cieryue_mean, 2)
@@ -331,8 +337,8 @@ def predict_main():
    else:
        # 读取数据
        logger.info('读取本地数据：' + os.path.join(dataset, data_set))
-        df, df_zhibiaoliebiao = getdata_zhoudu_juxiting(filename=os.path.join(dataset, data_set), y=y, dataset=dataset, add_kdj=add_kdj,
-                                                        is_timefurture=is_timefurture, end_time=end_time)  # 原始数据，未处理
+        df, df_zhibiaoliebiao = getdata_juxiting(filename=os.path.join(dataset, data_set), y=y, dataset=dataset, add_kdj=add_kdj,
+                                                 is_timefurture=is_timefurture, end_time=end_time)  # 原始数据，未处理

    # 更改预测列名称
    df.rename(columns={y: 'y'}, inplace=True)
@@ -450,25 +456,25 @@ def predict_main():

    now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    ex_Model_Juxiting(df,
-             horizon=global_config['horizon'],
-             input_size=global_config['input_size'],
-             train_steps=global_config['train_steps'],
-             val_check_steps=global_config['val_check_steps'],
-             early_stop_patience_steps=global_config['early_stop_patience_steps'],
-             is_debug=global_config['is_debug'],
-             dataset=global_config['dataset'],
-             is_train=global_config['is_train'],
-             is_fivemodels=global_config['is_fivemodels'],
-             val_size=global_config['val_size'],
-             test_size=global_config['test_size'],
-             settings=global_config['settings'],
-             now=now,
-             etadata=etadata,
-             modelsindex=global_config['modelsindex'],
-             data=data,
-             is_eta=global_config['is_eta'],
-             end_time=global_config['end_time'],
-             )
+                      horizon=global_config['horizon'],
+                      input_size=global_config['input_size'],
+                      train_steps=global_config['train_steps'],
+                      val_check_steps=global_config['val_check_steps'],
+                      early_stop_patience_steps=global_config['early_stop_patience_steps'],
+                      is_debug=global_config['is_debug'],
+                      dataset=global_config['dataset'],
+                      is_train=global_config['is_train'],
+                      is_fivemodels=global_config['is_fivemodels'],
+                      val_size=global_config['val_size'],
+                      test_size=global_config['test_size'],
+                      settings=global_config['settings'],
+                      now=now,
+                      etadata=etadata,
+                      modelsindex=global_config['modelsindex'],
+                      data=data,
+                      is_eta=global_config['is_eta'],
+                      end_time=global_config['end_time'],
+                      )

    logger.info('模型训练完成')

--- a/models/nerulforcastmodels.py
+++ b/models/nerulforcastmodels.py
@@ -350,7 +350,10 @@ def ex_Model_Juxiting(df, horizon, input_size, train_steps, val_check_steps, ear
    config.logger.info(df_test.head())

    # 特征重要度
-    X_train = df_train.drop(columns=['y'])
+    X_train = df_train.drop(columns=['y', 'ds'])  # features only: no target, no date column
+    if 'yearmonthweeks' in X_train.columns:
+        X_train = X_train.drop(columns=['yearmonthweeks'])  # chain on X_train so 'y'/'ds' stay dropped
+
    y_train = df_train['y']
    feature_importance(X_train=X_train, y_train=y_train)