聚烯烃月度预测数据处理

This commit is contained in:
workpc 2025-08-11 16:16:52 +08:00
parent b8ca24e07b
commit 8a2cf77639
5 changed files with 112 additions and 80 deletions

View File

@ -434,7 +434,7 @@ DEFAULT_CONFIG = {
# 开关
is_train = True # 是否训练
is_debug = False # 是否调试
is_eta = True # 是否使用eta接口
is_eta = False # 是否使用eta接口
is_market = True # 是否通过市场信息平台获取特征,在 is_eta 为 True 的情况下生效
is_timefurture = True # 是否使用时间特征
is_fivemodels = False # 是否使用之前保存的最佳的5个模型

View File

@ -460,7 +460,7 @@ DEFAULT_CONFIG = {
# 开关
is_train = True # 是否训练
is_debug = False # 是否调试
is_eta = True # 是否使用eta接口
is_eta = False # 是否使用eta接口
is_market = True # 是否通过市场信息平台获取特征,在 is_eta 为 True 的情况下生效
is_timefurture = True # 是否使用时间特征
is_fivemodels = False # 是否使用之前保存的最佳的5个模型

View File

@ -568,44 +568,66 @@ def feature_importance(X_train, y_train):
temp = XGBRegressor()
temp.fit(X_train, y_train)
ax = plot_importance(temp)
fig = ax.figure
fig.set_size_inches(8, 7)
# 获取特征重要性
importances = temp.feature_importances_
indices = np.argsort(importances)[::-1] # 按重要性降序排列
# 修改图的标题,添加模型名称
title = '特征重要度1' # 替换为你的模型名称
ax.set_title(title)
# 获取前10个特征的索引和重要性
top_indices = indices[:10]
top_importances = importances[top_indices]
top_features = [X_train.columns[i] for i in top_indices]
# 计算百分比(相对于所有特征的总和)
total_all_importance = sum(importances) # 所有特征的重要性总和
percentages = (top_importances / total_all_importance) * 100
# 绘制特征重要性图
plt.figure(figsize=(10, 8))
ax = plt.gca()
# 绘制条形图
for i, (importance, percentage) in enumerate(zip(top_importances, percentages)):
ax.barh(i, importance, color='skyblue')
ax.text(importance + 0.01, i, f'{percentage:.1f}%', va='center')
# 设置y轴标签和标题
ax.set_yticks(range(len(top_features)))
ax.set_yticklabels(top_features)
ax.set_xlabel('特征重要性')
ax.set_title('特征重要性排序前10位')
# 调整布局并显示
plt.tight_layout()
# plt.show()
# 保存图片
plt.savefig(os.path.join(config.dataset, '特征重要度1.png'))
plt.close()
config.logger.info('特征重要度1.png 已保存')
# 创建一个 LGBMRegressor 对象并训练模型
regressor = lgb.LGBMRegressor()
regressor.fit(X_train, y_train)
# # 创建一个 LGBMRegressor 对象并训练模型
# regressor = lgb.LGBMRegressor()
# regressor.fit(X_train, y_train)
# 设置图形大小(可选)
plt.figure(figsize=(30, 40))
# 使用 plot_importance 函数来绘制特征重要性
# 注意:在一些版本的 LightGBM 中,你可以直接传入模型对象
ax = importance_plot = lgb.plot_importance(
regressor, importance_type='gain') # 或者 'split'
# 设置标题和字体大小
ax.set_title('Feature Importance - LGBMRegressor', fontsize=12)
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
ax.get_xticklabels() + ax.get_yticklabels()):
item.set_fontsize(9) # 设置 x 轴、y 轴标签以及刻度标签的字体大小
# 修改图的标题,添加模型名称
# title = 'Feature Importance - LGBMRegressor ' # 替换为你的模型名称
# ax.set_title(title)
# 保存图片
plt.savefig(os.path.join(config.dataset, '特征重要度2.png'))
# # 设置图形大小(可选)
# plt.figure(figsize=(30, 40))
# # 使用 plot_importance 函数来绘制特征重要性
# # 注意:在一些版本的 LightGBM 中,你可以直接传入模型对象
# ax = importance_plot = lgb.plot_importance(
# regressor, importance_type='gain') # 或者 'split'
# # 设置标题和字体大小
# ax.set_title('Feature Importance - LGBMRegressor', fontsize=12)
# for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
# ax.get_xticklabels() + ax.get_yticklabels()):
# item.set_fontsize(9) # 设置 x 轴、y 轴标签以及刻度标签的字体大小
# # 修改图的标题,添加模型名称
# # title = 'Feature Importance - LGBMRegressor ' # 替换为你的模型名称
# # ax.set_title(title)
# # 保存图片
# plt.savefig(os.path.join(config.dataset, '特征重要度2.png'))
# 显示图形
plt.close()
config.logger.info('特征重要度2.png 已保存')
# # 显示图形
# plt.close()
# config.logger.info('特征重要度2.png 已保存')
def corr_feature(df):
@ -807,7 +829,7 @@ def calculate_kdj(data, n=9):
def calculate_correlation(df):
try:
yy = df['y']
yy = df['y'][-30:]
# 去掉ds y
df = df.drop(columns=['ds', 'y'])
# 计算相关系数
@ -1088,25 +1110,26 @@ def datachuli_juxiting(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_t
df = df[df['ds'] <= end_time]
config.logger.info(f'删除两月不更新特征前数据量:{df.shape}')
# 删除最后一条有效数据的日期早于两个月前的列,以及近两个月数据为常数的列(阈值实际按 40 天计)
# current_date = datetime.datetime.now()
# two_months_ago = current_date - timedelta(days=40)
current_date = datetime.datetime.strptime(
global_config['end_time'], '%Y-%m-%d')
two_months_ago = current_date - timedelta(days=40)
# 检查两月不更新的特征
# def check_column(col_name):
# if 'ds' in col_name or 'y' in col_name:
# return False
# df_check_column = df[['ds', col_name]]
# df_check_column = df_check_column.dropna()
# if len(df_check_column) == 0:
# return True
# if df_check_column[(df_check_column['ds'] >= two_months_ago)].groupby(col_name).ngroups < 2:
# return True
# corresponding_date = df_check_column.iloc[-1]['ds']
# return corresponding_date < two_months_ago
# columns_to_drop = df.columns[df.columns.map(check_column)].tolist()
# df = df.drop(columns=columns_to_drop)
def check_column(col_name):
if 'ds' in col_name or 'y' in col_name:
return False
df_check_column = df[['ds', col_name]]
df_check_column = df_check_column.dropna()
if len(df_check_column) == 0:
return True
if df_check_column[(df_check_column['ds'] >= two_months_ago)].groupby(col_name).ngroups < 2:
return True
corresponding_date = df_check_column.iloc[-1]['ds']
return corresponding_date < two_months_ago
columns_to_drop = df.columns[df.columns.map(check_column)].tolist()
df = df.drop(columns=columns_to_drop)
# config.logger.info(f'删除两月不更新特征后数据量:{df.shape}')
config.logger.info(f'删除两月不更新特征后数据量:{df.shape}')
# 衍生时间特征
if is_timefurture:

View File

@ -94,26 +94,32 @@ global_config.update({
def push_market_value():
logger.info('发送预测结果到市场信息平台')
current_end_time = global_config['end_time']
previous_trading_day = (pd.Timestamp(current_end_time) -
pd.tseries.offsets.BusinessDay(1)).strftime('%Y-%m-%d')
previous_trading_day = (pd.Timestamp(current_end_time) -
pd.tseries.offsets.BusinessDay(1)).strftime('%Y-%m-%d')
# 读取预测数据和模型评估数据
best_bdwd_price = find_best_models(
date=previous_trading_day, global_config=global_config)
# 获取本月最佳模型的预测价格
four_month_predict_price = pd.read_csv(os.path.join(global_config['dataset'], 'predict.csv'))
four_month_predict_price['ds'] = pd.to_datetime(four_month_predict_price['ds'])
# 获取本月最佳模型的预测价格
four_month_predict_price = pd.read_csv(
os.path.join(global_config['dataset'], 'predict.csv'))
four_month_predict_price['ds'] = pd.to_datetime(
four_month_predict_price['ds'])
# 设置索引 次月 次二月 次三月 次四月
index_labels = ["次月", "次二月", "次三月", "次四月"]
four_month_predict_price.index = index_labels
four_month_predict_price.index = index_labels
global_config['logger'].info(f"best_bdwd_price: {best_bdwd_price}")
# 准备要推送的数据
ciyue_mean = four_month_predict_price[best_bdwd_price['next_month_price']['model_name']].iloc[0]
cieryue_mean = four_month_predict_price[best_bdwd_price['next_february_price']['model_name']].iloc[1]
cisanyue_mean = four_month_predict_price[best_bdwd_price['next_march_price']['model_name']].iloc[2]
cisieryue_mean = four_month_predict_price[best_bdwd_price['next_april_price']['model_name']].iloc[3]
ciyue_mean = four_month_predict_price[best_bdwd_price['next_month_price']
['model_name']].iloc[0]
cieryue_mean = four_month_predict_price[best_bdwd_price['next_february_price']
['model_name']].iloc[1]
cisanyue_mean = four_month_predict_price[best_bdwd_price['next_march_price']
['model_name']].iloc[2]
cisieryue_mean = four_month_predict_price[best_bdwd_price['next_april_price']
['model_name']].iloc[3]
# # 保留两位小数
ciyue_mean = round(ciyue_mean, 2)
cieryue_mean = round(cieryue_mean, 2)
@ -331,8 +337,8 @@ def predict_main():
else:
# 读取数据
logger.info('读取本地数据:' + os.path.join(dataset, data_set))
df, df_zhibiaoliebiao = getdata_zhoudu_juxiting(filename=os.path.join(dataset, data_set), y=y, dataset=dataset, add_kdj=add_kdj,
is_timefurture=is_timefurture, end_time=end_time) # 原始数据,未处理
df, df_zhibiaoliebiao = getdata_juxiting(filename=os.path.join(dataset, data_set), y=y, dataset=dataset, add_kdj=add_kdj,
is_timefurture=is_timefurture, end_time=end_time) # 原始数据,未处理
# 更改预测列名称
df.rename(columns={y: 'y'}, inplace=True)
@ -450,25 +456,25 @@ def predict_main():
now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
ex_Model_Juxiting(df,
horizon=global_config['horizon'],
input_size=global_config['input_size'],
train_steps=global_config['train_steps'],
val_check_steps=global_config['val_check_steps'],
early_stop_patience_steps=global_config['early_stop_patience_steps'],
is_debug=global_config['is_debug'],
dataset=global_config['dataset'],
is_train=global_config['is_train'],
is_fivemodels=global_config['is_fivemodels'],
val_size=global_config['val_size'],
test_size=global_config['test_size'],
settings=global_config['settings'],
now=now,
etadata=etadata,
modelsindex=global_config['modelsindex'],
data=data,
is_eta=global_config['is_eta'],
end_time=global_config['end_time'],
)
horizon=global_config['horizon'],
input_size=global_config['input_size'],
train_steps=global_config['train_steps'],
val_check_steps=global_config['val_check_steps'],
early_stop_patience_steps=global_config['early_stop_patience_steps'],
is_debug=global_config['is_debug'],
dataset=global_config['dataset'],
is_train=global_config['is_train'],
is_fivemodels=global_config['is_fivemodels'],
val_size=global_config['val_size'],
test_size=global_config['test_size'],
settings=global_config['settings'],
now=now,
etadata=etadata,
modelsindex=global_config['modelsindex'],
data=data,
is_eta=global_config['is_eta'],
end_time=global_config['end_time'],
)
logger.info('模型训练完成')

View File

@ -350,7 +350,10 @@ def ex_Model_Juxiting(df, horizon, input_size, train_steps, val_check_steps, ear
config.logger.info(df_test.head())
# 特征重要度
X_train = df_train.drop(columns=['y'])
X_train = df_train.drop(columns=['y', 'ds'])
if 'yearmonthweeks' in df_train.columns:
X_train = df_train.drop(columns=['yearmonthweeks'])
y_train = df_train['y']
feature_importance(X_train=X_train, y_train=y_train)