聚烯烃月度预测数据处理
This commit is contained in:
parent
b8ca24e07b
commit
8a2cf77639
@ -434,7 +434,7 @@ DEFAULT_CONFIG = {
|
||||
# 开关
|
||||
is_train = True # 是否训练
|
||||
is_debug = False # 是否调试
|
||||
is_eta = True # 是否使用eta接口
|
||||
is_eta = False # 是否使用eta接口
|
||||
is_market = True # 是否通过市场信息平台获取特征 ,在is_eta 为true 的情况下生效
|
||||
is_timefurture = True # 是否使用时间特征
|
||||
is_fivemodels = False # 是否使用之前保存的最佳的5个模型
|
||||
|
@ -460,7 +460,7 @@ DEFAULT_CONFIG = {
|
||||
# 开关
|
||||
is_train = True # 是否训练
|
||||
is_debug = False # 是否调试
|
||||
is_eta = True # 是否使用eta接口
|
||||
is_eta = False # 是否使用eta接口
|
||||
is_market = True # 是否通过市场信息平台获取特征 ,在is_eta 为true 的情况下生效
|
||||
is_timefurture = True # 是否使用时间特征
|
||||
is_fivemodels = False # 是否使用之前保存的最佳的5个模型
|
||||
|
113
lib/dataread.py
113
lib/dataread.py
@ -568,44 +568,66 @@ def feature_importance(X_train, y_train):
|
||||
temp = XGBRegressor()
|
||||
temp.fit(X_train, y_train)
|
||||
|
||||
ax = plot_importance(temp)
|
||||
fig = ax.figure
|
||||
fig.set_size_inches(8, 7)
|
||||
# 获取特征重要性
|
||||
importances = temp.feature_importances_
|
||||
indices = np.argsort(importances)[::-1] # 按重要性降序排列
|
||||
|
||||
# 修改图的标题,添加模型名称
|
||||
title = '特征重要度1' # 替换为你的模型名称
|
||||
ax.set_title(title)
|
||||
# 获取前10个特征的索引和重要性
|
||||
top_indices = indices[:10]
|
||||
top_importances = importances[top_indices]
|
||||
top_features = [X_train.columns[i] for i in top_indices]
|
||||
|
||||
# 计算百分比(相对于所有特征的总和)
|
||||
total_all_importance = sum(importances) # 所有特征的重要性总和
|
||||
percentages = (top_importances / total_all_importance) * 100
|
||||
|
||||
# 绘制特征重要性图
|
||||
plt.figure(figsize=(10, 8))
|
||||
ax = plt.gca()
|
||||
|
||||
# 绘制条形图
|
||||
for i, (importance, percentage) in enumerate(zip(top_importances, percentages)):
|
||||
ax.barh(i, importance, color='skyblue')
|
||||
ax.text(importance + 0.01, i, f'{percentage:.1f}%', va='center')
|
||||
|
||||
# 设置y轴标签和标题
|
||||
ax.set_yticks(range(len(top_features)))
|
||||
ax.set_yticklabels(top_features)
|
||||
ax.set_xlabel('特征重要性')
|
||||
ax.set_title('特征重要性排序(前10位)')
|
||||
|
||||
# 调整布局并显示
|
||||
plt.tight_layout()
|
||||
# plt.show()
|
||||
# 保存图片
|
||||
plt.savefig(os.path.join(config.dataset, '特征重要度1.png'))
|
||||
plt.close()
|
||||
config.logger.info('特征重要度1.png 已保存')
|
||||
|
||||
# 创建一个 LGBMRegressor 对象并训练模型
|
||||
regressor = lgb.LGBMRegressor()
|
||||
regressor.fit(X_train, y_train)
|
||||
# # 创建一个 LGBMRegressor 对象并训练模型
|
||||
# regressor = lgb.LGBMRegressor()
|
||||
# regressor.fit(X_train, y_train)
|
||||
|
||||
# 设置图形大小(可选)
|
||||
plt.figure(figsize=(30, 40))
|
||||
# 使用 plot_importance 函数来绘制特征重要性
|
||||
# 注意:在一些版本的 LightGBM 中,你可以直接传入模型对象
|
||||
ax = importance_plot = lgb.plot_importance(
|
||||
regressor, importance_type='gain') # 或者 'split'
|
||||
# 设置标题和字体大小
|
||||
ax.set_title('Feature Importance - LGBMRegressor', fontsize=12)
|
||||
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
|
||||
ax.get_xticklabels() + ax.get_yticklabels()):
|
||||
item.set_fontsize(9) # 设置 x 轴、y 轴标签以及刻度标签的字体大小
|
||||
# 修改图的标题,添加模型名称
|
||||
# title = 'Feature Importance - LGBMRegressor ' # 替换为你的模型名称
|
||||
# ax.set_title(title)
|
||||
# 保存图片
|
||||
plt.savefig(os.path.join(config.dataset, '特征重要度2.png'))
|
||||
# # 设置图形大小(可选)
|
||||
# plt.figure(figsize=(30, 40))
|
||||
# # 使用 plot_importance 函数来绘制特征重要性
|
||||
# # 注意:在一些版本的 LightGBM 中,你可以直接传入模型对象
|
||||
# ax = importance_plot = lgb.plot_importance(
|
||||
# regressor, importance_type='gain') # 或者 'split'
|
||||
# # 设置标题和字体大小
|
||||
# ax.set_title('Feature Importance - LGBMRegressor', fontsize=12)
|
||||
# for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
|
||||
# ax.get_xticklabels() + ax.get_yticklabels()):
|
||||
# item.set_fontsize(9) # 设置 x 轴、y 轴标签以及刻度标签的字体大小
|
||||
# # 修改图的标题,添加模型名称
|
||||
# # title = 'Feature Importance - LGBMRegressor ' # 替换为你的模型名称
|
||||
# # ax.set_title(title)
|
||||
# # 保存图片
|
||||
# plt.savefig(os.path.join(config.dataset, '特征重要度2.png'))
|
||||
|
||||
# 显示图形
|
||||
plt.close()
|
||||
config.logger.info('特征重要度2.png 已保存')
|
||||
# # 显示图形
|
||||
# plt.close()
|
||||
# config.logger.info('特征重要度2.png 已保存')
|
||||
|
||||
|
||||
def corr_feature(df):
|
||||
@ -807,7 +829,7 @@ def calculate_kdj(data, n=9):
|
||||
|
||||
def calculate_correlation(df):
|
||||
try:
|
||||
yy = df['y']
|
||||
yy = df['y'][-30:]
|
||||
# 去掉ds y
|
||||
df = df.drop(columns=['ds', 'y'])
|
||||
# 计算相关系数
|
||||
@ -1088,25 +1110,26 @@ def datachuli_juxiting(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_t
|
||||
df = df[df['ds'] <= end_time]
|
||||
config.logger.info(f'删除两月不更新特征前数据量:{df.shape}')
|
||||
# 去掉近最后数据对应的日期在两月以前的列,删除近2月的数据是常数的列
|
||||
# current_date = datetime.datetime.now()
|
||||
# two_months_ago = current_date - timedelta(days=40)
|
||||
current_date = datetime.datetime.strptime(
|
||||
global_config['end_time'], '%Y-%m-%d')
|
||||
two_months_ago = current_date - timedelta(days=40)
|
||||
# 检查两月不更新的特征
|
||||
|
||||
# def check_column(col_name):
|
||||
# if 'ds' in col_name or 'y' in col_name:
|
||||
# return False
|
||||
# df_check_column = df[['ds', col_name]]
|
||||
# df_check_column = df_check_column.dropna()
|
||||
# if len(df_check_column) == 0:
|
||||
# return True
|
||||
# if df_check_column[(df_check_column['ds'] >= two_months_ago)].groupby(col_name).ngroups < 2:
|
||||
# return True
|
||||
# corresponding_date = df_check_column.iloc[-1]['ds']
|
||||
# return corresponding_date < two_months_ago
|
||||
# columns_to_drop = df.columns[df.columns.map(check_column)].tolist()
|
||||
# df = df.drop(columns=columns_to_drop)
|
||||
def check_column(col_name):
    """Decide whether feature column ``col_name`` should be dropped as stale.

    Reads ``df`` and ``two_months_ago`` from the enclosing scope.

    Args:
        col_name: Name of a column in ``df``.

    Returns:
        False for the reserved ``'ds'`` and ``'y'`` columns. Otherwise a
        truthy value when any of the following holds:
          * the column has no non-null rows;
          * it has fewer than two distinct values among rows whose ``ds``
            is on or after ``two_months_ago``;
          * the ``ds`` of its last non-null row (in frame order) is before
            ``two_months_ago``.
    """
    # Exact match on the reserved names. The previous substring test
    # ('y' in col_name) also exempted every feature whose name merely
    # contains "y" or "ds", so those features were never checked.
    if col_name in ('ds', 'y'):
        return False

    # Keep only the rows where both the date and the feature are present.
    observed = df[['ds', col_name]].dropna()
    if observed.empty:
        return True

    # A feature that is constant (or absent) inside the recent window
    # carries no fresh information.
    recent = observed[observed['ds'] >= two_months_ago]
    if recent.groupby(col_name).ngroups < 2:
        return True

    # Last non-null observation in frame order.
    # NOTE(review): this presumes df is sorted ascending by 'ds' - confirm.
    last_seen = observed.iloc[-1]['ds']
    return last_seen < two_months_ago
|
||||
columns_to_drop = df.columns[df.columns.map(check_column)].tolist()
|
||||
df = df.drop(columns=columns_to_drop)
|
||||
|
||||
# config.logger.info(f'删除两月不更新特征后数据量:{df.shape}')
|
||||
config.logger.info(f'删除两月不更新特征后数据量:{df.shape}')
|
||||
|
||||
# 衍生时间特征
|
||||
if is_timefurture:
|
||||
|
@ -94,26 +94,32 @@ global_config.update({
|
||||
def push_market_value():
|
||||
logger.info('发送预测结果到市场信息平台')
|
||||
current_end_time = global_config['end_time']
|
||||
previous_trading_day = (pd.Timestamp(current_end_time) -
|
||||
pd.tseries.offsets.BusinessDay(1)).strftime('%Y-%m-%d')
|
||||
previous_trading_day = (pd.Timestamp(current_end_time) -
|
||||
pd.tseries.offsets.BusinessDay(1)).strftime('%Y-%m-%d')
|
||||
|
||||
# 读取预测数据和模型评估数据
|
||||
best_bdwd_price = find_best_models(
|
||||
date=previous_trading_day, global_config=global_config)
|
||||
|
||||
# 获取本月最佳模型的预测价格
|
||||
four_month_predict_price = pd.read_csv(os.path.join(global_config['dataset'], 'predict.csv'))
|
||||
four_month_predict_price['ds'] = pd.to_datetime(four_month_predict_price['ds'])
|
||||
|
||||
# 获取本月最佳模型的预测价格
|
||||
four_month_predict_price = pd.read_csv(
|
||||
os.path.join(global_config['dataset'], 'predict.csv'))
|
||||
four_month_predict_price['ds'] = pd.to_datetime(
|
||||
four_month_predict_price['ds'])
|
||||
# 设置索引 次月 次二月 次三月 次四月
|
||||
index_labels = ["次月", "次二月", "次三月", "次四月"]
|
||||
four_month_predict_price.index = index_labels
|
||||
four_month_predict_price.index = index_labels
|
||||
global_config['logger'].info(f"best_bdwd_price: {best_bdwd_price}")
|
||||
|
||||
# 准备要推送的数据
|
||||
ciyue_mean = four_month_predict_price[best_bdwd_price['next_month_price']['model_name']].iloc[0]
|
||||
cieryue_mean = four_month_predict_price[best_bdwd_price['next_february_price']['model_name']].iloc[1]
|
||||
cisanyue_mean = four_month_predict_price[best_bdwd_price['next_march_price']['model_name']].iloc[2]
|
||||
cisieryue_mean = four_month_predict_price[best_bdwd_price['next_april_price']['model_name']].iloc[3]
|
||||
ciyue_mean = four_month_predict_price[best_bdwd_price['next_month_price']
|
||||
['model_name']].iloc[0]
|
||||
cieryue_mean = four_month_predict_price[best_bdwd_price['next_february_price']
|
||||
['model_name']].iloc[1]
|
||||
cisanyue_mean = four_month_predict_price[best_bdwd_price['next_march_price']
|
||||
['model_name']].iloc[2]
|
||||
cisieryue_mean = four_month_predict_price[best_bdwd_price['next_april_price']
|
||||
['model_name']].iloc[3]
|
||||
# # 保留两位小数
|
||||
ciyue_mean = round(ciyue_mean, 2)
|
||||
cieryue_mean = round(cieryue_mean, 2)
|
||||
@ -331,8 +337,8 @@ def predict_main():
|
||||
else:
|
||||
# 读取数据
|
||||
logger.info('读取本地数据:' + os.path.join(dataset, data_set))
|
||||
df, df_zhibiaoliebiao = getdata_zhoudu_juxiting(filename=os.path.join(dataset, data_set), y=y, dataset=dataset, add_kdj=add_kdj,
|
||||
is_timefurture=is_timefurture, end_time=end_time) # 原始数据,未处理
|
||||
df, df_zhibiaoliebiao = getdata_juxiting(filename=os.path.join(dataset, data_set), y=y, dataset=dataset, add_kdj=add_kdj,
|
||||
is_timefurture=is_timefurture, end_time=end_time) # 原始数据,未处理
|
||||
|
||||
# 更改预测列名称
|
||||
df.rename(columns={y: 'y'}, inplace=True)
|
||||
@ -450,25 +456,25 @@ def predict_main():
|
||||
|
||||
now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
|
||||
ex_Model_Juxiting(df,
|
||||
horizon=global_config['horizon'],
|
||||
input_size=global_config['input_size'],
|
||||
train_steps=global_config['train_steps'],
|
||||
val_check_steps=global_config['val_check_steps'],
|
||||
early_stop_patience_steps=global_config['early_stop_patience_steps'],
|
||||
is_debug=global_config['is_debug'],
|
||||
dataset=global_config['dataset'],
|
||||
is_train=global_config['is_train'],
|
||||
is_fivemodels=global_config['is_fivemodels'],
|
||||
val_size=global_config['val_size'],
|
||||
test_size=global_config['test_size'],
|
||||
settings=global_config['settings'],
|
||||
now=now,
|
||||
etadata=etadata,
|
||||
modelsindex=global_config['modelsindex'],
|
||||
data=data,
|
||||
is_eta=global_config['is_eta'],
|
||||
end_time=global_config['end_time'],
|
||||
)
|
||||
horizon=global_config['horizon'],
|
||||
input_size=global_config['input_size'],
|
||||
train_steps=global_config['train_steps'],
|
||||
val_check_steps=global_config['val_check_steps'],
|
||||
early_stop_patience_steps=global_config['early_stop_patience_steps'],
|
||||
is_debug=global_config['is_debug'],
|
||||
dataset=global_config['dataset'],
|
||||
is_train=global_config['is_train'],
|
||||
is_fivemodels=global_config['is_fivemodels'],
|
||||
val_size=global_config['val_size'],
|
||||
test_size=global_config['test_size'],
|
||||
settings=global_config['settings'],
|
||||
now=now,
|
||||
etadata=etadata,
|
||||
modelsindex=global_config['modelsindex'],
|
||||
data=data,
|
||||
is_eta=global_config['is_eta'],
|
||||
end_time=global_config['end_time'],
|
||||
)
|
||||
|
||||
logger.info('模型训练完成')
|
||||
|
||||
|
@ -350,7 +350,10 @@ def ex_Model_Juxiting(df, horizon, input_size, train_steps, val_check_steps, ear
|
||||
config.logger.info(df_test.head())
|
||||
|
||||
# 特征重要度
|
||||
X_train = df_train.drop(columns=['y'])
|
||||
X_train = df_train.drop(columns=['y', 'ds'])
|
||||
if 'yearmonthweeks' in df_train.columns:
|
||||
X_train = df_train.drop(columns=['yearmonthweeks'])
|
||||
|
||||
y_train = df_train['y']
|
||||
feature_importance(X_train=X_train, y_train=y_train)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user