聚烯烃月度预测数据处理
This commit is contained in:
parent
b8ca24e07b
commit
8a2cf77639
@ -434,7 +434,7 @@ DEFAULT_CONFIG = {
|
|||||||
# 开关
|
# 开关
|
||||||
is_train = True # 是否训练
|
is_train = True # 是否训练
|
||||||
is_debug = False # 是否调试
|
is_debug = False # 是否调试
|
||||||
is_eta = True # 是否使用eta接口
|
is_eta = False # 是否使用eta接口
|
||||||
is_market = True # 是否通过市场信息平台获取特征 ,在is_eta 为true 的情况下生效
|
is_market = True # 是否通过市场信息平台获取特征 ,在is_eta 为true 的情况下生效
|
||||||
is_timefurture = True # 是否使用时间特征
|
is_timefurture = True # 是否使用时间特征
|
||||||
is_fivemodels = False # 是否使用之前保存的最佳的5个模型
|
is_fivemodels = False # 是否使用之前保存的最佳的5个模型
|
||||||
|
@ -460,7 +460,7 @@ DEFAULT_CONFIG = {
|
|||||||
# 开关
|
# 开关
|
||||||
is_train = True # 是否训练
|
is_train = True # 是否训练
|
||||||
is_debug = False # 是否调试
|
is_debug = False # 是否调试
|
||||||
is_eta = True # 是否使用eta接口
|
is_eta = False # 是否使用eta接口
|
||||||
is_market = True # 是否通过市场信息平台获取特征 ,在is_eta 为true 的情况下生效
|
is_market = True # 是否通过市场信息平台获取特征 ,在is_eta 为true 的情况下生效
|
||||||
is_timefurture = True # 是否使用时间特征
|
is_timefurture = True # 是否使用时间特征
|
||||||
is_fivemodels = False # 是否使用之前保存的最佳的5个模型
|
is_fivemodels = False # 是否使用之前保存的最佳的5个模型
|
||||||
|
113
lib/dataread.py
113
lib/dataread.py
@ -568,44 +568,66 @@ def feature_importance(X_train, y_train):
|
|||||||
temp = XGBRegressor()
|
temp = XGBRegressor()
|
||||||
temp.fit(X_train, y_train)
|
temp.fit(X_train, y_train)
|
||||||
|
|
||||||
ax = plot_importance(temp)
|
# 获取特征重要性
|
||||||
fig = ax.figure
|
importances = temp.feature_importances_
|
||||||
fig.set_size_inches(8, 7)
|
indices = np.argsort(importances)[::-1] # 按重要性降序排列
|
||||||
|
|
||||||
# 修改图的标题,添加模型名称
|
# 获取前10个特征的索引和重要性
|
||||||
title = '特征重要度1' # 替换为你的模型名称
|
top_indices = indices[:10]
|
||||||
ax.set_title(title)
|
top_importances = importances[top_indices]
|
||||||
|
top_features = [X_train.columns[i] for i in top_indices]
|
||||||
|
|
||||||
|
# 计算百分比(相对于所有特征的总和)
|
||||||
|
total_all_importance = sum(importances) # 所有特征的重要性总和
|
||||||
|
percentages = (top_importances / total_all_importance) * 100
|
||||||
|
|
||||||
|
# 绘制特征重要性图
|
||||||
|
plt.figure(figsize=(10, 8))
|
||||||
|
ax = plt.gca()
|
||||||
|
|
||||||
|
# 绘制条形图
|
||||||
|
for i, (importance, percentage) in enumerate(zip(top_importances, percentages)):
|
||||||
|
ax.barh(i, importance, color='skyblue')
|
||||||
|
ax.text(importance + 0.01, i, f'{percentage:.1f}%', va='center')
|
||||||
|
|
||||||
|
# 设置y轴标签和标题
|
||||||
|
ax.set_yticks(range(len(top_features)))
|
||||||
|
ax.set_yticklabels(top_features)
|
||||||
|
ax.set_xlabel('特征重要性')
|
||||||
|
ax.set_title('特征重要性排序(前10位)')
|
||||||
|
|
||||||
|
# 调整布局并显示
|
||||||
|
plt.tight_layout()
|
||||||
# plt.show()
|
# plt.show()
|
||||||
# 保存图片
|
# 保存图片
|
||||||
plt.savefig(os.path.join(config.dataset, '特征重要度1.png'))
|
plt.savefig(os.path.join(config.dataset, '特征重要度1.png'))
|
||||||
plt.close()
|
plt.close()
|
||||||
config.logger.info('特征重要度1.png 已保存')
|
config.logger.info('特征重要度1.png 已保存')
|
||||||
|
|
||||||
# 创建一个 LGBMRegressor 对象并训练模型
|
# # 创建一个 LGBMRegressor 对象并训练模型
|
||||||
regressor = lgb.LGBMRegressor()
|
# regressor = lgb.LGBMRegressor()
|
||||||
regressor.fit(X_train, y_train)
|
# regressor.fit(X_train, y_train)
|
||||||
|
|
||||||
# 设置图形大小(可选)
|
# # 设置图形大小(可选)
|
||||||
plt.figure(figsize=(30, 40))
|
# plt.figure(figsize=(30, 40))
|
||||||
# 使用 plot_importance 函数来绘制特征重要性
|
# # 使用 plot_importance 函数来绘制特征重要性
|
||||||
# 注意:在一些版本的 LightGBM 中,你可以直接传入模型对象
|
# # 注意:在一些版本的 LightGBM 中,你可以直接传入模型对象
|
||||||
ax = importance_plot = lgb.plot_importance(
|
# ax = importance_plot = lgb.plot_importance(
|
||||||
regressor, importance_type='gain') # 或者 'split'
|
# regressor, importance_type='gain') # 或者 'split'
|
||||||
# 设置标题和字体大小
|
# # 设置标题和字体大小
|
||||||
ax.set_title('Feature Importance - LGBMRegressor', fontsize=12)
|
# ax.set_title('Feature Importance - LGBMRegressor', fontsize=12)
|
||||||
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
|
# for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
|
||||||
ax.get_xticklabels() + ax.get_yticklabels()):
|
# ax.get_xticklabels() + ax.get_yticklabels()):
|
||||||
item.set_fontsize(9) # 设置 x 轴、y 轴标签以及刻度标签的字体大小
|
# item.set_fontsize(9) # 设置 x 轴、y 轴标签以及刻度标签的字体大小
|
||||||
# 修改图的标题,添加模型名称
|
# # 修改图的标题,添加模型名称
|
||||||
# title = 'Feature Importance - LGBMRegressor ' # 替换为你的模型名称
|
# # title = 'Feature Importance - LGBMRegressor ' # 替换为你的模型名称
|
||||||
# ax.set_title(title)
|
# # ax.set_title(title)
|
||||||
# 保存图片
|
# # 保存图片
|
||||||
plt.savefig(os.path.join(config.dataset, '特征重要度2.png'))
|
# plt.savefig(os.path.join(config.dataset, '特征重要度2.png'))
|
||||||
|
|
||||||
# 显示图形
|
# # 显示图形
|
||||||
plt.close()
|
# plt.close()
|
||||||
config.logger.info('特征重要度2.png 已保存')
|
# config.logger.info('特征重要度2.png 已保存')
|
||||||
|
|
||||||
|
|
||||||
def corr_feature(df):
|
def corr_feature(df):
|
||||||
@ -807,7 +829,7 @@ def calculate_kdj(data, n=9):
|
|||||||
|
|
||||||
def calculate_correlation(df):
|
def calculate_correlation(df):
|
||||||
try:
|
try:
|
||||||
yy = df['y']
|
yy = df['y'][-30:]
|
||||||
# 去掉ds y
|
# 去掉ds y
|
||||||
df = df.drop(columns=['ds', 'y'])
|
df = df.drop(columns=['ds', 'y'])
|
||||||
# 计算相关系数
|
# 计算相关系数
|
||||||
@ -1088,25 +1110,26 @@ def datachuli_juxiting(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_t
|
|||||||
df = df[df['ds'] <= end_time]
|
df = df[df['ds'] <= end_time]
|
||||||
config.logger.info(f'删除两月不更新特征前数据量:{df.shape}')
|
config.logger.info(f'删除两月不更新特征前数据量:{df.shape}')
|
||||||
# 去掉近最后数据对应的日期在两月以前的列,删除近2月的数据是常数的列
|
# 去掉近最后数据对应的日期在两月以前的列,删除近2月的数据是常数的列
|
||||||
# current_date = datetime.datetime.now()
|
current_date = datetime.datetime.strptime(
|
||||||
# two_months_ago = current_date - timedelta(days=40)
|
global_config['end_time'], '%Y-%m-%d')
|
||||||
|
two_months_ago = current_date - timedelta(days=40)
|
||||||
# 检查两月不更新的特征
|
# 检查两月不更新的特征
|
||||||
|
|
||||||
# def check_column(col_name):
|
def check_column(col_name):
|
||||||
# if 'ds' in col_name or 'y' in col_name:
|
if 'ds' in col_name or 'y' in col_name:
|
||||||
# return False
|
return False
|
||||||
# df_check_column = df[['ds', col_name]]
|
df_check_column = df[['ds', col_name]]
|
||||||
# df_check_column = df_check_column.dropna()
|
df_check_column = df_check_column.dropna()
|
||||||
# if len(df_check_column) == 0:
|
if len(df_check_column) == 0:
|
||||||
# return True
|
return True
|
||||||
# if df_check_column[(df_check_column['ds'] >= two_months_ago)].groupby(col_name).ngroups < 2:
|
if df_check_column[(df_check_column['ds'] >= two_months_ago)].groupby(col_name).ngroups < 2:
|
||||||
# return True
|
return True
|
||||||
# corresponding_date = df_check_column.iloc[-1]['ds']
|
corresponding_date = df_check_column.iloc[-1]['ds']
|
||||||
# return corresponding_date < two_months_ago
|
return corresponding_date < two_months_ago
|
||||||
# columns_to_drop = df.columns[df.columns.map(check_column)].tolist()
|
columns_to_drop = df.columns[df.columns.map(check_column)].tolist()
|
||||||
# df = df.drop(columns=columns_to_drop)
|
df = df.drop(columns=columns_to_drop)
|
||||||
|
|
||||||
# config.logger.info(f'删除两月不更新特征后数据量:{df.shape}')
|
config.logger.info(f'删除两月不更新特征后数据量:{df.shape}')
|
||||||
|
|
||||||
# 衍生时间特征
|
# 衍生时间特征
|
||||||
if is_timefurture:
|
if is_timefurture:
|
||||||
|
@ -102,18 +102,24 @@ def push_market_value():
|
|||||||
date=previous_trading_day, global_config=global_config)
|
date=previous_trading_day, global_config=global_config)
|
||||||
|
|
||||||
# 获取本月最佳模型的预测价格
|
# 获取本月最佳模型的预测价格
|
||||||
four_month_predict_price = pd.read_csv(os.path.join(global_config['dataset'], 'predict.csv'))
|
four_month_predict_price = pd.read_csv(
|
||||||
four_month_predict_price['ds'] = pd.to_datetime(four_month_predict_price['ds'])
|
os.path.join(global_config['dataset'], 'predict.csv'))
|
||||||
|
four_month_predict_price['ds'] = pd.to_datetime(
|
||||||
|
four_month_predict_price['ds'])
|
||||||
# 设置索引 次月 次二月 次三月 次四月
|
# 设置索引 次月 次二月 次三月 次四月
|
||||||
index_labels = ["次月", "次二月", "次三月", "次四月"]
|
index_labels = ["次月", "次二月", "次三月", "次四月"]
|
||||||
four_month_predict_price.index = index_labels
|
four_month_predict_price.index = index_labels
|
||||||
global_config['logger'].info(f"best_bdwd_price: {best_bdwd_price}")
|
global_config['logger'].info(f"best_bdwd_price: {best_bdwd_price}")
|
||||||
|
|
||||||
# 准备要推送的数据
|
# 准备要推送的数据
|
||||||
ciyue_mean = four_month_predict_price[best_bdwd_price['next_month_price']['model_name']].iloc[0]
|
ciyue_mean = four_month_predict_price[best_bdwd_price['next_month_price']
|
||||||
cieryue_mean = four_month_predict_price[best_bdwd_price['next_february_price']['model_name']].iloc[1]
|
['model_name']].iloc[0]
|
||||||
cisanyue_mean = four_month_predict_price[best_bdwd_price['next_march_price']['model_name']].iloc[2]
|
cieryue_mean = four_month_predict_price[best_bdwd_price['next_february_price']
|
||||||
cisieryue_mean = four_month_predict_price[best_bdwd_price['next_april_price']['model_name']].iloc[3]
|
['model_name']].iloc[1]
|
||||||
|
cisanyue_mean = four_month_predict_price[best_bdwd_price['next_march_price']
|
||||||
|
['model_name']].iloc[2]
|
||||||
|
cisieryue_mean = four_month_predict_price[best_bdwd_price['next_april_price']
|
||||||
|
['model_name']].iloc[3]
|
||||||
# # 保留两位小数
|
# # 保留两位小数
|
||||||
ciyue_mean = round(ciyue_mean, 2)
|
ciyue_mean = round(ciyue_mean, 2)
|
||||||
cieryue_mean = round(cieryue_mean, 2)
|
cieryue_mean = round(cieryue_mean, 2)
|
||||||
@ -331,7 +337,7 @@ def predict_main():
|
|||||||
else:
|
else:
|
||||||
# 读取数据
|
# 读取数据
|
||||||
logger.info('读取本地数据:' + os.path.join(dataset, data_set))
|
logger.info('读取本地数据:' + os.path.join(dataset, data_set))
|
||||||
df, df_zhibiaoliebiao = getdata_zhoudu_juxiting(filename=os.path.join(dataset, data_set), y=y, dataset=dataset, add_kdj=add_kdj,
|
df, df_zhibiaoliebiao = getdata_juxiting(filename=os.path.join(dataset, data_set), y=y, dataset=dataset, add_kdj=add_kdj,
|
||||||
is_timefurture=is_timefurture, end_time=end_time) # 原始数据,未处理
|
is_timefurture=is_timefurture, end_time=end_time) # 原始数据,未处理
|
||||||
|
|
||||||
# 更改预测列名称
|
# 更改预测列名称
|
||||||
|
@ -350,7 +350,10 @@ def ex_Model_Juxiting(df, horizon, input_size, train_steps, val_check_steps, ear
|
|||||||
config.logger.info(df_test.head())
|
config.logger.info(df_test.head())
|
||||||
|
|
||||||
# 特征重要度
|
# 特征重要度
|
||||||
X_train = df_train.drop(columns=['y'])
|
X_train = df_train.drop(columns=['y', 'ds'])
|
||||||
|
if 'yearmonthweeks' in df_train.columns:
|
||||||
|
X_train = df_train.drop(columns=['yearmonthweeks'])
|
||||||
|
|
||||||
y_train = df_train['y']
|
y_train = df_train['y']
|
||||||
feature_importance(X_train=X_train, y_train=y_train)
|
feature_importance(X_train=X_train, y_train=y_train)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user