根据模型预测结果,对比真实值分组取出最大最小值

This commit is contained in:
liurui 2024-11-12 14:17:42 +08:00
parent 005c6c97e7
commit 83589bed46
5 changed files with 33348 additions and 480 deletions

View File

@@ -178,7 +178,7 @@ is_update_report = False # 是否上传报告
# 数据截取日期 # 数据截取日期
-end_time = '2024-10-28' # 数据截取日期
+end_time = '2024-10-29' # 数据截取日期
delweekenday = True delweekenday = True
is_corr = False # 特征是否参与滞后领先提升相关系数 is_corr = False # 特征是否参与滞后领先提升相关系数
add_kdj = False # 是否添加kdj指标 add_kdj = False # 是否添加kdj指标

Binary file not shown.

File diff suppressed because it is too large. [Load Diff]

View File

@@ -39,8 +39,8 @@ def predict_main():
edbbusinessurl=edbbusinessurl, edbbusinessurl=edbbusinessurl,
) )
-# df_zhibiaoshuju,df_zhibiaoliebiao = etadata.get_eta_api_yuanyou_data(data_set=data_set,dataset=dataset) # 原始数据,未处理
+df_zhibiaoshuju,df_zhibiaoliebiao = etadata.get_eta_api_yuanyou_data(data_set=data_set,dataset=dataset) # 原始数据,未处理
-df_zhibiaoshuju,df_zhibiaoliebiao = etadata.get_eta_api_pp_data(data_set=data_set,dataset=dataset) # 原始数据,未处理
+# df_zhibiaoshuju,df_zhibiaoliebiao = etadata.get_eta_api_pp_data(data_set=data_set,dataset=dataset) # 原始数据,未处理
# 数据处理 # 数据处理
@@ -75,7 +75,7 @@ def predict_main():
import datetime import datetime
# 判断当前日期是不是周一 # 判断当前日期是不是周一
-is_weekday = datetime.datetime.now().weekday() == 3
+is_weekday = datetime.datetime.now().weekday() == 4
if is_weekday: if is_weekday:
logger.info('今天是周一,更新预测模型') logger.info('今天是周一,更新预测模型')
try: try:

View File

@@ -289,7 +289,7 @@ def model_losss(sqlitedb):
df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1) df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# 去除有空值的行 # 去除有空值的行
-df_combined3.dropna(inplace=True)
+# df_combined3.dropna(inplace=True)
# 保存到数据库 # 保存到数据库
df_combined3.to_sql('testandpredict_groupby', sqlitedb.connection, if_exists='replace', index=False) df_combined3.to_sql('testandpredict_groupby', sqlitedb.connection, if_exists='replace', index=False)
df_combined3.to_csv(os.path.join(dataset,"testandpredict_groupby.csv"),index=False) df_combined3.to_csv(os.path.join(dataset,"testandpredict_groupby.csv"),index=False)
@@ -503,7 +503,7 @@ def model_losss_juxiting(sqlitedb):
# 删除模型生成的cutoff列 # 删除模型生成的cutoff列
df_combined.drop(columns=['cutoff', 'max_cutoff'], inplace=True) df_combined.drop(columns=['cutoff', 'max_cutoff'], inplace=True)
# 获取模型名称 # 获取模型名称
-modelnames = df_combined.columns.to_list()[2:]
+modelnames = df_combined.columns.to_list()[1:]
if 'y' in modelnames: if 'y' in modelnames:
modelnames.remove('y') modelnames.remove('y')
df_combined3 = df_combined.copy() # 备份df_combined,后面画图需要 df_combined3 = df_combined.copy() # 备份df_combined,后面画图需要
@@ -534,26 +534,101 @@ def model_losss_juxiting(sqlitedb):
with open(os.path.join(dataset,"best_modelnames.txt"), 'w') as f: with open(os.path.join(dataset,"best_modelnames.txt"), 'w') as f:
f.write(','.join(modelnames) + '\n') f.write(','.join(modelnames) + '\n')
# 使用最佳五个模型进行绘图
best_models = pd.read_csv(os.path.join(dataset,'best_modelnames.txt'),header=None).values.flatten().tolist() # 根据真实值y确定最大最小值,去掉最高最低的预测值
import heapq # 使用堆来找到最大和最小的值
def find_min_max_within_quantile(row): def find_min_max_within_quantile(row):
row = row[best_models] true_value = row['y']
q10 = row.min() row.drop(['ds','y'], inplace=True)
q90 = row.max() row = row.astype(float).round(2)
# 获取 row行最大最小值模型名称
min_model = row[row == q10].idxmin()
max_model = row[row == q90].idxmin()
# # 判断flot值是否为空值
# if pd.isna(q10) or pd.isna(q90):
return pd.Series([q10, q90,min_model,max_model], index=['min_within_quantile','max_within_quantile','min_model','max_model'])
# 遍历行 max_heap = []
min_heap = []
for col in row.index:
# 对比真实值进行分类
if row[col] < true_value:
heapq.heappush(min_heap, row[col])
elif row[col] > true_value:
heapq.heappush(max_heap, -row[col]) # 使用负号来实现最大堆
if len(max_heap) == 1:
max_y = max_heap[0]
elif len(max_heap) == 0:
max_y = -min_heap[-1]
else:
max_y = heapq.nsmallest(2, max_heap)[1]
if len(min_heap) < 2 :
min_y = -max_heap[-1]
else:
min_y = heapq.nsmallest(2, min_heap)[-1]
# 获取最大和最小的值
q10 = min_y
q90 = -max_y
# 获取最大和最小的模型名称
min_model = row[row == q10].idxmin()
max_model = row[row == q90].idxmax()
# 设置上下界比例
q10 = q10 * 0.99
q90 = q90 * 0.99
logger.info(min_model,q10,max_model,q90)
return pd.Series([q10, q90, min_model, max_model], index=['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model'])
# # 遍历行
df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1) df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
df_combined = df_combined.round(4) df_combined = df_combined.round(4)
print(df_combined3) print(df_combined3)
# 使用最佳五个模型进行绘图
# best_models = pd.read_csv(os.path.join(dataset,'best_modelnames.txt'),header=None).values.flatten().tolist()
# def find_min_max_within_quantile(row):
# row = row[best_models]
# q10 = row.min()
# q90 = row.max()
# # 获取 row行最大最小值模型名称
# min_model = row[row == q10].idxmin()
# max_model = row[row == q90].idxmin()
# # # 判断flot值是否为空值
# # if pd.isna(q10) or pd.isna(q90):
# return pd.Series([q10, q90,min_model,max_model], index=['min_within_quantile','max_within_quantile','min_model','max_model'])
# # 遍历行
# df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# df_combined = df_combined.round(4)
# print(df_combined3)
# # 通道使用模型评估前80%作为置信度
# def find_min_max_within_quantile(row):
# row.drop(['ds','y'], inplace=True)
# row = row.astype(float).round(2)
# row_sorted = row
# # 计算 10% 和 90% 位置的索引
# index_10 = 0
# index_90 = int(len(row_sorted) * 0.8)
# q10 = row_sorted[index_10]
# q90 = row_sorted[index_90]
# # 获取模型名称
# min_model = row[row == q10].idxmin()
# max_model = row[row == q90].idxmin()
# # # 判断flot值是否为空值
# # if pd.isna(q10) or pd.isna(q90):
# return pd.Series([q10, q90,min_model,max_model], index=['min_within_quantile','max_within_quantile','min_model','max_model'])
# # 重新排列
# df_combined3 = df_combined3[['ds','y'] + allmodelnames]
# # 遍历行
# df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# df_combined = df_combined.round(4)
# print(df_combined3)
# # 通道使用预测模型的80%置信度 # # 通道使用预测模型的80%置信度
@@ -641,7 +716,7 @@ def model_losss_juxiting(sqlitedb):
plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(15, 10)) plt.figure(figsize=(15, 10))
# 设置有5个子图的画布 # 设置有5个子图的画布
-for n,model in enumerate(modelnames):
+for n,model in enumerate(modelnames[:5]):
plt.subplot(3, 2, n+1) plt.subplot(3, 2, n+1)
plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值') plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值')
plt.plot(df_combined3['ds'], df_combined3[model], label=model) plt.plot(df_combined3['ds'], df_combined3[model], label=model)
@@ -671,11 +746,11 @@ def model_losss_juxiting(sqlitedb):
if not sqlitedb.check_table_exists('trueandpredict'): if not sqlitedb.check_table_exists('trueandpredict'):
first_row.to_sql('trueandpredict',sqlitedb.connection,index=False) first_row.to_sql('trueandpredict',sqlitedb.connection,index=False)
else: else:
for col in first_row.columns:
sqlitedb.add_column_if_not_exists('trueandpredict',col,'TEXT')
for row in first_row.itertuples(index=False): for row in first_row.itertuples(index=False):
row_dict = row._asdict() row_dict = row._asdict()
columns=row_dict.keys() columns=row_dict.keys()
for col in columns:
sqlitedb.add_column_if_not_exists('trueandpredict',col,'TEXT')
check_query = sqlitedb.select_data('trueandpredict',where_condition = f"ds = '{row.ds}'") check_query = sqlitedb.select_data('trueandpredict',where_condition = f"ds = '{row.ds}'")
if len(check_query) > 0: if len(check_query) > 0:
set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()]) set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()])
@@ -688,6 +763,10 @@ def model_losss_juxiting(sqlitedb):
# 最多频率的模型名称 # 最多频率的模型名称
min_model_max_frequency_model = df_combined3['min_model'].tail(20).value_counts().idxmax() min_model_max_frequency_model = df_combined3['min_model'].tail(20).value_counts().idxmax()
max_model_max_frequency_model = df_combined3['max_model'].tail(20).value_counts().idxmax() max_model_max_frequency_model = df_combined3['max_model'].tail(20).value_counts().idxmax()
if min_model_max_frequency_model == max_model_max_frequency_model:
# 取20天第二多的模型
max_model_max_frequency_model = df_combined3['max_model'].tail(20).value_counts().nlargest(2).index[1]
df_predict['min_model'] = min_model_max_frequency_model df_predict['min_model'] = min_model_max_frequency_model
df_predict['max_model'] = max_model_max_frequency_model df_predict['max_model'] = max_model_max_frequency_model
df_predict['min_within_quantile'] = df_predict[min_model_max_frequency_model] df_predict['min_within_quantile'] = df_predict[min_model_max_frequency_model]