# PriceForecast/codeback.py

################################### Report content ##################################
# Plotting logic: group predictions by the true value and drop the highest/lowest predictions (a runnable sketch follows the notes below)
# content.append(Graphs.draw_text('Chart notes:'))
# content.append(Graphs.draw_text('1. Split all model predictions into two groups, one above the true value and one below it; drop the highest prediction and the lowest prediction'))
# content.append(Graphs.draw_text('2. Channel upper bound: the largest remaining prediction in the above-true-value group'))
# content.append(Graphs.draw_text('3. Channel lower bound: the second-smallest prediction in the below-true-value group'))
# content.append(Graphs.draw_text('4. When a prediction has no true value to reference, the upper bound is taken from the model whose predictions sat on the upper bound over the last 20 trading days; the lower bound likewise'))
# content.append(Graphs.draw_text('5. The reported prediction comes from the model that was closest to the true value most often over the last 20 trading days'))
# content.append(Graphs.draw_text('6. A prediction outside the channel means the prediction closest to the true value is not within the confidence band.'))
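# A minimal runnable sketch of the grouping rule above (hypothetical model columns
# m1..m5, not the production data): predictions above and below the true value form
# two groups, the extreme prediction in each group is dropped, and the remaining
# extremes become the channel bounds.
# import pandas as pd
# row = pd.Series({'y': 100.0, 'm1': 95.0, 'm2': 98.0, 'm3': 103.0, 'm4': 107.0, 'm5': 92.0})
# preds = row.drop('y')
# above = sorted(preds[preds > row['y']])  # [103.0, 107.0]
# below = sorted(preds[preds < row['y']])  # [92.0, 95.0, 98.0]
# upper = above[-2] if len(above) > 1 else None  # drop the highest prediction
# lower = below[1] if len(below) > 1 else None   # drop the lowest prediction
# print(lower, upper)  # 95.0 103.0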
# Plotting logic: volatility-based channel (a sketch follows the notes below)
# content.append(Graphs.draw_text('Chart notes:'))
# content.append(Graphs.draw_text('1. Volatility confidence interval: compute the true-price volatility over the last 60 trading days and take the 10% and 90% quantiles as the confidence interval;'))
# content.append(Graphs.draw_text('2. Channel upper bound: among all model predictions, take those <= the previous true price multiplied by (1 + the 90% confidence volatility quantile)'))
# content.append(Graphs.draw_text('3. Channel lower bound: among all model predictions, take those >= the previous true price multiplied by (1 + the 10% confidence volatility quantile)'))
# content.append(Graphs.draw_text('4. When a prediction has no true value to reference, the upper bound is taken from the model whose predictions sat on the upper bound over the last 20 trading days; the lower bound likewise'))
# content.append(Graphs.draw_text('5. The reported prediction comes from the model that was closest to the true value most often over the last 20 trading days'))
# content.append(Graphs.draw_text('6. A prediction outside the channel means the prediction closest to the true value is not within the confidence band.'))
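# A minimal sketch of the volatility channel described above (synthetic prices;
# mirrors the rolling-quantile logic implemented further below): daily returns over
# a 60-day window give the 10%/90% quantiles, which scale the previous close into
# the channel bounds.
# import numpy as np
# import pandas as pd
# prices = pd.Series(100 + np.cumsum(np.random.default_rng(0).normal(0, 1, 120)))
# returns = prices.pct_change()
# q10 = returns.rolling(60).quantile(0.1)
# q90 = returns.rolling(60).quantile(0.9)
# lower = prices.shift(1) * (1 + q10)  # channel lower bound
# upper = prices.shift(1) * (1 + q90)  # channel upper bound
# print(pd.DataFrame({'price': prices, 'lower': lower, 'upper': upper}).tail())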
# # Compute feature correlations
# data.rename(columns={y: 'y'}, inplace=True)
# data['ds'] = pd.to_datetime(data['ds'])
# data.drop(columns=['ds'], inplace=True)
# # Collect the correlation results row by row
# correlation_rows = []
# # Compute each feature's Pearson and Spearman correlation with the target column
# for col in data.columns:
#     if col != 'y':
#         pearson_correlation = np.corrcoef(data[col], data['y'])[0, 1]
#         spearman_correlation, _ = spearmanr(data[col], data['y'])
#         correlation_rows.append({'Feature': col, 'Pearson_Correlation': round(pearson_correlation, 3), 'Spearman_Correlation': round(spearman_correlation, 2)})
# correlation_df = pd.DataFrame(correlation_rows)
# correlation_df.dropna(inplace=True)
# correlation_df.to_csv(os.path.join(dataset, 'feature_correlation_analysis.csv'), index=False)
# data = correlation_df['Pearson_Correlation'].values
# # Generate 20 bins from -1 to 1
# bins = np.linspace(-1, 1, 21)
# # Count how many values fall into each bin
# hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)]
# # Set the figure size
# plt.figure(figsize=(10, 6))
# # Draw the histogram
# plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0]))
# # Add title and axis labels
# plt.title('Pearson correlation coefficient distribution')
# plt.xlabel('Bin')
# plt.ylabel('Count')
# plt.savefig(os.path.join(dataset, 'pearson_correlation.png'))
# plt.close()
# # Set the figure size
# plt.figure(figsize=(10, 6))
# data = correlation_df['Spearman_Correlation'].values
# # Count how many values fall into each bin
# hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)]
# # Draw the histogram
# plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0]))
# # Add title and axis labels
# plt.title('Spearman correlation coefficient distribution')
# plt.xlabel('Bin')
# plt.ylabel('Count')
# plt.savefig(os.path.join(dataset, 'spearman_correlation.png'))
# plt.close()
# content.append(Graphs.draw_text('Feature correlation analysis -- Pearson correlation coefficient:'))
# # Histogram of Pearson positive / uncorrelated / negative correlations
# content.append(Graphs.draw_img(os.path.join(dataset, 'pearson_correlation.png')))
# content.append(Graphs.draw_text('''Pearson correlation notes:'''))
# content.append(Graphs.draw_text('''Measures the linear correlation between two features.'''))
# content.append(Graphs.draw_text('''
# A coefficient of 1 means a perfectly positive linear relationship: when one variable increases, the other increases in a perfectly consistent way.'''))
# content.append(Graphs.draw_text('''The ten most positively correlated features are:'''))
# top10_columns = correlation_df.sort_values(by='Pearson_Correlation', ascending=False).head(10)['Feature'].to_list()
# top10 = ','.join(top10_columns)
# content.append(Graphs.draw_text(f'''{top10}'''))
# feature_df = feature_data_df[['ds', 'y'] + top10_columns]
# # Plot each feature column against y
# for i, col in enumerate(feature_df.columns):
#     print(f'Plotting feature {i+1}: {col} vs. price...')
#     if col not in ['ds', 'y']:
#         fig, ax1 = plt.subplots(figsize=(10, 6))
#         # Plot the price series on the first axis
#         ax1.plot(feature_df['ds'], feature_df['y'], 'b-')
#         ax1.set_xlabel('Date')
#         ax1.set_ylabel('y', color='b')
#         ax1.tick_params('y', colors='b')
#         # Annotate values on ax1, with a small offset so labels do not overlap the curve
#         for j in range(1, len(feature_df), 2):
#             value = feature_df['y'].iloc[j]
#             date = feature_df['ds'].iloc[j]
#             offset = 1.001
#             ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10)
#         # Create a second axis
#         ax2 = ax1.twinx()
#         # Plot the feature series on the second axis
#         line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-')
#         ax2.set_ylabel(col, color='r')
#         ax2.tick_params('y', colors='r')
#         # Annotate values on ax2, with a small offset so labels do not overlap the curve
#         for j in range(0, len(feature_df), 2):
#             value = feature_df[col].iloc[j]
#             date = feature_df['ds'].iloc[j]
#             offset = 1.001
#             ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10)
#         # Add the title
#         plt.title(col)
#         # Use auto-adjusting date ticks on the x-axis
#         locator = mdates.AutoDateLocator()
#         formatter = mdates.AutoDateFormatter(locator)
#         ax1.xaxis.set_major_locator(locator)
#         ax1.xaxis.set_major_formatter(formatter)
#         # Sanitize characters that are invalid in file names
#         col = col.replace('*', '-')
#         col = col.replace(':', '-')
#         plt.savefig(os.path.join(dataset, f'{col}_vs_price.png'))
#         content.append(Graphs.draw_img(os.path.join(dataset, f'{col}_vs_price.png')))
#         plt.close()
# content.append(Graphs.draw_text('Feature correlation analysis -- Spearman correlation coefficient:'))
# # Histogram of Spearman positive / uncorrelated / negative correlations
# content.append(Graphs.draw_img(os.path.join(dataset, 'spearman_correlation.png')))
# content.append(Graphs.draw_text('The Spearman rank correlation coefficient measures the monotonic (not necessarily linear) relationship between two variables.'))
# content.append(Graphs.draw_text('It is computed from the ranks of the values (their sorted positions) rather than the raw values.'))
# content.append(Graphs.draw_text('The Spearman coefficient ranges from -1 to 1.'))
# content.append(Graphs.draw_text('A coefficient of 1 means a perfectly positive monotonic relationship;'))
# content.append(Graphs.draw_text('''The ten features with the strongest positive monotonic relationship are:'''))
# top10_columns = correlation_df.sort_values(by='Spearman_Correlation', ascending=False).head(10)['Feature'].to_list()
# top10 = ','.join(top10_columns)
# content.append(Graphs.draw_text(f'''{top10}'''))
# feature_df = feature_data_df[['ds', 'y'] + top10_columns]
# # Plot each feature column against y
# for i, col in enumerate(feature_df.columns):
#     print(f'Plotting feature {i+1}: {col} vs. price...')
#     if col not in ['ds', 'y']:
#         fig, ax1 = plt.subplots(figsize=(10, 6))
#         # Plot the price series on the first axis
#         ax1.plot(feature_df['ds'], feature_df['y'], 'b-')
#         ax1.set_xlabel('Date')
#         ax1.set_ylabel('y', color='b')
#         ax1.tick_params('y', colors='b')
#         # Annotate values on ax1, with a small offset so labels do not overlap the curve
#         for j in range(1, len(feature_df), 2):
#             value = feature_df['y'].iloc[j]
#             date = feature_df['ds'].iloc[j]
#             offset = 1.001
#             ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10)
#         # Create a second axis
#         ax2 = ax1.twinx()
#         # Plot the feature series on the second axis
#         line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-')
#         ax2.set_ylabel(col, color='r')
#         ax2.tick_params('y', colors='r')
#         # Annotate values on ax2, with a small offset so labels do not overlap the curve
#         for j in range(0, len(feature_df), 2):
#             value = feature_df[col].iloc[j]
#             date = feature_df['ds'].iloc[j]
#             offset = 1.001
#             ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10)
#         # Add the title
#         plt.title(col)
#         # Use auto-adjusting date ticks on the x-axis
#         locator = mdates.AutoDateLocator()
#         formatter = mdates.AutoDateFormatter(locator)
#         ax1.xaxis.set_major_locator(locator)
#         ax1.xaxis.set_major_formatter(formatter)
#         # Sanitize characters that are invalid in file names
#         col = col.replace('*', '-')
#         col = col.replace(':', '-')
#         plt.savefig(os.path.join(dataset, f'{col}_vs_price.png'))
#         content.append(Graphs.draw_img(os.path.join(dataset, f'{col}_vs_price.png')))
#         plt.close()
# content.append(Graphs.draw_text('A coefficient of -1 means a perfectly negative monotonic relationship;'))
# content.append(Graphs.draw_text('''The ten features with the strongest negative monotonic relationship are:'''))
# tail10_columns = correlation_df.sort_values(by='Spearman_Correlation', ascending=True).head(10)['Feature'].to_list()
# tail10 = ','.join(tail10_columns)
# content.append(Graphs.draw_text(f'''{tail10}'''))
# # Get the recent week of values for these features
# feature_df = feature_data_df[['ds', 'y'] + tail10_columns]
# # Plot each feature column against y
# for i, col in enumerate(feature_df.columns):
#     print(f'Plotting feature {i+1}: {col} vs. price...')
#     if col not in ['ds', 'y']:
#         fig, ax1 = plt.subplots(figsize=(10, 6))
#         # Plot the price series on the first axis
#         ax1.plot(feature_df['ds'], feature_df['y'], 'b-')
#         ax1.set_xlabel('Date')
#         ax1.set_ylabel('y', color='b')
#         ax1.tick_params('y', colors='b')
#         # Annotate values on ax1, with a small offset so labels do not overlap the curve
#         for j in range(len(feature_df)):
#             if j % 2 == 1:
#                 value = feature_df['y'].iloc[j]
#                 date = feature_df['ds'].iloc[j]
#                 offset = 1.001
#                 ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10)
#         # Create a second axis
#         ax2 = ax1.twinx()
#         # Plot the feature series on the second axis
#         line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-')
#         ax2.set_ylabel(col, color='r')
#         ax2.tick_params('y', colors='r')
#         # Annotate values on ax2, with a small offset so labels do not overlap the curve
#         for j in range(1, len(feature_df), 2):
#             value = feature_df[col].iloc[j]
#             date = feature_df['ds'].iloc[j]
#             offset = 1.001
#             ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10)
#         # Add the title
#         plt.title(col)
#         # Use auto-adjusting date ticks on the x-axis
#         locator = mdates.AutoDateLocator()
#         formatter = mdates.AutoDateFormatter(locator)
#         ax1.xaxis.set_major_locator(locator)
#         ax1.xaxis.set_major_formatter(formatter)
#         # Sanitize characters that are invalid in file names
#         col = col.replace('*', '-')
#         col = col.replace(':', '-')
#         plt.savefig(os.path.join(dataset, f'{col}_vs_price.png'))
#         content.append(Graphs.draw_img(os.path.join(dataset, f'{col}_vs_price.png')))
#         plt.close()
# content.append(Graphs.draw_text('A coefficient of 0 means no monotonic relationship between the two variables.'))
# content.append(Graphs.draw_text('Compared with the Pearson coefficient, the Spearman coefficient is insensitive to outliers and better suited to nonlinear relationships or data with extreme values.'))
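# A small synthetic illustration of the point above (not report data): on a
# monotonic but nonlinear series, Spearman stays at exactly 1.0 because the ranks
# agree, while Pearson falls well below 1.
# import numpy as np
# from scipy.stats import spearmanr
# x = np.arange(1, 21, dtype=float)
# y_exp = np.exp(x / 4)  # monotonic increasing, strongly nonlinear
# pearson = np.corrcoef(x, y_exp)[0, 1]
# spearman, _ = spearmanr(x, y_exp)
# print(round(pearson, 3), round(spearman, 3))  # pearson < 1, spearman == 1.0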
################################### Prediction vs. actual plotting logic ##################################
# # Use the true value y to pick the bounds, dropping the highest and lowest predictions
# import heapq  # use heaps to find the largest and smallest values
# def find_min_max_within_quantile(row):
#     true_value = row['y']
#     row.drop(['ds', 'y'], inplace=True)
#     row = row.astype(float).round(2)
#     max_heap = []
#     min_heap = []
#     for col in row.index:
#         # Classify each prediction against the true value
#         if row[col] < true_value:
#             heapq.heappush(min_heap, row[col])
#         elif row[col] > true_value:
#             heapq.heappush(max_heap, -row[col])  # negate to simulate a max-heap
#     if len(max_heap) == 1:
#         max_y = max_heap[0]
#     elif len(max_heap) == 0:
#         max_y = -max(min_heap)  # nothing above the true value: fall back to the largest prediction below it
#     else:
#         max_y = heapq.nsmallest(2, max_heap)[1]
#     if len(min_heap) < 2:
#         min_y = -max(max_heap)  # fewer than two predictions below: fall back to the smallest one above
#     else:
#         min_y = heapq.nsmallest(2, min_heap)[-1]
#     # The lower and upper bounds
#     q10 = min_y
#     q90 = -max_y
#     # Names of the models that produced the bounds
#     min_model = row[row == q10].idxmin()
#     max_model = row[row == q90].idxmax()
#     # Scale factor for the bounds
#     rote = 1
#     q10 = q10 * rote
#     q90 = q90 * rote
#     logger.info(f'{min_model} {q10} {max_model} {q90}')
#     return pd.Series([q10, q90, min_model, max_model], index=['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model'])
# # Apply to every row
# df_combined3[['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# df_combined = df_combined.round(4)
# print(df_combined3)
# # Plot using the five best models
# best_models = pd.read_csv(os.path.join(dataset, 'best_modelnames.txt'), header=None).values.flatten().tolist()
# def find_min_max_within_quantile(row):
#     row = row[best_models]
#     q10 = row.min()
#     q90 = row.max()
#     # Names of the models with the row's minimum and maximum values
#     min_model = row[row == q10].idxmin()
#     max_model = row[row == q90].idxmin()
#     # # Check whether the float values are NaN
#     # # if pd.isna(q10) or pd.isna(q90):
#     return pd.Series([q10, q90, min_model, max_model], index=['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model'])
# # Apply to every row
# df_combined3[['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# df_combined = df_combined.round(4)
# print(df_combined3)
# # Channel: use the top 80% of models (ranked by evaluation) as the confidence band
# def find_min_max_within_quantile(row):
#     row.drop(['ds', 'y'], inplace=True)
#     row = row.astype(float).round(2)
#     row_sorted = row
#     # Indices at the 0% and 80% positions (columns assumed pre-ordered by model ranking)
#     index_10 = 0
#     index_90 = int(len(row_sorted) * 0.8)
#     q10 = row_sorted[index_10]
#     q90 = row_sorted[index_90]
#     # Names of the corresponding models
#     min_model = row[row == q10].idxmin()
#     max_model = row[row == q90].idxmin()
#     # # Check whether the float values are NaN
#     # # if pd.isna(q10) or pd.isna(q90):
#     return pd.Series([q10, q90, min_model, max_model], index=['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model'])
# # Reorder the columns
# df_combined3 = df_combined3[['ds', 'y'] + allmodelnames]
# # Apply to every row
# df_combined3[['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# df_combined = df_combined.round(4)
# print(df_combined3)
# # Channel: use the 80% confidence interval of the model predictions
# def find_min_max_within_quantile(row):
#     row.drop(['ds', 'y'], inplace=True)
#     row = row.astype(float).round(2)
#     row_sorted = row.sort_values(ascending=True).reset_index(drop=True)
#     # Indices at the 10% and 90% positions
#     index_10 = int(len(row_sorted) * 0.1)
#     index_90 = int(len(row_sorted) * 0.9)
#     q10 = row_sorted[index_10]
#     q90 = row_sorted[index_90]
#     # Names of the corresponding models
#     min_model = row[row == q10].idxmin()
#     max_model = row[row == q90].idxmin()
#     # # Check whether the float values are NaN
#     # # if pd.isna(q10) or pd.isna(q90):
#     return pd.Series([q10, q90, min_model, max_model], index=['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model'])
# # Apply to every row
# df_combined3[['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# df_combined = df_combined.round(4)
# print(df_combined3)
# # Compute volatility
# df_combined3['volatility'] = df_combined3['y'].pct_change().round(4)
# # 10% and 90% quantiles of volatility over the last 60 trading days
# df_combined3['quantile_10'] = df_combined3['volatility'].rolling(60).quantile(0.1)
# df_combined3['quantile_90'] = df_combined3['volatility'].rolling(60).quantile(0.9)
# df_combined3 = df_combined3.round(4)
# # Prices corresponding to the quantiles
# df_combined3['quantile_10_price'] = df_combined3['y'] * (1 + df_combined3['quantile_10'])
# df_combined3['quantile_90_price'] = df_combined3['y'] * (1 + df_combined3['quantile_90'])
# # Row-wise search for models inside the quantile band
# def find_min_max_within_quantile(row):
#     # The 10% and 90% quantile prices
#     q10 = row['quantile_10_price']
#     q90 = row['quantile_90_price']
#     # Check whether the float values are NaN
#     if pd.isna(q10) or pd.isna(q90):
#         return pd.Series([None, None, None, None], index=['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model'])
#     # Initialize min and max as None
#     min_value = None
#     max_value = None
#     min_value_model = ''
#     max_value_model = ''
#     # Scan the model columns for the min and max values inside the quantile band
#     for model in modelnames:
#         value = row[model]
#         if q10 <= value <= q90:
#             if min_value is None or value < min_value:
#                 min_value = value
#                 min_value_model = model
#             if max_value is None or value > max_value:
#                 max_value = value
#                 max_value_model = model
#     # Return the min and max values and their models
#     return pd.Series([min_value, max_value, min_value_model, max_value_model], index=['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model'])
# # Apply the function to every row
# df_combined3[['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# # Drop rows with NaN
# df_combined3.dropna(inplace=True)
# # Save to the database
# df_combined3.to_sql('testandpredict_groupby', sqlitedb.connection, if_exists='replace', index=False)
# df_combined3.to_csv(os.path.join(dataset, "testandpredict_groupby.csv"), index=False)
# # Drop the model with the largest variance; the remaining models' min/max define the channel bounds
# # Historical data + forecast data
# # Append the future-dated forecasts
# df_predict = loadcsv(os.path.join(dataset, 'predict.csv'))
# df_predict.drop('unique_id', inplace=True, axis=1)
# df_predict.dropna(axis=1, inplace=True)
# df_predict2 = df_predict.copy()
# try:
#     df_predict['ds'] = pd.to_datetime(df_predict['ds'], format=r'%Y-%m-%d')
# except ValueError:
#     df_predict['ds'] = pd.to_datetime(df_predict['ds'], format=r'%Y/%m/%d')
# # Take the first row to store in the database
# first_row = df_predict.head(1).copy()
# first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00')
# # Concatenate the forecasts onto the combined frame
# df_combined3 = pd.concat([df_combined3, df_predict]).reset_index(drop=True)
# # Cast the numeric columns to float
# for col in df_combined3.columns:
#     try:
#         if col != 'ds':
#             df_combined3[col] = df_combined3[col].astype(float)
#             df_combined3[col] = df_combined3[col].round(2)
#     except ValueError:
#         pass
# df_combined3.to_csv(os.path.join(dataset, "testandpredict_groupby.csv"), index=False)
# df_combined3['ds'] = df_combined3['ds'].dt.strftime('%Y-%m-%d 00:00:00')
# # Create the table if it does not exist, otherwise upsert row by row
# if not sqlitedb.check_table_exists('testandpredict_groupby'):
#     df_combined3.to_sql('testandpredict_groupby', sqlitedb.connection, index=False)
# else:
#     for row in df_combined3.itertuples(index=False):
#         row_dict = row._asdict()
#         check_query = sqlitedb.select_data('testandpredict_groupby', where_condition=f"ds = '{row.ds}'")
#         if len(check_query) > 0:
#             set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()])
#             sqlitedb.update_data('testandpredict_groupby', set_clause, where_condition=f"ds = '{row.ds}'")
#             continue
#         sqlitedb.insert_data('testandpredict_groupby', tuple(row_dict.values()), columns=row_dict.keys())
# ten_models = allmodelnames
# # Compute each model's variance
# variances = df_combined3[ten_models].var()
# # Find the model with the largest variance
# max_variance_model = variances.idxmax()
# print("Model with the largest variance:", max_variance_model)
# # Drop that model
# df_combined3 = df_combined3.drop(columns=[max_variance_model])
# if max_variance_model in allmodelnames:
#     allmodelnames.remove(max_variance_model)
# df_combined3['min'] = df_combined3[allmodelnames].min(axis=1)
# df_combined3['max'] = df_combined3[allmodelnames].max(axis=1)
# print(df_combined3[['min', 'max']])
# # Historical prices + forecast prices
# df_combined3 = df_combined3[-50:]  # plot the last 50 data points
# plt.figure(figsize=(20, 10))
# plt.plot(df_combined3['ds'], df_combined3['y'], label='Actual', marker='o')
# plt.plot(df_combined3['ds'], df_combined3[most_model], label=most_model_name)
# plt.fill_between(df_combined3['ds'], df_combined3['min'], df_combined3['max'], alpha=0.2)
# plt.grid(True)
# # Annotate the historical values
# for i, j in zip(df_combined3['ds'][:-5], df_combined3['y'][:-5]):
#     plt.text(i, j, str(j), ha='center', va='bottom')
# # Vertical dashed line at the current date
# plt.axvline(x=df_combined3['ds'].iloc[-horizon], color='r', linestyle='--')
# plt.legend()
# plt.xlabel('Date')
# plt.ylabel('Price')
# plt.savefig(os.path.join(dataset, 'history_price_vs_prediction.png'), bbox_inches='tight')
# plt.close()