PriceForecast/models/nerulforcastmodels.py
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
from lib.tools import Graphs,mse,rmse,mae
from lib.dataread import *
from neuralforecast import NeuralForecast
from neuralforecast.models import NHITS, Informer, NBEATSx, LSTM, PatchTST, iTransformer, TSMixer
from neuralforecast.models import RNN, GRU, TCN, DeepAR, DilatedRNN, MLP, NBEATS, DLinear, NLinear, TFT, VanillaTransformer
from neuralforecast.models import Autoformer, FEDformer, StemGNN, HINT, TSMixerx, MLPMultivariate, BiTCN, TiDE, DeepNPTS
from tensorflow.keras.losses import MAE
from scipy.stats import spearmanr
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn import metrics
from lib.duojinchengpredict import testSetPredict
from reportlab.platypus import Table, SimpleDocTemplate, Paragraph, Image # report flowables
from reportlab.lib.pagesizes import letter # page size (8.5*inch, 11*inch)
from reportlab.pdfbase import pdfmetrics # font registration
from reportlab.pdfbase.ttfonts import TTFont # TrueType font class
from reportlab.lib.styles import getSampleStyleSheet # text styles
from reportlab.lib import colors # color module
from reportlab.graphics.charts.barcharts import VerticalBarChart # chart class
from reportlab.graphics.charts.legends import Legend # legend class
from reportlab.graphics.shapes import Drawing # drawing canvas
from reportlab.lib.units import cm # unit: cm
# Register the font (prepare the font file in advance; register several fonts if one file needs more than one)
pdfmetrics.registerFont(TTFont('SimSun', 'SimSun.ttf'))
def ex_Model(df,horizon,input_size,train_steps,val_check_steps,early_stop_patience_steps,
is_debug,dataset,is_train,is_fivemodels,val_size,test_size,settings,now,
etadata,modelsindex,data,is_eta):
'''
Model training and prediction.
:param df: input dataset
horizon                    # forecast horizon (steps ahead)
input_size                 # length of the input sequence
train_steps                # training steps, used to cap the number of epochs
val_check_steps            # validation (evaluation) frequency
early_stop_patience_steps  # early-stopping patience, in validation checks
:return: prediction results
'''
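# NeuralForecast expects long-format input: one row per (unique_id, ds) with a 'y' target column and any
# exogenous feature columns alongside; the renaming and unique_id assignments below shape df accordingly.
# Sketch of the expected layout (assuming a single series with id 1):
#     unique_id          ds        y   <feature columns ...>
#             1  2024-01-02  <float>   ...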
# 模型预测列表列名
# columns2 = [
# 'NHITS',
# 'Informer',
# 'LSTM',
# 'iTransformer',
# 'TSMixer',
# 'TSMixerx',
# 'PatchTST',
# 'RNN',
# 'GRU',
# 'TCN',
# # 'DeepAR',
# 'DeepAR-median',
# 'DeepAR-lo-90',
# 'DeepAR-lo-80',
# 'DeepAR-hi-80',
# 'DeepAR-hi-90',
# 'BiTCN',
# 'DilatedRNN',
# 'MLP',
# 'DLinear',
# 'NLinear',
# 'TFT',
# 'FEDformer',
# 'StemGNN',
# 'MLPMultivariate',
# 'TiDE',
# 'DeepNPT',
# ]
df= df.replace(',', '', regex=True)
df = df.rename(columns={'date': 'ds'})
df['y'] = pd.to_numeric(df['y'], errors='coerce')
df['ds'] = pd.to_datetime(df['ds'], errors='coerce') # 使用errors='coerce'来处理无效日期
# Cast integer columns of df to float32
for col in df.select_dtypes(include=['int']).columns:
df[col] = df[col].astype(np.float32)
# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
# 不筛选特征用下面的
df_reg = df
df_reg.sort_values('ds', inplace=True)
if is_debug:
df_reg = df_reg[-1000:-1]
# Compute the end index of the training set: the first 80% of the data
split_index = int(0.8 * len(df_reg))
# Split into training and test sets in time order
# (note: df_test takes the last split_index rows, i.e. the last 80%, so it overlaps df_train)
df_train = df_reg[:split_index]
df_test = df_reg[-split_index:]
df_train['unique_id'] = 1
df_test['unique_id'] = 1
# 显示划分后的数据集的前几行
logger.info("Training set head:")
logger.info(df_train.head())
logger.info("\nTesting set head:")
logger.info(df_test.head())
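# All models below share the same core hyperparameters:
#   h=horizon (forecast steps), input_size (lookback window), max_steps=train_steps (training iterations),
#   val_check_steps (validation frequency), early_stop_patience_steps (early stopping),
#   scaler_type='standard' (per-series standardization where supported).
# Multivariate models (iTransformer, TSMixer, TSMixerx, StemGNN, MLPMultivariate) additionally take n_series=1.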
models = [
NHITS (h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', activation='ReLU', early_stop_patience_steps=early_stop_patience_steps),
Informer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps ),
LSTM(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
iTransformer(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
TSMixer(h=horizon, input_size=input_size, n_series = 1, max_steps=train_steps, early_stop_patience_steps=early_stop_patience_steps),
TSMixerx(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, early_stop_patience_steps=early_stop_patience_steps),
PatchTST(h=horizon, input_size=input_size, max_steps=train_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
RNN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
GRU(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
TCN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
# DeepAR(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
BiTCN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
DilatedRNN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
MLP(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
DLinear(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
NLinear(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
TFT(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
FEDformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
StemGNN(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
MLPMultivariate(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
TiDE(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
DeepNPTS(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps),
# VanillaTransformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', ),  # raised an error
# Autoformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', ),  # raised an error
# NBEATS(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', ),
# NBEATSx (h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard',activation='ReLU', ),  # raised an error
# HINT(h=horizon),
]
if is_fivemodels:
# 获取之前存好的最好的五个模型
with open(os.path.join(dataset,'best_modelnames.txt'), 'r',encoding='utf-8') as f:
best_modelnames = f.readlines()[0]
logger.info(f'获取本地最佳模型名称:{best_modelnames}')
# 重新拼接models
all_models = models
models = []
for model in all_models:
if model._get_name() in best_modelnames:
models.append(model)
# 创建NeuralForecast实例并训练模型
nf = NeuralForecast(models=models, freq="B")
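# freq="B" declares a business-day frequency (pandas offset alias), so generated forecast dates skip weekends.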
from joblib import dump, load
if is_train:
# 模型交叉验证
nf_preds = nf.cross_validation(df=df_train, val_size=val_size, test_size=test_size, n_windows=None)
nf_preds.to_csv(os.path.join(dataset,"cross_validation.csv"),index=False)
nf_preds = nf_preds.reset_index()
# 保存模型
# 生成文件名,按时间 精确到分
filename = f'{settings}--{now}.joblib'
#文件名去掉冒号
filename = filename.replace(':', '-') # 替换冒号
# dump(nf, os.path.join(dataset,filename))
else:
# glob获取dataset下最新的joblib文件
import glob
filename = max(glob.glob(os.path.join(dataset,'*.joblib')), key=os.path.getctime)
# logger.info('读取模型:'+ filename)
nf = load(filename)
# # 测试集预测
nf_test_preds = nf.cross_validation(df=df_test, val_size=val_size, test_size=test_size, n_windows=None)
# 测试集预测结果保存
nf_test_preds.to_csv(os.path.join(dataset,"cross_validation.csv"),index=False)
df_test['ds'] = pd.to_datetime(df_test['ds'], errors='coerce')
#进行未来时间预测
df_predict = nf.predict(df_test).reset_index()
# astype returns a new DataFrame, so assign the result back (the original call discarded it)
df_predict = df_predict.astype({col: 'float32' for col in df_predict.columns if col not in ['ds'] })
# 保存预测值
df_predict.to_csv(os.path.join(dataset,"predict.csv"),index=False)
# Push the predictions to the ETA platform
# Note: this checks the module-level is_update_eta flag (presumably brought in by the star import from
# lib.dataread), not the is_eta parameter of this function.
if is_update_eta:
dates = df_predict['ds'].dt.strftime('%Y-%m-%d')
for m in modelsindex.keys():
data_list = []  # renamed from `list` to avoid shadowing the built-in
for date,value in zip(dates,df_predict[m].round(2)):
data_list.append({'Date':date,'Value':value})
data['DataList'] = data_list
data['IndexCode'] = modelsindex[m]
data['IndexName'] = f'价格预测{m}模型'
data['Remark'] = m
etadata.push_data(data)
return nf_test_preds
# 计算预测评估指数
def model_losss(sqlitedb):
global dataset
# 预测数据处理 predict
df_combined = loadcsv(os.path.join(dataset,"cross_validation.csv"))
df_combined = dateConvert(df_combined)
# 删除空列
df_combined.dropna(axis=1,inplace=True)
# 删除缺失值,预测过程不能有缺失值
df_combined.dropna(inplace=True)
# 其他列转为数值类型
df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in ['cutoff','ds'] })
# For each ds group, take the maximum cutoff and store it in a new column
df_combined['max_cutoff'] = df_combined.groupby('ds')['cutoff'].transform('max')
# Keep only the rows whose cutoff equals max_cutoff, i.e. the most recent forecast for each date
df_combined = df_combined[df_combined['cutoff'] == df_combined['max_cutoff']]
# 删除模型生成的cutoff列
df_combined.drop(columns=['cutoff', 'max_cutoff'], inplace=True)
# 获取模型名称
modelnames = df_combined.columns.to_list()[2:]
if 'y' in modelnames:
modelnames.remove('y')
df_combined3 = df_combined.copy() # 备份df_combined,后面画图需要
# 计算波动率
df_combined3['volatility'] = df_combined3['y'].pct_change().round(4)
# 计算近60日的波动率 10% 90%分位数
df_combined3['quantile_10'] = df_combined3['volatility'].rolling(60).quantile(0.1)
df_combined3['quantile_90'] = df_combined3['volatility'].rolling(60).quantile(0.9)
df_combined3 = df_combined3.round(4)
# 计算分位数对应的价格
df_combined3['quantile_10_price'] = df_combined3['y'] * (1 + df_combined3['quantile_10'])
df_combined3['quantile_90_price'] = df_combined3['y'] * (1 + df_combined3['quantile_90'])
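# Sketch of the channel built here: for each date t,
#     lower_t = y_t * (1 + rolling 60-day 10% quantile of pct_change(y))
#     upper_t = y_t * (1 + rolling 60-day 90% quantile of pct_change(y))
# i.e. the latest price scaled by the tails of recent daily returns.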
# 遍历行
def find_min_max_within_quantile(row):
# 获取分位数10%和90%的值
q10 = row['quantile_10_price']
q90 = row['quantile_90_price']
# If either quantile price is NaN, the channel is undefined for this row
if pd.isna(q10) or pd.isna(q90):
return pd.Series([None, None, None, None], index=['min_within_quantile','max_within_quantile','min_model','max_model'])
# 初始化最小和最大值为None
min_value = None
max_value = None
min_value_model = ''
max_value_model = ''
# 遍历指定列,找出在分位数范围内的最大最小值
for model in modelnames:
value = row[model]
if value >= q10 and value <= q90:
if min_value is None or value < min_value:
min_value = value
min_value_model = model
if max_value is None or value > max_value:
max_value = value
max_value_model = model
# 返回最大最小值
return pd.Series([min_value, max_value,min_value_model,max_value_model], index=['min_within_quantile', 'max_within_quantile','min_model','max_model'])
# 应用函数到每一行
df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# 去除有空值的行
# df_combined3.dropna(inplace=True)
# 保存到数据库
df_combined3.to_sql('testandpredict_groupby', sqlitedb.connection, if_exists='replace', index=False)
df_combined3.to_csv(os.path.join(dataset,"testandpredict_groupby.csv"),index=False)
# 空的列表存储每个模型的MSE、RMSE、MAE、MAPE、SMAPE
cellText = []
# 遍历模型名称,计算模型评估指标
for model in modelnames:
modelmse = mse(df_combined['y'], df_combined[model])
modelrmse = rmse(df_combined['y'], df_combined[model])
modelmae = mae(df_combined['y'], df_combined[model])
# modelmape = mape(df_combined['y'], df_combined[model])
# modelsmape = smape(df_combined['y'], df_combined[model])
# modelr2 = r2_score(df_combined['y'], df_combined[model])
cellText.append([model,round(modelmse, 3), round(modelrmse, 3), round(modelmae, 3)])
model_results3 = pd.DataFrame(cellText,columns=['模型(Model)','平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)'])
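# For reference: MSE = mean((y - yhat)^2), RMSE = sqrt(MSE), MAE = mean(|y - yhat|); lower is better for all three.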
# Sort by MSE in ascending order (best model first)
model_results3 = model_results3.sort_values(by='平均平方误差(MSE)', ascending=True)
model_results3.to_csv(os.path.join(dataset,"model_evaluation.csv"),index=False)
modelnames = model_results3['模型(Model)'].tolist()
allmodelnames = modelnames.copy()
# 保存5个最佳模型的名称
if len(modelnames) > 5:
modelnames = modelnames[0:5]
with open(os.path.join(dataset,"best_modelnames.txt"), 'w') as f:
f.write(','.join(modelnames) + '\n')
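# best_modelnames.txt is read back by ex_Model (when is_fivemodels=True) and by model_losss_juxiting,
# so later runs and plots can be restricted to these five models.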
# 预测值与真实值对比图
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(15, 10))
# 设置有5个子图的画布
for n,model in enumerate(modelnames):
plt.subplot(3, 2, n+1)
plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值')
plt.plot(df_combined3['ds'], df_combined3[model], label=model)
plt.legend()
plt.xlabel('日期')
plt.ylabel('价格')
plt.title(model+'拟合')
plt.subplots_adjust(hspace=0.5)
plt.savefig(os.path.join(dataset,'预测值与真实值对比图.png'), bbox_inches='tight')
plt.close()
# 历史数据+预测数据
# 拼接未来时间预测
df_predict = loadcsv(os.path.join(dataset,'predict.csv'))
df_predict.drop('unique_id',inplace=True,axis=1)
df_predict.dropna(axis=1,inplace=True)
try:
df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y-%m-%d')
except ValueError :
df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y/%m/%d')
# 取第一行数据存储到数据库中
first_row = df_predict.head(1)
first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00')
# 将预测结果保存到数据库
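# Upsert into SQLite: create the table on first use; otherwise add any missing columns, update the
# existing row with the same ds, or insert a new row.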
if not sqlitedb.check_table_exists('trueandpredict'):
first_row.to_sql('trueandpredict',sqlitedb.connection,index=False)
else:
for row in first_row.itertuples(index=False):
row_dict = row._asdict()
columns=row_dict.keys()
for col in columns:
sqlitedb.add_column_if_not_exists('trueandpredict',col,'TEXT')
check_query = sqlitedb.select_data('trueandpredict',where_condition = f"ds = '{row.ds}'")
if len(check_query) > 0:
set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()])
sqlitedb.update_data('trueandpredict',set_clause,where_condition = f"ds = '{row.ds}'")
continue
sqlitedb.insert_data('trueandpredict',tuple(row_dict.values()),columns=columns)
# 最多频率的模型名称
min_model_max_frequency_model = df_combined3['min_model'].value_counts().idxmax()
max_model_max_frequency_model = df_combined3['max_model'].value_counts().idxmax()
df_predict['min_model'] = min_model_max_frequency_model
df_predict['max_model'] = max_model_max_frequency_model
df_predict['min_within_quantile'] = df_predict[min_model_max_frequency_model]
df_predict['max_within_quantile'] = df_predict[max_model_max_frequency_model]
df_predict2 = df_predict.copy()
df_predict2['ds'] = df_predict2['ds'].dt.strftime('%Y-%m-%d 00:00:00')
# 将预测结果保存到数据库
# 判断表存在
if not sqlitedb.check_table_exists('testandpredict_groupby'):
df_predict2.to_sql('testandpredict_groupby',sqlitedb.connection,index=False)
else:
for row in df_predict2.itertuples(index=False):
row_dict = row._asdict()
check_query = sqlitedb.select_data('testandpredict_groupby',where_condition = f"ds = '{row.ds}'")
if len(check_query) > 0:
set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()])
sqlitedb.update_data('testandpredict_groupby',set_clause,where_condition = f"ds = '{row.ds}'")
continue
sqlitedb.insert_data('testandpredict_groupby',tuple(row_dict.values()),columns=row_dict.keys())
# 计算每个预测值与真实值之间的偏差率
for model in allmodelnames:
df_combined3[f'{model}_abs_error_rate'] = abs(df_combined3['y'] - df_combined3[model]) / df_combined3['y']
# 获取每行对应的最小偏差率值
min_abs_error_rate_values = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1)
# 获取每行对应的最小偏差率值对应的列名
min_abs_error_rate_column_name = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1)
# 将列名索引转换为列名
min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0])
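# Recover the model name from '<model>_abs_error_rate'; note this assumes model names contain no
# underscore of their own, otherwise split('_')[0] would truncate them.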
# 获取最小偏差率对应的模型的预测值
min_abs_error_rate_predictions = df_combined3.apply(lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1)
# 将最小偏差率对应的模型的预测值添加到DataFrame中
df_combined3['min_abs_error_rate_prediction'] = min_abs_error_rate_predictions
df_combined3['min_abs_error_rate_column_name'] = min_abs_error_rate_column_name
df_combined3 = pd.concat([df_combined3, df_predict]).reset_index(drop=True)
# 判断 df 的数值列转为float
for col in df_combined3.columns:
try:
if col != 'ds':
df_combined3[col] = df_combined3[col].astype(float)
df_combined3[col] = df_combined3[col].round(2)
except ValueError:
pass
df_combined3.to_csv(os.path.join(dataset,"df_combined3.csv"),index=False)
# 历史价格+预测价格
df_combined3 = df_combined3[-50:] # 取50个数据点画图
# 历史价格
plt.figure(figsize=(20, 10))
plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值')
# 颜色填充
plt.fill_between(df_combined3['ds'], df_combined3['min_within_quantile'], df_combined3['max_within_quantile'], alpha=0.2)
# plt.plot(df_combined3['ds'], df_combined3['min_abs_error_rate_prediction'], label='最小绝对误差', linestyle='--', color='orange')
# 网格
plt.grid(True)
# 显示历史值
for i, j in zip(df_combined3['ds'], df_combined3['y']):
plt.text(i, j, str(j), ha='center', va='bottom')
# 数据库查询最佳模型名称
most_model = [sqlitedb.select_data('most_model',columns=['most_common_model'],order_by='ds desc',limit=1).values[0][0]]
for model in most_model:
plt.plot(df_combined3['ds'], df_combined3[model], label=model,marker='o')
# 当前日期画竖虚线
plt.axvline(x=df_combined3['ds'].iloc[-horizon], color='r', linestyle='--')
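# The vertical dashed line marks the start of the forecast window: the last `horizon` rows of
# df_combined3 come from df_predict (future dates with no true y).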
plt.legend()
plt.xlabel('日期')
plt.ylabel('价格')
plt.savefig(os.path.join(dataset,'历史价格-预测值.png'), bbox_inches='tight')
plt.close()
# 预测值表格
fig, ax = plt.subplots(figsize=(20, 6))
ax.axis('off') # 关闭坐标轴
# 数值保留2位小数
df_combined3 = df_combined3.round(2)
df_combined3 = df_combined3[-horizon:]
df_combined3['Day'] = [f'Day_{i}' for i in range(1,horizon+1)]
# Day列放到最前面
df_combined3 = df_combined3[['Day'] + list(df_combined3.columns[:-1])]
table = ax.table(cellText=df_combined3.values, colLabels=df_combined3.columns, loc='center')
#加宽表格
table.auto_set_font_size(False)
table.set_fontsize(10)
# 设置表格样式,列数据最小的用绿色标识
plt.savefig(os.path.join(dataset,'预测值表格.png'), bbox_inches='tight')
plt.close()
# plt.show()
# 可视化评估结果
plt.rcParams['font.sans-serif'] = ['SimHei']
fig, ax = plt.subplots(figsize=(20, 10))
ax.axis('off') # 关闭坐标轴
table = ax.table(cellText=model_results3.values, colLabels=model_results3.columns, loc='center')
# 加宽表格
table.auto_set_font_size(False)
table.set_fontsize(10)
# 设置表格样式,列数据最小的用绿色标识
plt.savefig(os.path.join(dataset,'模型评估.png'), bbox_inches='tight')
plt.close()
return model_results3
# 计算预测评估指数
def model_losss_juxiting(sqlitedb):
global dataset
most_model = [sqlitedb.select_data('most_model',columns=['most_common_model'],order_by='ds desc',limit=1).values[0][0]]
most_model_name = most_model[0]
# 预测数据处理 predict
df_combined = loadcsv(os.path.join(dataset,"cross_validation.csv"))
df_combined = dateConvert(df_combined)
# 删除空列
df_combined.dropna(axis=1,inplace=True)
# 删除缺失值,预测过程不能有缺失值
df_combined.dropna(inplace=True)
# 其他列转为数值类型
df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in ['cutoff','ds'] })
# For each ds group, take the maximum cutoff and store it in a new column
df_combined['max_cutoff'] = df_combined.groupby('ds')['cutoff'].transform('max')
# Keep only the rows whose cutoff equals max_cutoff, i.e. the most recent forecast for each date
df_combined = df_combined[df_combined['cutoff'] == df_combined['max_cutoff']]
# 删除模型生成的cutoff列
df_combined.drop(columns=['cutoff', 'max_cutoff'], inplace=True)
# 获取模型名称
modelnames = df_combined.columns.to_list()[1:]
if 'y' in modelnames:
modelnames.remove('y')
df_combined3 = df_combined.copy() # 备份df_combined,后面画图需要
# 空的列表存储每个模型的MSE、RMSE、MAE、MAPE、SMAPE
cellText = []
# 遍历模型名称,计算模型评估指标
for model in modelnames:
modelmse = mse(df_combined['y'], df_combined[model])
modelrmse = rmse(df_combined['y'], df_combined[model])
modelmae = mae(df_combined['y'], df_combined[model])
# modelmape = mape(df_combined['y'], df_combined[model])
# modelsmape = smape(df_combined['y'], df_combined[model])
# modelr2 = r2_score(df_combined['y'], df_combined[model])
cellText.append([model,round(modelmse, 3), round(modelrmse, 3), round(modelmae, 3)])
model_results3 = pd.DataFrame(cellText,columns=['模型(Model)','平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)'])
# Sort by MSE in ascending order (best model first)
model_results3 = model_results3.sort_values(by='平均平方误差(MSE)', ascending=True)
model_results3.to_csv(os.path.join(dataset,"model_evaluation.csv"),index=False)
modelnames = model_results3['模型(Model)'].tolist()
allmodelnames = modelnames.copy()
# 保存5个最佳模型的名称
if len(modelnames) > 5:
modelnames = modelnames[0:5]
with open(os.path.join(dataset,"best_modelnames.txt"), 'w') as f:
f.write(','.join(modelnames) + '\n')
# 预测值与真实值对比图
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(15, 10))
for n,model in enumerate(modelnames[:5]):
plt.subplot(3, 2, n+1)
plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值')
plt.plot(df_combined3['ds'], df_combined3[model], label=model)
plt.legend()
plt.xlabel('日期')
plt.ylabel('价格')
plt.title(model+'拟合')
plt.subplots_adjust(hspace=0.5)
plt.savefig(os.path.join(dataset,'预测值与真实值对比图.png'), bbox_inches='tight')
plt.close()
# # 根据真实值y确定最大最小值,去掉最高最低的预测值
# import heapq # 使用堆来找到最大和最小的值
# def find_min_max_within_quantile(row):
# true_value = row['y']
# row.drop(['ds','y'], inplace=True)
# row = row.astype(float).round(2)
# max_heap = []
# min_heap = []
# for col in row.index:
# # 对比真实值进行分类
# if row[col] < true_value:
# heapq.heappush(min_heap, row[col])
# elif row[col] > true_value:
# heapq.heappush(max_heap, -row[col]) # 使用负号来实现最大堆
# if len(max_heap) == 1:
# max_y = max_heap[0]
# elif len(max_heap) == 0:
# max_y = -min_heap[-1]
# else:
# max_y = heapq.nsmallest(2, max_heap)[1]
# if len(min_heap) < 2 :
# min_y = -max_heap[-1]
# else:
# min_y = heapq.nsmallest(2, min_heap)[-1]
# # 获取最大和最小的值
# q10 = min_y
# q90 = -max_y
# # 获取最大和最小的模型名称
# min_model = row[row == q10].idxmin()
# max_model = row[row == q90].idxmax()
# # 设置上下界比例
# rote = 1
# q10 = q10 * rote
# q90 = q90 * rote
# logger.info(min_model,q10,max_model,q90)
# return pd.Series([q10, q90, min_model, max_model], index=['min_within_quantile', 'max_within_quantile', 'min_model', 'max_model'])
# # # 遍历行
# df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# df_combined = df_combined.round(4)
# print(df_combined3)
#使用最佳五个模型进行绘图
best_models = pd.read_csv(os.path.join(dataset,'best_modelnames.txt'),header=None).values.flatten().tolist()
def find_min_max_within_quantile(row):
row = row[best_models]
q10 = row.min()
q90 = row.max()
# 获取 row行最大最小值模型名称
min_model = row[row == q10].idxmin()
max_model = row[row == q90].idxmin()
# # 判断flot值是否为空值
# if pd.isna(q10) or pd.isna(q90):
return pd.Series([q10, q90,min_model,max_model], index=['min_within_quantile','max_within_quantile','min_model','max_model'])
# 遍历行
df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
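# Unlike model_losss, which builds the channel from 60-day volatility quantiles, this version simply
# takes the per-date minimum and maximum forecast across the five best models.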
df_combined = df_combined.round(4)
print(df_combined3)
# # 通道使用模型评估前80%作为置信度
# def find_min_max_within_quantile(row):
# row.drop(['ds','y'], inplace=True)
# row = row.astype(float).round(2)
# row_sorted = row
# # 计算 10% 和 90% 位置的索引
# index_10 = 0
# index_90 = int(len(row_sorted) * 0.8)
# q10 = row_sorted[index_10]
# q90 = row_sorted[index_90]
# # 获取模型名称
# min_model = row[row == q10].idxmin()
# max_model = row[row == q90].idxmin()
# # # 判断flot值是否为空值
# # if pd.isna(q10) or pd.isna(q90):
# return pd.Series([q10, q90,min_model,max_model], index=['min_within_quantile','max_within_quantile','min_model','max_model'])
# # 重新排列
# df_combined3 = df_combined3[['ds','y'] + allmodelnames]
# # 遍历行
# df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# df_combined = df_combined.round(4)
# print(df_combined3)
# # 通道使用预测模型的80%置信度
# def find_min_max_within_quantile(row):
# row.drop(['ds','y'], inplace=True)
# row = row.astype(float).round(2)
# row_sorted = row.sort_values(ascending=True).reset_index(drop=True)
# # 计算 10% 和 90% 位置的索引
# index_10 = int(len(row_sorted) * 0.1)
# index_90 = int(len(row_sorted) * 0.9)
# q10 = row_sorted[index_10]
# q90 = row_sorted[index_90]
# # 获取模型名称
# min_model = row[row == q10].idxmin()
# max_model = row[row == q90].idxmin()
# # # 判断flot值是否为空值
# # if pd.isna(q10) or pd.isna(q90):
# return pd.Series([q10, q90,min_model,max_model], index=['min_within_quantile','max_within_quantile','min_model','max_model'])
# # 遍历行
# df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# df_combined = df_combined.round(4)
# print(df_combined3)
# # 计算波动率
# df_combined3['volatility'] = df_combined3['y'].pct_change().round(4)
# # 计算近60日的波动率 10% 90%分位数
# df_combined3['quantile_10'] = df_combined3['volatility'].rolling(60).quantile(0.1)
# df_combined3['quantile_90'] = df_combined3['volatility'].rolling(60).quantile(0.9)
# df_combined3 = df_combined3.round(4)
# # 计算分位数对应的价格
# df_combined3['quantile_10_price'] = df_combined3['y'] * (1 + df_combined3['quantile_10'])
# df_combined3['quantile_90_price'] = df_combined3['y'] * (1 + df_combined3['quantile_90'])
# # 遍历行
# def find_min_max_within_quantile(row):
# # 获取分位数10%和90%的值
# q10 = row['quantile_10_price']
# q90 = row['quantile_90_price']
# # 判断flot值是否为空值
# if pd.isna(q10) or pd.isna(q90):
# return pd.Series([None, None, None, None], index=['min_within_quantile','max_within_quantile','min_model','max_model'])
# # 初始化最小和最大值为None
# min_value = None
# max_value = None
# min_value_model = ''
# max_value_model = ''
# # 遍历指定列,找出在分位数范围内的最大最小值
# for model in modelnames:
# value = row[model]
# if value >= q10 and value <= q90:
# if min_value is None or value < min_value:
# min_value = value
# min_value_model = model
# if max_value is None or value > max_value:
# max_value = value
# max_value_model = model
# # 返回最大最小值
# return pd.Series([min_value, max_value,min_value_model,max_value_model], index=['min_within_quantile', 'max_within_quantile','min_model','max_model'])
# # 应用函数到每一行
# df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1)
# 去除有空值的行
# df_combined3.dropna(inplace=True)
# # 保存到数据库
# df_combined3.to_sql('testandpredict_groupby', sqlitedb.connection, if_exists='replace', index=False)
# df_combined3.to_csv(os.path.join(dataset,"testandpredict_groupby.csv"),index=False)
'''
# 去掉方差最大的模型,其余模型预测最大最小值确定通道边界
# 历史数据+预测数据
# 拼接未来时间预测
df_predict = loadcsv(os.path.join(dataset,'predict.csv'))
df_predict.drop('unique_id',inplace=True,axis=1)
df_predict.dropna(axis=1,inplace=True)
df_predict2 = df_predict.copy()
try:
df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y-%m-%d')
except ValueError :
df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y/%m/%d')
# 取第一行数据存储到数据库中
first_row = df_predict.head(1)
first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00')
# # 将预测结果保存到数据库
df_combined3 = pd.concat([df_combined3, df_predict]).reset_index(drop=True)
# # 判断 df 的数值列转为float
for col in df_combined3.columns:
try:
if col != 'ds':
df_combined3[col] = df_combined3[col].astype(float)
df_combined3[col] = df_combined3[col].round(2)
except ValueError:
pass
df_combined3.to_csv(os.path.join(dataset,"testandpredict_groupby.csv"),index=False)
df_combined3['ds'] = df_combined3['ds'].dt.strftime('%Y-%m-%d 00:00:00')
# # 判断表存在
if not sqlitedb.check_table_exists('testandpredict_groupby'):
df_combined3.to_sql('testandpredict_groupby',sqlitedb.connection,index=False)
else:
for row in df_combined3.itertuples(index=False):
row_dict = row._asdict()
check_query = sqlitedb.select_data('testandpredict_groupby',where_condition = f"ds = '{row.ds}'")
if len(check_query) > 0:
set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()])
sqlitedb.update_data('testandpredict_groupby',set_clause,where_condition = f"ds = '{row.ds}'")
continue
sqlitedb.insert_data('testandpredict_groupby',tuple(row_dict.values()),columns=row_dict.keys())
ten_models = allmodelnames
# 计算每个模型的方差
variances = df_combined3[ten_models].var()
# 找到方差最大的模型
max_variance_model = variances.idxmax()
# 打印方差最大的模型
print("方差最大的模型是:", max_variance_model)
# 去掉方差最大的模型
df_combined3 = df_combined3.drop(columns=[max_variance_model])
if max_variance_model in allmodelnames:
allmodelnames.remove(max_variance_model)
df_combined3['min'] = df_combined3[allmodelnames].min(axis=1)
df_combined3['max'] = df_combined3[allmodelnames].max(axis=1)
print(df_combined3[['min','max']])
# 历史价格+预测价格
df_combined3 = df_combined3[-50:] # 取50个数据点画图
plt.figure(figsize=(20, 10))
plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值',marker='o')
plt.plot(df_combined3['ds'], df_combined3[most_model], label=most_model_name)
plt.fill_between(df_combined3['ds'], df_combined3['min'], df_combined3['max'], alpha=0.2)
plt.grid(True)
# # 显示历史值
for i, j in zip(df_combined3['ds'][:-5], df_combined3['y'][:-5]):
plt.text(i, j, str(j), ha='center', va='bottom')
# 当前日期画竖虚线
plt.axvline(x=df_combined3['ds'].iloc[-horizon], color='r', linestyle='--')
plt.legend()
plt.xlabel('日期')
plt.ylabel('价格')
plt.savefig(os.path.join(dataset,'历史价格-预测值.png'), bbox_inches='tight')
plt.close()
'''
# # 历史数据+预测数据
# # 拼接未来时间预测
df_predict = pd.read_csv(os.path.join(dataset,'predict.csv'))
df_predict.drop('unique_id',inplace=True,axis=1)
df_predict.dropna(axis=1,inplace=True)
try:
df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y-%m-%d')
except ValueError :
df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y/%m/%d')
def first_row_to_database(df):
# # 取第一行数据存储到数据库中
first_row = df.head(1)
first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00')
# 将预测结果保存到数据库
if not sqlitedb.check_table_exists('trueandpredict'):
first_row.to_sql('trueandpredict',sqlitedb.connection,index=False)
else:
for col in first_row.columns:
sqlitedb.add_column_if_not_exists('trueandpredict',col,'TEXT')
for row in first_row.itertuples(index=False):
row_dict = row._asdict()
columns=row_dict.keys()
check_query = sqlitedb.select_data('trueandpredict',where_condition = f"ds = '{row.ds}'")
if len(check_query) > 0:
set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()])
sqlitedb.update_data('trueandpredict',set_clause,where_condition = f"ds = '{row.ds}'")
continue
sqlitedb.insert_data('trueandpredict',tuple(row_dict.values()),columns=columns)
first_row_to_database(df_predict)
def find_most_common_model():
# 最多频率的模型名称
min_model_max_frequency_model = df_combined3['min_model'].tail(20).value_counts().idxmax()
max_model_max_frequency_model = df_combined3['max_model'].tail(20).value_counts().idxmax()
if min_model_max_frequency_model == max_model_max_frequency_model:
# 取20天第二多的模型
max_model_max_frequency_model = df_combined3['max_model'].tail(20).value_counts().nlargest(2).index[1]
df_predict['min_model'] = min_model_max_frequency_model
df_predict['max_model'] = max_model_max_frequency_model
df_predict['min_within_quantile'] = df_predict[min_model_max_frequency_model]
df_predict['max_within_quantile'] = df_predict[max_model_max_frequency_model]
# find_most_common_model()
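# Note: the call above is commented out, so min_model/max_model and the *_within_quantile columns are
# only added to df_predict when find_most_common_model() is actually invoked.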
df_predict2 = df_predict.copy()
df_predict2['ds'] = pd.to_datetime(df_predict2['ds'])
df_predict2['ds'] = df_predict2['ds'].dt.strftime('%Y-%m-%d 00:00:00')
def _add_abs_error_rate():
# 计算每个预测值与真实值之间的偏差率
for model in allmodelnames:
df_combined3[f'{model}_abs_error_rate'] = abs(df_combined3['y'] - df_combined3[model]) / df_combined3['y']
# 获取每行对应的最小偏差率值
min_abs_error_rate_values = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1)
# 获取每行对应的最小偏差率值对应的列名
min_abs_error_rate_column_name = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1)
# 将列名索引转换为列名
min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0])
# 获取最小偏差率对应的模型的预测值
min_abs_error_rate_predictions = df_combined3.apply(lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1)
# 将最小偏差率对应的模型的预测值添加到DataFrame中
df_combined3['min_abs_error_rate_prediction'] = min_abs_error_rate_predictions
df_combined3['min_abs_error_rate_column_name'] = min_abs_error_rate_column_name
_add_abs_error_rate()
df_combined3 = pd.concat([df_combined3, df_predict]).reset_index(drop=True)
# 判断 df 的数值列转为float
for col in df_combined3.columns:
try:
if col != 'ds':
df_combined3[col] = df_combined3[col].astype(float)
df_combined3[col] = df_combined3[col].round(2)
except ValueError:
pass
df_combined3.to_csv(os.path.join(dataset,"df_combined3.csv"),index=False)
# 历史价格+预测价格
# 将预测结果保存到数据库
# 判断表存在
# if not sqlitedb.check_table_exists('testandpredict_groupby'):
# df_combined3.to_sql('testandpredict_groupby',sqlitedb.connection,index=False)
# else:
# for row in df_combined3.itertuples(index=False):
# row_dict = row._asdict()
# check_query = sqlitedb.select_data('testandpredict_groupby',where_condition = f"ds = '{row.ds}'")
# if len(check_query) > 0:
# set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()])
# sqlitedb.update_data('testandpredict_groupby',set_clause,where_condition = f"ds = '{row.ds}'")
# continue
# sqlitedb.insert_data('testandpredict_groupby',tuple(row_dict.values()),columns=row_dict.keys())
def _plt_predict_ture(df):
df = df[-50:] # 取50个数据点画图
# 历史价格
plt.figure(figsize=(20, 10))
plt.plot(df['ds'], df['y'], label='真实值')
# 颜色填充
plt.fill_between(df['ds'], df['min_within_quantile'], df['max_within_quantile'], alpha=0.2)
# plt.plot(df_combined3['ds'], df_combined3['min_abs_error_rate_prediction'], label='最小绝对误差', linestyle='--', color='orange')
# 网格
plt.grid(True)
# 显示历史值
for i, j in zip(df['ds'], df['y']):
plt.text(i, j, str(j), ha='center', va='bottom')
for model in most_model:
plt.plot(df['ds'], df[model], label=model,marker='o')
# 当前日期画竖虚线
plt.axvline(x=df['ds'].iloc[-horizon], color='r', linestyle='--')
plt.legend()
plt.xlabel('日期')
plt.ylabel('价格')
plt.savefig(os.path.join(dataset,'历史价格-预测值.png'), bbox_inches='tight')
plt.close()
def _plt_predict_table(df):
# 预测值表格
fig, ax = plt.subplots(figsize=(20, 6))
ax.axis('off') # 关闭坐标轴
# 数值保留2位小数
df = df.round(2)
df = df[-horizon:]
df['Day'] = [f'Day_{i}' for i in range(1,horizon+1)]
# Day列放到最前面
df = df[['Day'] + list(df.columns[:-1])]
table = ax.table(cellText=df.values, colLabels=df.columns, loc='center')
#加宽表格
table.auto_set_font_size(False)
table.set_fontsize(10)
# 设置表格样式,列数据最小的用绿色标识
plt.savefig(os.path.join(dataset,'预测值表格.png'), bbox_inches='tight')
plt.close()
def _plt_model_results3():
# 可视化评估结果
plt.rcParams['font.sans-serif'] = ['SimHei']
fig, ax = plt.subplots(figsize=(20, 10))
ax.axis('off') # 关闭坐标轴
table = ax.table(cellText=model_results3.values, colLabels=model_results3.columns, loc='center')
# 加宽表格
table.auto_set_font_size(False)
table.set_fontsize(10)
# 设置表格样式,列数据最小的用绿色标识
plt.savefig(os.path.join(dataset,'模型评估.png'), bbox_inches='tight')
plt.close()
_plt_predict_ture(df_combined3)
_plt_predict_table(df_combined3)
_plt_model_results3()
return model_results3
import matplotlib.dates as mdates
def brent_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inputsize=5,dataset='dataset',time = '2024-07-30',reportname='report.pdf',sqlitedb='jbsh_yuanyou.db'):
global y
# 创建内容对应的空列表
content = list()
# 获取特征的近一月值
import pandas as pd
feature_data_df = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'), parse_dates=['ds']).tail(20)
def draw_feature_trend(feature_data_df, features):
# 画特征近一周的趋势图
feature_df = feature_data_df[['ds','y']+features]
# 遍历X每一列和yy画散点图
for i, col in enumerate(features):
# try:
print(f'正在绘制第{i+1}个特征{col}与价格散点图...')
if col not in ['ds', 'y']:
fig, ax1 = plt.subplots(figsize=(10, 6))
# 在第一个坐标轴上绘制数据
sns.lineplot(data=feature_df, x='ds', y='y', ax=ax1, color='b')
ax1.set_xlabel('日期')
ax1.set_ylabel('y', color='b')
ax1.tick_params('y', colors='b')
# 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠
for j in range(1, len(feature_df), 2):
value = feature_df['y'].iloc[j]
date = feature_df['ds'].iloc[j]
offset = 1.001
ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10)
# 创建第二个坐标轴
ax2 = ax1.twinx()
# 在第二个坐标轴上绘制数据
sns.lineplot(data=feature_df, x='ds', y=col, ax=ax2, color='r')
ax2.set_ylabel(col, color='r')
ax2.tick_params('y', colors='r')
# 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠
for j in range(0, len(feature_df), 2):
value = feature_df[col].iloc[j]
date = feature_df['ds'].iloc[j]
offset = 1.0003
ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10)
# 添加标题
plt.title(col)
# 设置横坐标为日期格式并自动调整
locator = mdates.AutoDateLocator()
formatter = mdates.AutoDateFormatter(locator)
ax1.xaxis.set_major_locator(locator)
ax1.xaxis.set_major_formatter(formatter)
# 文件名特殊字符处理
col = col.replace('*', '-')
col = col.replace(':', '-')
col = col.replace(r'/', '-')
plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png')))
plt.close()
# except Exception as e:
# print(f'绘制第{i+1}个特征{col}与价格散点图时出错:{e}')
### 添加标题
content.append(Graphs.draw_title(f'{y}{time}预测报告'))
### 预测结果
content.append(Graphs.draw_little_title('一、预测结果:'))
# 添加历史走势及预测价格的走势图片
content.append(Graphs.draw_img(os.path.join(dataset,'历史价格-预测值.png')))
# 波动率画图逻辑
content.append(Graphs.draw_text('图示说明:'))
content.append(Graphs.draw_text('1. 确定波动率置信区间统计近60个交易日的真实价格波动率找出在 10% 90% 的分位值作为波动率置信区间;'))
content.append(Graphs.draw_text('2. 确定通道上界:在所有模型的预测结果中 <= 前一天真实价格 乘以 90%的置信波动分位数'))
content.append(Graphs.draw_text('3. 确定通道下界:在所有模型的预测结果中 >= 前一天真实价格 乘以 10%的置信波动分位数'))
content.append(Graphs.draw_text('4. 预测结果没有真实值作为参考依据通道上界取近20个交易日内预测在上界值的模型对应的预测值通道下界同理'))
content.append(Graphs.draw_text('5. 预测结果选用近20个交易日内最多接近真实值的模型的预测值对应的预测结果'))
content.append(Graphs.draw_text('6. 预测结果在通道外的,代表最接近真实值的预测结果不在置信波动范围内。'))
# 取df中y列为空的行
import pandas as pd
df = pd.read_csv(os.path.join(dataset,'predict.csv'),encoding='gbk')
df_true = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') # 获取预测日期对应的真实值
df_true = df_true[['ds','y']]
eval_df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8')
# model_evaluation.csv is already sorted by MSE (best model first)
# Note: despite the name, fivemodels_list holds every model; slices like [:5] below pick the top five
fivemodels_list = eval_df['模型(Model)'].values # used later as column labels
# 取 fivemodels_list 和 ds 列
df = df[['ds'] + fivemodels_list.tolist() ]
# 拼接预测日期对应的真实值
df = pd.merge(df, df_true, on='ds', how='left')
# 删除全部为nan的列
df = df.dropna(how='all', axis=1)
# 选择除 'ds' 列外的数值列,并进行类型转换和四舍五入
num_cols = [col for col in df.columns if col!= 'ds' and pd.api.types.is_numeric_dtype(df[col])]
for col in num_cols:
df[col] = df[col].astype(float).round(2)
# 添加最大值、最小值、平均值三列
df['平均值'] = df[num_cols].mean(axis=1).round(2)
df['最大值'] = df[num_cols].max(axis=1)
df['最小值'] = df[num_cols].min(axis=1)
# df转置
df = df.T
# df重置索引
df = df.reset_index()
# 添加预测值表格
data = df.values.tolist()
col_width = 500/len(df.columns)
content.append(Graphs.draw_table(col_width,*data))
content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:'))
df = pd.read_csv(os.path.join(dataset,'testandpredict_groupby.csv'),encoding='utf-8')
df4 = df.copy() # 计算偏差率使用
# 计算模型偏差率
#计算各列对于y列的差值百分比
df3 = pd.DataFrame() # 存储偏差率
# 删除有null的行
df4 = df4.dropna()
df3['ds'] = df4['ds']
for col in fivemodels_list:
df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100,2)
# 找出决定系数前五的偏差率
df3 = df3[['ds']+fivemodels_list.tolist()][-inputsize:]
# 找出上一预测区间的时间
stime = df3['ds'].iloc[0]
etime = df3['ds'].iloc[-1]
# Deviation-rate table
fivemodels = ', '.join(eval_df['模型(Model)'].values[:5]) # joined into one string for the sentence below
content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练使用评估结果MSE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime}的偏差率(%)分别是:'))
# # 添加偏差率表格
df3 = df3.T
df3 = df3.reset_index()
data = df3.values.tolist()
col_width = 500/len(df3.columns)
content.append(Graphs.draw_table(col_width,*data))
content.append(Graphs.draw_little_title('三、预测过程解析:'))
### 特征、模型、参数配置
content.append(Graphs.draw_little_title('模型选择:'))
content.append(Graphs.draw_text(f'本次预测使用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型:'))
content.append(Graphs.draw_text(f'使用40天的数据预测未来{inputsize}天的数据。'))
content.append(Graphs.draw_little_title('指标情况:'))
with open(os.path.join(dataset,'特征频度统计.txt'),encoding='utf-8') as f:
for line in f.readlines():
content.append(Graphs.draw_text(line))
data = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') # 计算相关系数用
df_zhibiaofenlei = loadcsv(os.path.join(dataset,'特征处理后的指标名称及分类.csv')) # 气泡图用
df_zhibiaoshuju = data.copy() # 气泡图用
# 绘制特征相关气泡图
grouped = df_zhibiaofenlei.groupby('指标分类')
grouped_corr = pd.DataFrame(columns=['指标分类', '指标数量', '相关性总和'])
content.append(Graphs.draw_little_title('按指标分类分别与预测目标进行皮尔逊相关系数分析:'))
content.append(Graphs.draw_text('''皮尔逊相关系数说明:'''))
content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。'''))
content.append(Graphs.draw_text('''
相关系数为1表示两个变量之间存在完全正向的线性关系即当一个变量增加时另一个变量也相应增加且变化是完全一致的。'''))
content.append(Graphs.draw_text('''相关系数为-1表示两个变量之间存在完全负向的线性关系即当一个变量增加时另一个变量会相应减少且变化是完全相反的'''))
content.append(Graphs.draw_text('''相关系数接近0表示两个变量之间不存在线性关系即它们的变化不会随着对方的变化而变化。'''))
for name, group in grouped:
cols = group['指标名称'].tolist()
logger.info(f'开始绘制{name}类指标的相关性直方图')
cols_subset = cols
feature_names = ['y'] + cols_subset
correlation_matrix = df_zhibiaoshuju[feature_names].corr()['y']
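# corr()['y'] returns each selected column's Pearson correlation with y (y's own entry is 1.0);
# the [1:] slice on the positively correlated list below presumably drops that self-correlation entry.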
# 绘制特征相关性直方分布图
plt.figure(figsize=(10,8))
sns.histplot(correlation_matrix.values.flatten(), bins=20, kde=True, color='skyblue')
plt.title(f'{name}类指标(共{len(cols_subset)}个)相关性直方分布图')
plt.xlabel('相关系数')
plt.ylabel('频数')
plt.savefig(os.path.join(dataset, f'{name}类指标相关性直方分布图.png'), bbox_inches='tight')
plt.close()
content.append(Graphs.draw_img(os.path.join(dataset,f'{name}类指标相关性直方分布图.png')))
content.append(Graphs.draw_text(f'{name}类指标(共{len(cols_subset)}个)的相关性直方分布图如上所示。'))
# 相关性大于0的特征
positive_corr_features = correlation_matrix[correlation_matrix > 0].sort_values(ascending=False).index.tolist()[1:]
print(f'{name}下正相关的特征值有:',positive_corr_features)
if len(positive_corr_features) > 5:
positive_corr_features = positive_corr_features[0:5]
content.append(Graphs.draw_text(f'{name}类指标中与预测目标y正相关前五的特征有{positive_corr_features}'))
draw_feature_trend(feature_data_df, positive_corr_features)
elif len(positive_corr_features) == 0:
pass
else:
positive_corr_features = positive_corr_features
content.append(Graphs.draw_text(f'其中与预测目标y正相关的特征有{positive_corr_features}'))
draw_feature_trend(feature_data_df, positive_corr_features)
# 相关性小于0的特征
negative_corr_features = correlation_matrix[correlation_matrix < 0].sort_values(ascending=True).index.tolist()
print(f'{name}下负相关的特征值有:',negative_corr_features)
if len(negative_corr_features) > 5:
negative_corr_features = negative_corr_features[:5]
content.append(Graphs.draw_text(f'与预测目标y负相关前五的特征有{negative_corr_features}'))
draw_feature_trend(feature_data_df, negative_corr_features)
elif len(negative_corr_features) == 0:
pass
else:
content.append(Graphs.draw_text(f'{name}类指标中与预测目标y负相关的特征有{negative_corr_features}'))
draw_feature_trend(feature_data_df, negative_corr_features)
# 计算correlation_sum 第一行的相关性的绝对值的总和
correlation_sum = correlation_matrix.abs().sum()
logger.info(f'{name}类指标的相关性总和为:{correlation_sum}')
# 分组的相关性总和拼接到grouped_corr
goup_corr = pd.DataFrame({'指标分类': [name], '指标数量': [len(cols_subset)], '相关性总和': [correlation_sum]})
grouped_corr = pd.concat([grouped_corr, goup_corr], axis=0, ignore_index=True)
# 绘制相关性总和的气泡图
logger.info(f'开始绘制相关性总和的气泡图')
plt.figure(figsize=(10, 10))
sns.scatterplot(data=grouped_corr, x='相关性总和', y='指标数量', size='相关性总和', sizes=(grouped_corr['相关性总和'].min()*5, grouped_corr['相关性总和'].max()*5), hue='指标分类', palette='viridis')
plt.title('指标分类相关性总和的气泡图')
plt.ylabel('数量')
plt.savefig(os.path.join(dataset, '指标分类相关性总和的气泡图.png'), bbox_inches='tight')
plt.close()
content.append(Graphs.draw_img(os.path.join(dataset,'指标分类相关性总和的气泡图.png')))
content.append(Graphs.draw_text('气泡图中,横轴为指标分类,纵轴为指标分类下的特征数量,气泡的面积越大表示该分类中特征的相关系数和越大。'))
logger.info(f'绘制相关性总和的气泡图结束')
# # 计算特征相关性
# data.rename(columns={y: 'y'}, inplace=True)
# data['ds'] = pd.to_datetime(data['ds'])
# data.drop(columns=['ds'], inplace=True)
# # 创建一个空的 DataFrame 来保存相关系数
# correlation_df = pd.DataFrame(columns=['Feature', 'Correlation'])
# # 计算各特征与目标列的皮尔逊相关系数,并保存到新的 Data 中
# for col in data.columns:
# if col!= 'y':
# pearson_correlation = np.corrcoef(data[col], data['y'])[0, 1]
# spearman_correlation, _ = spearmanr(data[col], data['y'])
# new_row = {'Feature': col, 'Pearson_Correlation': round(pearson_correlation,3), 'Spearman_Correlation': round(spearman_correlation,2)}
# correlation_df = correlation_df._append(new_row, ignore_index=True)
# correlation_df.drop('Correlation', axis=1, inplace=True)
# correlation_df.dropna(inplace=True)
# correlation_df.to_csv(os.path.join(dataset,'指标相关性分析.csv'), index=False)
# data = correlation_df['Pearson_Correlation'].values.tolist()
# # 生成 -1 到 1 的 20 个区间
# bins = np.linspace(-1, 1, 21)
# # 计算每个区间的统计数(这里是区间内数据的数量)
# hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)]
# #设置画布大小
# plt.figure(figsize=(10, 6))
# # 绘制直方图
# plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0]))
# # 添加标题和坐标轴标签
# plt.title('皮尔逊相关系数分布图')
# plt.xlabel('区间')
# plt.ylabel('统计数')
# plt.savefig(os.path.join(dataset, '皮尔逊相关性系数.png'))
# plt.close()
# #设置画布大小
# plt.figure(figsize=(10, 6))
# data = correlation_df['Spearman_Correlation'].values.tolist()
# # 计算每个区间的统计数(这里是区间内数据的数量)
# hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)]
# # 绘制直方图
# plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0]))
# # 添加标题和坐标轴标签
# plt.title('斯皮尔曼相关系数分布图')
# plt.xlabel('区间')
# plt.ylabel('统计数')
# plt.savefig(os.path.join(dataset, '斯皮尔曼相关性系数.png'))
# plt.close()
# content.append(Graphs.draw_text(f'指标相关性分析--皮尔逊相关系数:'))
# # 皮尔逊正相关 不相关 负相关 的表格
# content.append(Graphs.draw_img(os.path.join(dataset,'皮尔逊相关性系数.png')))
# content.append(Graphs.draw_text('''皮尔逊相关系数说明:'''))
# content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。'''))
# content.append(Graphs.draw_text('''
# 相关系数为1表示两个变量之间存在完全正向的线性关系即当一个变量增加时另一个变量也相应增加且变化是完全一致的。'''))
# content.append(Graphs.draw_text('''当前特征中正相关前十的有:'''))
# top10_columns = correlation_df.sort_values(by='Pearson_Correlation',ascending=False).head(10)['Feature'].to_list()
# top10 = ','.join(top10_columns)
# content.append(Graphs.draw_text(f'''{top10}'''))
# feature_df = feature_data_df[['ds','y']+top10_columns]
# # 遍历X每一列和yy画散点图
# for i, col in enumerate(feature_df.columns):
# print(f'正在绘制第{i+1}个特征{col}与价格散点图...')
# if col not in ['ds', 'y']:
# fig, ax1 = plt.subplots(figsize=(10, 6))
# # 在第一个坐标轴上绘制数据
# ax1.plot(feature_df['ds'], feature_df['y'], 'b-')
# ax1.set_xlabel('日期')
# ax1.set_ylabel('y', color='b')
# ax1.tick_params('y', colors='b')
# # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠
# for j in range(1,len(feature_df),2):
# value = feature_df['y'].iloc[j]
# date = feature_df['ds'].iloc[j]
# offset = 1.001
# ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10)
# # 创建第二个坐标轴
# ax2 = ax1.twinx()
# # 在第二个坐标轴上绘制数据
# line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-')
# ax2.set_ylabel(col, color='r')
# ax2.tick_params('y', colors='r')
# # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠
# for j in range(0,len(feature_df),2):
# value = feature_df[col].iloc[j]
# date = feature_df['ds'].iloc[j]
# offset = 1.001
# ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10)
# # 添加标题
# plt.title(col)
# # 设置横坐标为日期格式并自动调整
# locator = mdates.AutoDateLocator()
# formatter = mdates.AutoDateFormatter(locator)
# ax1.xaxis.set_major_locator(locator)
# ax1.xaxis.set_major_formatter(formatter)
# # 文件名特殊字符处理
# col = col.replace('*', '-')
# col = col.replace(':', '-')
# plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
# content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png')))
# plt.close()
# content.append(Graphs.draw_text(f'指标相关性分析--斯皮尔曼相关系数:'))
# # 皮尔逊正相关 不相关 负相关 的表格
# content.append(Graphs.draw_img(os.path.join(dataset,'斯皮尔曼相关性系数.png')))
# content.append(Graphs.draw_text('斯皮尔曼相关系数Spearmans rank correlation coefficient是一种用于衡量两个变量之间的单调关系不一定是线性关系的统计指标。'))
# content.append(Graphs.draw_text('它的计算基于变量的秩次(即变量值的排序位置)而非变量的原始值。'))
# content.append(Graphs.draw_text('斯皮尔曼相关系数的取值范围在 -1 到 1 之间。'))
# content.append(Graphs.draw_text('当系数为 1 时,表示两个变量之间存在完全正的单调关系;'))
# content.append(Graphs.draw_text('''当前特征中正单调关系前十的有:'''))
# top10_columns = correlation_df.sort_values(by='Spearman_Correlation',ascending=False).head(10)['Feature'].to_list()
# top10 = ','.join(top10_columns)
# content.append(Graphs.draw_text(f'''{top10}'''))
# feature_df = feature_data_df[['ds','y']+top10_columns]
# # 遍历X每一列和yy画散点图
# for i, col in enumerate(feature_df.columns):
# print(f'正在绘制第{i+1}个特征{col}与价格散点图...')
# if col not in ['ds', 'y']:
# fig, ax1 = plt.subplots(figsize=(10, 6))
# # 在第一个坐标轴上绘制数据
# ax1.plot(feature_df['ds'], feature_df['y'], 'b-')
# ax1.set_xlabel('日期')
# ax1.set_ylabel('y', color='b')
# ax1.tick_params('y', colors='b')
# # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠
# for j in range(1,len(feature_df),2):
# value = feature_df['y'].iloc[j]
# date = feature_df['ds'].iloc[j]
# offset = 1.001
# ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10)
# # 创建第二个坐标轴
# ax2 = ax1.twinx()
# # 在第二个坐标轴上绘制数据
# line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-')
# ax2.set_ylabel(col, color='r')
# ax2.tick_params('y', colors='r')
# # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠
# for j in range(0,len(feature_df),2):
# value = feature_df[col].iloc[j]
# date = feature_df['ds'].iloc[j]
# offset = 1.001
# ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10)
# # 添加标题
# plt.title(col)
# # 设置横坐标为日期格式并自动调整
# locator = mdates.AutoDateLocator()
# formatter = mdates.AutoDateFormatter(locator)
# ax1.xaxis.set_major_locator(locator)
# ax1.xaxis.set_major_formatter(formatter)
# # 文件名特殊字符处理
# col = col.replace('*', '-')
# col = col.replace(':', '-')
# plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
# content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png')))
# plt.close()
# content.append(Graphs.draw_text('当系数为 -1 时,表示存在完全负的单调关系;'))
# content.append(Graphs.draw_text('''当前特征中负单调关系前十的有:'''))
# tail10_columns = correlation_df.sort_values(by='Spearman_Correlation',ascending=True).head(10)['Feature'].to_list()
# top10 = ','.join(tail10_columns)
# content.append(Graphs.draw_text(f'''{top10}'''))
# # 获取特征的近一周值
# feature_df = feature_data_df[['ds','y']+tail10_columns]
# # 遍历X每一列和yy画散点图
# for i, col in enumerate(feature_df.columns):
# print(f'正在绘制第{i+1}个特征{col}与价格散点图...')
# if col not in ['ds', 'y']:
# fig, ax1 = plt.subplots(figsize=(10, 6))
# # 在第一个坐标轴上绘制数据
# ax1.plot(feature_df['ds'], feature_df['y'], 'b-')
# ax1.set_xlabel('日期')
# ax1.set_ylabel('y', color='b')
# ax1.tick_params('y', colors='b')
# # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠
# for j in range(len(feature_df)):
# if j%2 == 1:
# value = feature_df['y'].iloc[j]
# date = feature_df['ds'].iloc[j]
# offset = 1.001
# ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10)
# # 创建第二个坐标轴
# ax2 = ax1.twinx()
# # 在第二个坐标轴上绘制数据
# line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-')
# ax2.set_ylabel(col, color='r')
# ax2.tick_params('y', colors='r')
# # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠
# for j in range(1,len(feature_df),2):
# value = feature_df[col].iloc[j]
# date = feature_df['ds'].iloc[j]
# offset = 1.001
# ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10)
# # 添加标题
# plt.title(col)
# # 设置横坐标为日期格式并自动调整
# locator = mdates.AutoDateLocator()
# formatter = mdates.AutoDateFormatter(locator)
# ax1.xaxis.set_major_locator(locator)
# ax1.xaxis.set_major_formatter(formatter)
# # 文件名特殊字符处理
# col = col.replace('*', '-')
# col = col.replace(':', '-')
# plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
# content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png')))
# plt.close()
# content.append(Graphs.draw_text('当系数为 0 时,表示两个变量之间不存在单调关系。'))
# content.append(Graphs.draw_text('与皮尔逊相关系数相比,斯皮尔曼相关系数对于数据中的异常值不敏感,更适用于处理非线性关系或存在极端值的数据。'))
content.append(Graphs.draw_little_title('模型选择:'))
content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练拟合通过评估指标MSE从小到大排列前5个模型的简介如下'))
### 读取模型简介
with open(os.path.join(dataset,'model_introduction.txt'), 'r', encoding='utf-8') as f:
for line in f:
line_split = line.strip().split('--')
if line_split[0] in fivemodels_list:
for introduction in line_split:
content.append(Graphs.draw_text(introduction))
content.append(Graphs.draw_little_title('模型评估:'))
df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8')
# 判断 df 的数值列转为float
for col in eval_df.columns:
if col not in ['模型(Model)']:
eval_df[col] = eval_df[col].astype(float)
eval_df[col] = eval_df[col].round(3)
# 筛选 fivemodels_list.tolist() 的行
eval_df = eval_df[eval_df['模型(Model)'].isin(fivemodels_list)]
# df转置
eval_df = eval_df.T
# df重置索引
eval_df = eval_df.reset_index()
eval_df = eval_df.T
# # 添加表格
data = eval_df.values.tolist()
col_width = 500/len(eval_df.columns)
content.append(Graphs.draw_table(col_width,*data))
content.append(Graphs.draw_text('评估指标释义:'))
content.append(Graphs.draw_text('1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
content.append(Graphs.draw_text('2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
content.append(Graphs.draw_text('3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
content.append(Graphs.draw_text('模型拟合:'))
# 添加图片
content.append(Graphs.draw_img(os.path.join(dataset,'预测值与真实值对比图.png')))
# 附1特征列表
content.append(Graphs.draw_little_title('附1、特征列表'))
df_fuyi = pd.read_csv(os.path.join(dataset,'特征频度统计.csv'),encoding='utf-8')
for col in df_fuyi.columns:
fuyi = df_fuyi[col]
fuyi = fuyi.dropna()
content.append(Graphs.draw_text(f'{col}'))
for i in range(len(fuyi)):
content.append(Graphs.draw_text(f'{i+1}{fuyi[i]}'))
### 生成pdf文件
doc = SimpleDocTemplate(os.path.join(dataset,reportname), pagesize=letter)
# doc = SimpleDocTemplate(os.path.join(dataset,'reportname.pdf'), pagesize=letter)
doc.build(content)
# pdf 上传到数字化信息平台
# 读取pdf并转为base64
try:
if is_update_report:
with open(os.path.join(dataset,reportname), 'rb') as f:
base64_data = base64.b64encode(f.read()).decode('utf-8')
upload_data["data"]["fileBase64"] = base64_data
upload_data["data"]["fileName"] = reportname
token = get_head_auth_report()
upload_report_data(token, upload_data)
except TimeoutError as e:
print(f"请求超时: {e}")
def pp_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inputsize=5,dataset='dataset',time = '2024-07-30',reportname='report.pdf',sqlitedb='jbsh_yuanyou.db'):
global y
# 创建内容对应的空列表
content = list()
# 获取特征的近一月值
import pandas as pd
feature_data_df = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'), parse_dates=['ds']).tail(20)
def draw_feature_trend(feature_data_df, features):
# 画特征近一周的趋势图
feature_df = feature_data_df[['ds','y']+features]
# 遍历X每一列和yy画散点图
for i, col in enumerate(features):
# try:
print(f'正在绘制第{i+1}个特征{col}与价格散点图...')
if col not in ['ds', 'y']:
fig, ax1 = plt.subplots(figsize=(10, 6))
# 在第一个坐标轴上绘制数据
sns.lineplot(data=feature_df, x='ds', y='y', ax=ax1, color='b')
ax1.set_xlabel('日期')
ax1.set_ylabel('y', color='b')
ax1.tick_params('y', colors='b')
# 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠
for j in range(1, len(feature_df), 2):
value = feature_df['y'].iloc[j]
date = feature_df['ds'].iloc[j]
offset = 1.001
ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10)
# 创建第二个坐标轴
ax2 = ax1.twinx()
# 在第二个坐标轴上绘制数据
sns.lineplot(data=feature_df, x='ds', y=col, ax=ax2, color='r')
ax2.set_ylabel(col, color='r')
ax2.tick_params('y', colors='r')
# 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠
for j in range(0, len(feature_df), 2):
value = feature_df[col].iloc[j]
date = feature_df['ds'].iloc[j]
offset = 1.0003
ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10)
# 添加标题
plt.title(col)
# 设置横坐标为日期格式并自动调整
locator = mdates.AutoDateLocator()
formatter = mdates.AutoDateFormatter(locator)
ax1.xaxis.set_major_locator(locator)
ax1.xaxis.set_major_formatter(formatter)
# 文件名特殊字符处理
col = col.replace('*', '-')
col = col.replace(':', '-')
col = col.replace(r'/', '-')
plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png')))
plt.close()
# except Exception as e:
# print(f'绘制第{i+1}个特征{col}与价格散点图时出错:{e}')
### 添加标题
content.append(Graphs.draw_title(f'{y}{time}预测报告'))
### 预测结果
content.append(Graphs.draw_little_title('一、预测结果:'))
# 添加历史走势及预测价格的走势图片
content.append(Graphs.draw_img(os.path.join(dataset,'历史价格-预测值.png')))
# 根据真实值分组,去掉最高最小预测值画图逻辑
content.append(Graphs.draw_text('图示说明:'))
content.append(Graphs.draw_text('1. 将所有模型的预测结果进行分组,大于真实值的为一组,小于真实值的为一组,去掉最高的预测值,去掉最小的预测值'))
content.append(Graphs.draw_text('2. 确定通道上界:在大于真实值的分组中,取最大的预测值'))
content.append(Graphs.draw_text('3. 确定通道下界:在小于真实值的分组中,取第二小的预测值'))
content.append(Graphs.draw_text('4. 预测结果没有真实值作为参考依据时,通道上界取近20个交易日内预测值落在上界的模型对应的预测值,通道下界同理;'))
content.append(Graphs.draw_text('5. 预测结果选用近20个交易日内预测最接近真实值次数最多的模型对应的预测结果;'))
content.append(Graphs.draw_text('6. 预测结果在通道外的,代表最接近真实值的预测结果不在置信波动范围内。'))
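# 下面用一个极简的数值例子说明上述第 1-3 条通道上下界的取法(假设性示例数据,并非上图实际使用的绘图代码):
#   preds = {'NHITS': 76.8, 'TFT': 76.2, 'LSTM': 75.1, 'MLP': 73.9, 'NBEATS': 73.3, 'TSMixer': 72.6}  # 某交易日各模型预测值
#   y_true = 74.5                                               # 当日真实值
#   vals = sorted(preds.values())[1:-1]                         # 第1条:去掉最高、最低的预测值
#   higher = [v for v in vals if v > y_true]                    # 大于真实值的一组
#   lower = [v for v in vals if v < y_true]                     # 小于真实值的一组
#   upper_bound = max(higher) if higher else None               # 第2条:通道上界 -> 76.2
#   lower_bound = sorted(lower)[1] if len(lower) > 1 else None  # 第3条:通道下界(第二小) -> 73.9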
# 波动率画图逻辑
# content.append(Graphs.draw_text('图示说明:'))
# content.append(Graphs.draw_text('1. 确定波动率置信区间统计近60个交易日的真实价格波动率找出在 10% 90% 的分位值作为波动率置信区间;'))
# content.append(Graphs.draw_text('2. 确定通道上界:在所有模型的预测结果中 <= 前一天真实价格 乘以 90%的置信波动分位数'))
# content.append(Graphs.draw_text('3. 确定通道下界:在所有模型的预测结果中 >= 前一天真实价格 乘以 10%的置信波动分位数'))
# content.append(Graphs.draw_text('4. 预测结果没有真实值作为参考依据通道上界取近20个交易日内预测在上界值的模型对应的预测值通道下界同理'))
# content.append(Graphs.draw_text('5. 预测结果选用近20个交易日内最多接近真实值的模型的预测值对应的预测结果'))
# content.append(Graphs.draw_text('6. 预测结果在通道外的,代表最接近真实值的预测结果不在置信波动范围内。'))
# 取df中y列为空的行
import pandas as pd
df = pd.read_csv(os.path.join(dataset,'predict.csv'),encoding='gbk')
df_true = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') # 获取预测日期对应的真实值
df_true = df_true[['ds','y']]
eval_df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8')
# 按评估指标排序,取前五
fivemodels_list = eval_df['模型(Model)'].values[:5] # 前五模型名称列表,后面当作列名索引使用
# 取 fivemodels_list 和 ds 列
df = df[['ds'] + fivemodels_list.tolist() ]
# 拼接预测日期对应的真实值
df = pd.merge(df, df_true, on='ds', how='left')
# 删除全部为nan的列
df = df.dropna(how='all', axis=1)
# 选择除 'ds' 列外的数值列,并进行类型转换和四舍五入
num_cols = [col for col in df.columns if col!= 'ds' and pd.api.types.is_numeric_dtype(df[col])]
for col in num_cols:
df[col] = df[col].astype(float).round(2)
# 添加最大值、最小值、平均值三列
df['平均值'] = df[num_cols].mean(axis=1).round(2)
df['最大值'] = df[num_cols].max(axis=1)
df['最小值'] = df[num_cols].min(axis=1)
# df转置
df = df.T
# df重置索引
df = df.reset_index()
# 添加预测值表格
data = df.values.tolist()
col_width = 500/len(df.columns)
content.append(Graphs.draw_table(col_width,*data))
content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:'))
df = pd.read_csv(os.path.join(dataset,'testandpredict_groupby.csv'),encoding='utf-8')
df4 = df.copy() # 计算偏差率使用
# 计算模型偏差率
#计算各列对于y列的差值百分比
df3 = pd.DataFrame() # 存储偏差率
# 删除有null的行
df4 = df4.dropna()
df3['ds'] = df4['ds']
for col in fivemodels_list:
df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100,2)
# 取评估前五模型最近一个预测周期的偏差率
df3 = df3[['ds']+fivemodels_list.tolist()][-inputsize:]
# 找出上一预测区间的时间
stime = df3['ds'].iloc[0]
etime = df3['ds'].iloc[-1]
# 添加偏差率表格
fivemodels = '、'.join(eval_df['模型(Model)'].values[:5]) # 前五模型名称拼接成字符串,后面写入报告正文使用
content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime} 的偏差率(%)分别是:'))
# # 添加偏差率表格
df3 = df3.T
df3 = df3.reset_index()
data = df3.values.tolist()
col_width = 500/len(df3.columns)
content.append(Graphs.draw_table(col_width,*data))
content.append(Graphs.draw_little_title('三、预测过程解析:'))
### 特征、模型、参数配置
content.append(Graphs.draw_little_title('模型选择:'))
content.append(Graphs.draw_text(f'本次预测使用了专门用于时间序列预测的NeuralForecast库中的{num_models}个模型:'))
content.append(Graphs.draw_text(f'使用40天的数据预测未来{inputsize}天的数据。'))
content.append(Graphs.draw_little_title('指标情况:'))
with open(os.path.join(dataset,'特征频度统计.txt'),encoding='utf-8') as f:
for line in f.readlines():
content.append(Graphs.draw_text(line))
data = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') # 计算相关系数用
df_zhibiaofenlei = loadcsv(os.path.join(dataset,'特征处理后的指标名称及分类.csv')) # 气泡图用
df_zhibiaoshuju = data.copy() # 气泡图用
# 绘制特征相关气泡图
grouped = df_zhibiaofenlei.groupby('指标分类')
grouped_corr = pd.DataFrame(columns=['指标分类', '指标数量', '相关性总和'])
content.append(Graphs.draw_little_title('按指标分类分别与预测目标进行皮尔逊相关系数分析:'))
content.append(Graphs.draw_text('''皮尔逊相关系数说明:'''))
content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。'''))
content.append(Graphs.draw_text('''
相关系数为1,表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。'''))
content.append(Graphs.draw_text('''相关系数为-1,表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的;'''))
content.append(Graphs.draw_text('''相关系数接近0,表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。'''))
for name, group in grouped:
cols = group['指标名称'].tolist()
logger.info(f'开始绘制{name}类指标的相关性直方图')
cols_subset = cols
feature_names = ['y'] + cols_subset
correlation_matrix = df_zhibiaoshuju[feature_names].corr()['y']
# 绘制特征相关性直方分布图
plt.figure(figsize=(10,8))
sns.histplot(correlation_matrix.values.flatten(), bins=20, kde=True, color='skyblue')
plt.title(f'{name}类指标(共{len(cols_subset)}个)相关性直方分布图')
plt.xlabel('相关系数')
plt.ylabel('频数')
plt.savefig(os.path.join(dataset, f'{name}类指标相关性直方分布图.png'), bbox_inches='tight')
plt.close()
content.append(Graphs.draw_img(os.path.join(dataset,f'{name}类指标相关性直方分布图.png')))
content.append(Graphs.draw_text(f'{name}类指标(共{len(cols_subset)}个)的相关性直方分布图如上所示。'))
# 相关性大于0的特征
positive_corr_features = correlation_matrix[correlation_matrix > 0].sort_values(ascending=False).index.tolist()[1:]
print(f'{name}下正相关的特征值有:',positive_corr_features)
if len(positive_corr_features) > 5:
positive_corr_features = positive_corr_features[0:5]
content.append(Graphs.draw_text(f'{name}类指标中与预测目标y正相关前五的特征有{positive_corr_features}'))
draw_feature_trend(feature_data_df, positive_corr_features)
elif len(positive_corr_features) == 0:
pass
else:
content.append(Graphs.draw_text(f'其中与预测目标y正相关的特征有{positive_corr_features}'))
draw_feature_trend(feature_data_df, positive_corr_features)
# 相关性小于0的特征
negative_corr_features = correlation_matrix[correlation_matrix < 0].sort_values(ascending=True).index.tolist()
print(f'{name}下负相关的特征值有:',negative_corr_features)
if len(negative_corr_features) > 5:
negative_corr_features = negative_corr_features[:5]
content.append(Graphs.draw_text(f'与预测目标y负相关前五的特征有{negative_corr_features}'))
draw_feature_trend(feature_data_df, negative_corr_features)
elif len(negative_corr_features) == 0:
pass
else:
content.append(Graphs.draw_text(f'{name}类指标中与预测目标y负相关的特征有{negative_corr_features}'))
draw_feature_trend(feature_data_df, negative_corr_features)
# 计算 correlation_sum:该分类下各特征与 y 的相关系数绝对值之和
correlation_sum = correlation_matrix.abs().sum()
logger.info(f'{name}类指标的相关性总和为:{correlation_sum}')
# 分组的相关性总和拼接到grouped_corr
group_corr = pd.DataFrame({'指标分类': [name], '指标数量': [len(cols_subset)], '相关性总和': [correlation_sum]})
grouped_corr = pd.concat([grouped_corr, group_corr], axis=0, ignore_index=True)
# 绘制相关性总和的气泡图
logger.info(f'开始绘制相关性总和的气泡图')
plt.figure(figsize=(10, 10))
sns.scatterplot(data=grouped_corr, x='相关性总和', y='指标数量', size='相关性总和', sizes=(grouped_corr['相关性总和'].min()*5, grouped_corr['相关性总和'].max()*5), hue='指标分类', palette='viridis')
plt.title('指标分类相关性总和的气泡图')
plt.ylabel('数量')
plt.savefig(os.path.join(dataset, '指标分类相关性总和的气泡图.png'), bbox_inches='tight')
plt.close()
content.append(Graphs.draw_img(os.path.join(dataset,'指标分类相关性总和的气泡图.png')))
content.append(Graphs.draw_text('气泡图中,横轴为指标分类,纵轴为指标分类下的特征数量,气泡的面积越大表示该分类中特征的相关系数和越大。'))
logger.info(f'绘制相关性总和的气泡图结束')
# # 计算特征相关性
# data.rename(columns={y: 'y'}, inplace=True)
# data['ds'] = pd.to_datetime(data['ds'])
# data.drop(columns=['ds'], inplace=True)
# # 创建一个空的 DataFrame 来保存相关系数
# correlation_df = pd.DataFrame(columns=['Feature', 'Correlation'])
# # 计算各特征与目标列的皮尔逊相关系数,并保存到新的 Data 中
# for col in data.columns:
# if col!= 'y':
# pearson_correlation = np.corrcoef(data[col], data['y'])[0, 1]
# spearman_correlation, _ = spearmanr(data[col], data['y'])
# new_row = {'Feature': col, 'Pearson_Correlation': round(pearson_correlation,3), 'Spearman_Correlation': round(spearman_correlation,2)}
# correlation_df = correlation_df._append(new_row, ignore_index=True)
# correlation_df.drop('Correlation', axis=1, inplace=True)
# correlation_df.dropna(inplace=True)
# correlation_df.to_csv(os.path.join(dataset,'指标相关性分析.csv'), index=False)
# data = correlation_df['Pearson_Correlation'].values.tolist()
# # 生成 -1 到 1 的 20 个区间
# bins = np.linspace(-1, 1, 21)
# # 计算每个区间的统计数(这里是区间内数据的数量)
# hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)]
# #设置画布大小
# plt.figure(figsize=(10, 6))
# # 绘制直方图
# plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0]))
# # 添加标题和坐标轴标签
# plt.title('皮尔逊相关系数分布图')
# plt.xlabel('区间')
# plt.ylabel('统计数')
# plt.savefig(os.path.join(dataset, '皮尔逊相关性系数.png'))
# plt.close()
# #设置画布大小
# plt.figure(figsize=(10, 6))
# data = correlation_df['Spearman_Correlation'].values.tolist()
# # 计算每个区间的统计数(这里是区间内数据的数量)
# hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)]
# # 绘制直方图
# plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0]))
# # 添加标题和坐标轴标签
# plt.title('斯皮尔曼相关系数分布图')
# plt.xlabel('区间')
# plt.ylabel('统计数')
# plt.savefig(os.path.join(dataset, '斯皮尔曼相关性系数.png'))
# plt.close()
# content.append(Graphs.draw_text(f'指标相关性分析--皮尔逊相关系数:'))
# # 皮尔逊正相关 不相关 负相关 的表格
# content.append(Graphs.draw_img(os.path.join(dataset,'皮尔逊相关性系数.png')))
# content.append(Graphs.draw_text('''皮尔逊相关系数说明:'''))
# content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。'''))
# content.append(Graphs.draw_text('''
# 相关系数为1表示两个变量之间存在完全正向的线性关系即当一个变量增加时另一个变量也相应增加且变化是完全一致的。'''))
# content.append(Graphs.draw_text('''当前特征中正相关前十的有:'''))
# top10_columns = correlation_df.sort_values(by='Pearson_Correlation',ascending=False).head(10)['Feature'].to_list()
# top10 = ','.join(top10_columns)
# content.append(Graphs.draw_text(f'''{top10}'''))
# feature_df = feature_data_df[['ds','y']+top10_columns]
# # 遍历X每一列和yy画散点图
# for i, col in enumerate(feature_df.columns):
# print(f'正在绘制第{i+1}个特征{col}与价格散点图...')
# if col not in ['ds', 'y']:
# fig, ax1 = plt.subplots(figsize=(10, 6))
# # 在第一个坐标轴上绘制数据
# ax1.plot(feature_df['ds'], feature_df['y'], 'b-')
# ax1.set_xlabel('日期')
# ax1.set_ylabel('y', color='b')
# ax1.tick_params('y', colors='b')
# # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠
# for j in range(1,len(feature_df),2):
# value = feature_df['y'].iloc[j]
# date = feature_df['ds'].iloc[j]
# offset = 1.001
# ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10)
# # 创建第二个坐标轴
# ax2 = ax1.twinx()
# # 在第二个坐标轴上绘制数据
# line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-')
# ax2.set_ylabel(col, color='r')
# ax2.tick_params('y', colors='r')
# # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠
# for j in range(0,len(feature_df),2):
# value = feature_df[col].iloc[j]
# date = feature_df['ds'].iloc[j]
# offset = 1.001
# ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10)
# # 添加标题
# plt.title(col)
# # 设置横坐标为日期格式并自动调整
# locator = mdates.AutoDateLocator()
# formatter = mdates.AutoDateFormatter(locator)
# ax1.xaxis.set_major_locator(locator)
# ax1.xaxis.set_major_formatter(formatter)
# # 文件名特殊字符处理
# col = col.replace('*', '-')
# col = col.replace(':', '-')
# plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
# content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png')))
# plt.close()
# content.append(Graphs.draw_text(f'指标相关性分析--斯皮尔曼相关系数:'))
# # 皮尔逊正相关 不相关 负相关 的表格
# content.append(Graphs.draw_img(os.path.join(dataset,'斯皮尔曼相关性系数.png')))
# content.append(Graphs.draw_text('斯皮尔曼相关系数Spearmans rank correlation coefficient是一种用于衡量两个变量之间的单调关系不一定是线性关系的统计指标。'))
# content.append(Graphs.draw_text('它的计算基于变量的秩次(即变量值的排序位置)而非变量的原始值。'))
# content.append(Graphs.draw_text('斯皮尔曼相关系数的取值范围在 -1 到 1 之间。'))
# content.append(Graphs.draw_text('当系数为 1 时,表示两个变量之间存在完全正的单调关系;'))
# content.append(Graphs.draw_text('''当前特征中正单调关系前十的有:'''))
# top10_columns = correlation_df.sort_values(by='Spearman_Correlation',ascending=False).head(10)['Feature'].to_list()
# top10 = ','.join(top10_columns)
# content.append(Graphs.draw_text(f'''{top10}'''))
# feature_df = feature_data_df[['ds','y']+top10_columns]
# # 遍历X每一列和yy画散点图
# for i, col in enumerate(feature_df.columns):
# print(f'正在绘制第{i+1}个特征{col}与价格散点图...')
# if col not in ['ds', 'y']:
# fig, ax1 = plt.subplots(figsize=(10, 6))
# # 在第一个坐标轴上绘制数据
# ax1.plot(feature_df['ds'], feature_df['y'], 'b-')
# ax1.set_xlabel('日期')
# ax1.set_ylabel('y', color='b')
# ax1.tick_params('y', colors='b')
# # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠
# for j in range(1,len(feature_df),2):
# value = feature_df['y'].iloc[j]
# date = feature_df['ds'].iloc[j]
# offset = 1.001
# ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10)
# # 创建第二个坐标轴
# ax2 = ax1.twinx()
# # 在第二个坐标轴上绘制数据
# line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-')
# ax2.set_ylabel(col, color='r')
# ax2.tick_params('y', colors='r')
# # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠
# for j in range(0,len(feature_df),2):
# value = feature_df[col].iloc[j]
# date = feature_df['ds'].iloc[j]
# offset = 1.001
# ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10)
# # 添加标题
# plt.title(col)
# # 设置横坐标为日期格式并自动调整
# locator = mdates.AutoDateLocator()
# formatter = mdates.AutoDateFormatter(locator)
# ax1.xaxis.set_major_locator(locator)
# ax1.xaxis.set_major_formatter(formatter)
# # 文件名特殊字符处理
# col = col.replace('*', '-')
# col = col.replace(':', '-')
# plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
# content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png')))
# plt.close()
# content.append(Graphs.draw_text('当系数为 -1 时,表示存在完全负的单调关系;'))
# content.append(Graphs.draw_text('''当前特征中负单调关系前十的有:'''))
# tail10_columns = correlation_df.sort_values(by='Spearman_Correlation',ascending=True).head(10)['Feature'].to_list()
# top10 = ','.join(tail10_columns)
# content.append(Graphs.draw_text(f'''{top10}'''))
# # 获取特征的近一周值
# feature_df = feature_data_df[['ds','y']+tail10_columns]
# # 遍历X每一列和yy画散点图
# for i, col in enumerate(feature_df.columns):
# print(f'正在绘制第{i+1}个特征{col}与价格散点图...')
# if col not in ['ds', 'y']:
# fig, ax1 = plt.subplots(figsize=(10, 6))
# # 在第一个坐标轴上绘制数据
# ax1.plot(feature_df['ds'], feature_df['y'], 'b-')
# ax1.set_xlabel('日期')
# ax1.set_ylabel('y', color='b')
# ax1.tick_params('y', colors='b')
# # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠
# for j in range(len(feature_df)):
# if j%2 == 1:
# value = feature_df['y'].iloc[j]
# date = feature_df['ds'].iloc[j]
# offset = 1.001
# ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10)
# # 创建第二个坐标轴
# ax2 = ax1.twinx()
# # 在第二个坐标轴上绘制数据
# line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-')
# ax2.set_ylabel(col, color='r')
# ax2.tick_params('y', colors='r')
# # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠
# for j in range(1,len(feature_df),2):
# value = feature_df[col].iloc[j]
# date = feature_df['ds'].iloc[j]
# offset = 1.001
# ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10)
# # 添加标题
# plt.title(col)
# # 设置横坐标为日期格式并自动调整
# locator = mdates.AutoDateLocator()
# formatter = mdates.AutoDateFormatter(locator)
# ax1.xaxis.set_major_locator(locator)
# ax1.xaxis.set_major_formatter(formatter)
# # 文件名特殊字符处理
# col = col.replace('*', '-')
# col = col.replace(':', '-')
# plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
# content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png')))
# plt.close()
# content.append(Graphs.draw_text('当系数为 0 时,表示两个变量之间不存在单调关系。'))
# content.append(Graphs.draw_text('与皮尔逊相关系数相比,斯皮尔曼相关系数对于数据中的异常值不敏感,更适用于处理非线性关系或存在极端值的数据。'))
content.append(Graphs.draw_little_title('模型选择:'))
content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练拟合,通过评估指标MAE从小到大排列,前5个模型的简介如下:'))
### 读取模型简介
with open(os.path.join(dataset,'model_introduction.txt'), 'r', encoding='utf-8') as f:
for line in f:
line_split = line.strip().split('--')
if line_split[0] in fivemodels_list:
for introduction in line_split:
content.append(Graphs.draw_text(introduction))
content.append(Graphs.draw_little_title('模型评估:'))
eval_df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8')
# 将评估表中除模型名称外的数值列转为 float 并保留三位小数
for col in eval_df.columns:
if col not in ['模型(Model)']:
eval_df[col] = eval_df[col].astype(float)
eval_df[col] = eval_df[col].round(3)
# 筛选 fivemodels_list.tolist() 的行
eval_df = eval_df[eval_df['模型(Model)'].isin(fivemodels_list)]
# df转置
eval_df = eval_df.T
# df重置索引
eval_df = eval_df.reset_index()
eval_df = eval_df.T
# # 添加表格
data = eval_df.values.tolist()
col_width = 500/len(eval_df.columns)
content.append(Graphs.draw_table(col_width,*data))
content.append(Graphs.draw_text('评估指标释义:'))
content.append(Graphs.draw_text('1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
content.append(Graphs.draw_text('2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
content.append(Graphs.draw_text('3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
content.append(Graphs.draw_text('模型拟合:'))
# 添加图片
content.append(Graphs.draw_img(os.path.join(dataset,'预测值与真实值对比图.png')))
# 附1特征列表
content.append(Graphs.draw_little_title('附1、特征列表'))
df_fuyi = pd.read_csv(os.path.join(dataset,'特征频度统计.csv'),encoding='utf-8')
for col in df_fuyi.columns:
fuyi = df_fuyi[col]
fuyi = fuyi.dropna()
content.append(Graphs.draw_text(f'{col}'))
for i in range(len(fuyi)):
content.append(Graphs.draw_text(f'{i+1}、{fuyi[i]}'))
### 生成pdf文件
doc = SimpleDocTemplate(os.path.join(dataset,reportname), pagesize=letter)
# doc = SimpleDocTemplate(os.path.join(dataset,'reportname.pdf'), pagesize=letter)
doc.build(content)
# pdf 上传到数字化信息平台
# 读取pdf并转为base64
try:
if is_update_report:
with open(os.path.join(dataset,reportname), 'rb') as f:
base64_data = base64.b64encode(f.read()).decode('utf-8')
upload_data["data"]["fileBase64"] = base64_data
upload_data["data"]["fileName"] = reportname
token = get_head_auth_report()
upload_report_data(token, upload_data)
except TimeoutError as e:
print(f"请求超时: {e}")
def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inputsize=5,dataset='dataset',time = '2024-07-30',reportname='report.pdf'):
global y
# 创建内容对应的空列表
content = list()
### 添加标题
content.append(Graphs.draw_title(f'{y}{time}预测报告'))
### 预测结果
content.append(Graphs.draw_little_title('一、预测结果:'))
# 添加图片
# 找出后缀是历史价格-预测值.png的图片
# import glob
# imgs = glob.glob(os.path.join(dataset,'*历史价格-预测值.png'))
# for img in imgs:
# content.append(Graphs.draw_img(img))
content.append(Graphs.draw_img(os.path.join(dataset,'历史价格-预测值.png')))
# 取df中y列为空的行
import pandas as pd
df = pd.read_csv(os.path.join(dataset,'predict.csv'),encoding='gbk')
df_true = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') # 获取预测日期对应的真实值
df_true = df_true[['ds','y']]
eval_df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8')
# 按评估指标排序,取前五
fivemodels_list = eval_df['模型(Model)'].values[:5] # 前五模型名称列表,后面当作列名索引使用
# 取 fivemodels_list 和 ds 列
df = df[['ds'] + fivemodels_list.tolist() ]
# 拼接预测日期对应的真实值
df = pd.merge(df, df_true, on='ds', how='left')
# 删除全部为nan的列
df = df.dropna(how='all', axis=1)
# 选择除 'ds' 列外的数值列,并进行类型转换和四舍五入
num_cols = [col for col in df.columns if col!= 'ds' and pd.api.types.is_numeric_dtype(df[col])]
for col in num_cols:
df[col] = df[col].astype(float).round(2)
# 添加最大值、最小值、平均值三列
df['平均值'] = df[num_cols].mean(axis=1).round(2)
df['最大值'] = df[num_cols].max(axis=1)
df['最小值'] = df[num_cols].min(axis=1)
# df转置
df = df.T
# df重置索引
df = df.reset_index()
# 添加预测值表格
data = df.values.tolist()
col_width = 500/len(df.columns)
content.append(Graphs.draw_table(col_width,*data))
content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:'))
df = pd.read_csv(os.path.join(dataset,'testandpredict_groupby.csv'),encoding='utf-8')
df4 = df.copy() # 计算偏差率使用
# 计算模型偏差率
#计算各列对于y列的差值百分比
df3 = pd.DataFrame() # 存储偏差率
# 删除有null的行
df4 = df4.dropna()
df3['ds'] = df4['ds']
for col in df.columns:
if col not in ['y','ds','index']:
df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100,2)
# 取评估前五模型最近一个预测周期的偏差率
df3 = df3[['ds']+fivemodels_list.tolist()][-inputsize:]
# 找出上一预测区间的时间
stime = df3['ds'].iloc[0]
etime = df3['ds'].iloc[-1]
# 添加偏差率表格
fivemodels = '、'.join(eval_df['模型(Model)'].values[:5]) # 前五模型名称拼接成字符串,后面写入报告正文使用
content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime} 的偏差率(%)分别是:'))
# # 添加偏差率表格
df3 = df3.T
df3 = df3.reset_index()
data = df3.values.tolist()
col_width = 500/len(df3.columns)
content.append(Graphs.draw_table(col_width,*data))
content.append(Graphs.draw_little_title('三、预测过程解析:'))
### 特征、模型、参数配置
content.append(Graphs.draw_little_title('模型选择:'))
content.append(Graphs.draw_text(f'本次预测使用了专门用于时间序列预测的NeuralForecast库中的{num_models}个模型:'))
content.append(Graphs.draw_text(f'使用40天的数据预测未来{inputsize}天的数据。'))
content.append(Graphs.draw_little_title('指标情况:'))
with open(os.path.join(dataset,'特征频度统计.txt'),encoding='utf-8') as f:
for line in f.readlines():
content.append(Graphs.draw_text(line))
### 特征工程
# 计算特征相关性
# 读取数据
from scipy.stats import spearmanr
data = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8')
# 重命名预测列
data.rename(columns={y: 'y'}, inplace=True) # 修改
data['ds'] = pd.to_datetime(data['ds']) # 修改
# 去掉ds列
data.drop(columns=['ds'], inplace=True)
# 创建一个空的 DataFrame 来保存相关系数
correlation_df = pd.DataFrame(columns=['Feature', 'Correlation'])
# 计算各特征与目标列的皮尔逊相关系数,并保存到新的 DataFrame 中
for col in data.columns:
if col!= 'y':
pearson_correlation = np.corrcoef(data[col], data['y'])[0, 1]
spearman_correlation, _ = spearmanr(data[col], data['y'])
new_row = {'Feature': col, 'Pearson_Correlation': round(pearson_correlation,3), 'Spearman_Correlation': round(spearman_correlation,2)}
correlation_df = correlation_df._append(new_row, ignore_index=True)
# 删除空列
correlation_df.drop('Correlation', axis=1, inplace=True)
correlation_df.dropna(inplace=True)
correlation_df.to_csv(os.path.join(dataset,'指标相关性分析.csv'), index=False)
data = correlation_df['Pearson_Correlation'].values.tolist()
# 生成 -1 到 1 的 20 个区间
bins = np.linspace(-1, 1, 21)
# 计算每个区间的统计数(这里是区间内数据的数量)
hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)]
#设置画布大小
plt.figure(figsize=(10, 6))
# 绘制直方图
plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0]))
# 添加标题和坐标轴标签
plt.title('皮尔逊相关系数分布图')
plt.xlabel('区间')
plt.ylabel('统计数')
plt.savefig(os.path.join(dataset, '皮尔逊相关性系数.png'))
plt.close()
#设置画布大小
plt.figure(figsize=(10, 6))
data = correlation_df['Spearman_Correlation'].values.tolist()
# 计算每个区间的统计数(这里是区间内数据的数量)
hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)]
# 绘制直方图
plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0]))
# 添加标题和坐标轴标签
plt.title('斯皮尔曼相关系数分布图')
plt.xlabel('区间')
plt.ylabel('统计数')
plt.savefig(os.path.join(dataset, '斯皮尔曼相关性系数.png'))
plt.close()
content.append(Graphs.draw_text(f'指标相关性分析--皮尔逊相关系数:'))
# 皮尔逊正相关 不相关 负相关 的表格
content.append(Graphs.draw_img(os.path.join(dataset,'皮尔逊相关性系数.png')))
content.append(Graphs.draw_text('''皮尔逊相关系数说明:'''))
content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。'''))
content.append(Graphs.draw_text('''
相关系数为1,表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。'''))
content.append(Graphs.draw_text('''当前特征中正相关前十的有:'''))
top10_columns = correlation_df.sort_values(by='Pearson_Correlation',ascending=False).head(10)['Feature'].to_list()
top10 = ','.join(top10_columns)
content.append(Graphs.draw_text(f'''{top10}'''))
# 获取特征的近一月值
feature_data_df = pd.read_csv(os.path.join(dataset,'填充后的特征数据.csv'), parse_dates=['ds']).tail(20)
feature_df = feature_data_df[['ds','y']+top10_columns]
# feature_df['ds'] = pd.to_datetime(df['ds'], format = '%Y-%m-%d' )
# 遍历X每一列和yy画散点图
for i, col in enumerate(feature_df.columns):
print(f'正在绘制第{i+1}个特征{col}与价格散点图...')
if col not in ['ds', 'y']:
fig, ax1 = plt.subplots(figsize=(10, 6))
# 在第一个坐标轴上绘制数据
ax1.plot(feature_df['ds'], feature_df['y'], 'b-')
ax1.set_xlabel('日期')
ax1.set_ylabel('y', color='b')
ax1.tick_params('y', colors='b')
# 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠
for j in range(1,len(feature_df),2):
value = feature_df['y'].iloc[j]
date = feature_df['ds'].iloc[j]
offset = 1.001
ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10)
# 创建第二个坐标轴
ax2 = ax1.twinx()
# 在第二个坐标轴上绘制数据
line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-')
ax2.set_ylabel(col, color='r')
ax2.tick_params('y', colors='r')
# 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠
for j in range(0,len(feature_df),2):
value = feature_df[col].iloc[j]
date = feature_df['ds'].iloc[j]
offset = 1.001
ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10)
# 添加标题
plt.title(col)
# 设置横坐标为日期格式并自动调整
locator = mdates.AutoDateLocator()
formatter = mdates.AutoDateFormatter(locator)
ax1.xaxis.set_major_locator(locator)
ax1.xaxis.set_major_formatter(formatter)
# 文件名特殊字符处理
col = col.replace('*', '-')
col = col.replace(':', '-')
plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png')))
plt.close()
content.append(Graphs.draw_text('''相关系数为-1,表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的;'''))
content.append(Graphs.draw_text('''当前特征中负相关前十的有:'''))
tail10_columns = correlation_df.sort_values(by='Pearson_Correlation',ascending=True).head(10)['Feature'].to_list()
top10 = ','.join(tail10_columns)
content.append(Graphs.draw_text(f'''{top10}'''))
# 获取特征的近一月值(最近20个交易日)
feature_df = feature_data_df[['ds','y']+tail10_columns]
# 遍历X每一列和yy画散点图
for i, col in enumerate(feature_df.columns):
print(f'正在绘制第{i+1}个特征{col}与价格散点图...')
if col not in ['ds', 'y']:
fig, ax1 = plt.subplots(figsize=(10, 6))
# 在第一个坐标轴上绘制数据
ax1.plot(feature_df['ds'], feature_df['y'], 'b-')
ax1.set_xlabel('日期')
ax1.set_ylabel('y', color='b')
ax1.tick_params('y', colors='b')
# 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠
for j in range(len(feature_df)):
if j%2 == 1:
value = feature_df['y'].iloc[j]
date = feature_df['ds'].iloc[j]
offset = 1.001
ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10)
# 创建第二个坐标轴
ax2 = ax1.twinx()
# 在第二个坐标轴上绘制数据
line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-')
ax2.set_ylabel(col, color='r')
ax2.tick_params('y', colors='r')
# 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠
for j in range(1,len(feature_df),2):
value = feature_df[col].iloc[j]
date = feature_df['ds'].iloc[j]
offset = 1.001
ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10)
# 添加标题
plt.title(col)
# 设置横坐标为日期格式并自动调整
locator = mdates.AutoDateLocator()
formatter = mdates.AutoDateFormatter(locator)
ax1.xaxis.set_major_locator(locator)
ax1.xaxis.set_major_formatter(formatter)
# 文件名特殊字符处理
col = col.replace('*', '-')
col = col.replace(':', '-')
plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png'))
content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png')))
plt.close()
content.append(Graphs.draw_text('''相关系数接近0,表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。'''))
content.append(Graphs.draw_text(f'指标相关性分析--斯皮尔曼相关系数:'))
# 皮尔逊正相关 不相关 负相关 的表格
content.append(Graphs.draw_img(os.path.join(dataset,'斯皮尔曼相关性系数.png')))
content.append(Graphs.draw_text('斯皮尔曼相关系数(Spearman’s rank correlation coefficient)是一种用于衡量两个变量之间的单调关系(不一定是线性关系)的统计指标。'))
content.append(Graphs.draw_text('它的计算基于变量的秩次(即变量值的排序位置)而非变量的原始值。'))
content.append(Graphs.draw_text('斯皮尔曼相关系数的取值范围在 -1 到 1 之间。'))
content.append(Graphs.draw_text('当系数为 1 时,表示两个变量之间存在完全正的单调关系;'))
content.append(Graphs.draw_text('''当前特征中正单调关系前十的有:'''))
top10 = ','.join(correlation_df.sort_values(by='Spearman_Correlation',ascending=False).head(10)['Feature'])
content.append(Graphs.draw_text(f'''{top10}'''))
content.append(Graphs.draw_text('当系数为 -1 时,表示存在完全负的单调关系;'))
content.append(Graphs.draw_text('''当前特征中负单调关系前十的有:'''))
top10 = ','.join(correlation_df.sort_values(by='Spearman_Correlation',ascending=True).head(10)['Feature'])
content.append(Graphs.draw_text(f'''{top10}'''))
content.append(Graphs.draw_text('当系数为 0 时,表示两个变量之间不存在单调关系。'))
content.append(Graphs.draw_text('与皮尔逊相关系数相比,斯皮尔曼相关系数对于数据中的异常值不敏感,更适用于处理非线性关系或存在极端值的数据。'))
content.append(Graphs.draw_little_title('模型选择:'))
content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练拟合,通过评估指标MAE从小到大排列,前5个模型的简介如下:'))
### 读取模型简介
with open(os.path.join(dataset,'model_introduction.txt'), 'r', encoding='utf-8') as f:
for line in f:
line_split = line.strip().split('--')
if line_split[0] in fivemodels_list:
for introduction in line_split:
content.append(Graphs.draw_text(introduction))
content.append(Graphs.draw_little_title('模型评估:'))
eval_df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8')
# 将评估表中除模型名称外的数值列转为 float 并保留三位小数
for col in eval_df.columns:
if col not in ['模型(Model)']:
eval_df[col] = eval_df[col].astype(float)
eval_df[col] = eval_df[col].round(3)
# 筛选 fivemodels_list.tolist() 的行
eval_df = eval_df[eval_df['模型(Model)'].isin(fivemodels_list)]
# df转置
eval_df = eval_df.T
# df重置索引
eval_df = eval_df.reset_index()
eval_df = eval_df.T
# # 添加表格
data = eval_df.values.tolist()
col_width = 500/len(eval_df.columns)
content.append(Graphs.draw_table(col_width,*data))
content.append(Graphs.draw_text('评估指标释义:'))
content.append(Graphs.draw_text('1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
content.append(Graphs.draw_text('2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
content.append(Graphs.draw_text('3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
content.append(Graphs.draw_text('模型拟合:'))
# 添加图片
content.append(Graphs.draw_img(os.path.join(dataset,'预测值与真实值对比图.png')))
# 附1特征列表
content.append(Graphs.draw_little_title('附1、特征列表'))
df_fuyi = pd.read_csv(os.path.join(dataset,'特征频度统计.csv'),encoding='utf-8')
for col in df_fuyi.columns:
fuyi = df_fuyi[col]
fuyi = fuyi.dropna()
content.append(Graphs.draw_text(f'{col}'))
for i in range(len(fuyi)):
content.append(Graphs.draw_text(f'{i+1}、{fuyi[i]}'))
### 生成pdf文件
doc = SimpleDocTemplate(os.path.join(dataset,reportname), pagesize=letter)
# doc = SimpleDocTemplate(os.path.join(dataset,'reportname.pdf'), pagesize=letter)
doc.build(content)
# pdf 上传到数字化信息平台
# 读取pdf并转为base64
try:
if is_update_report:
with open(os.path.join(dataset,reportname), 'rb') as f:
base64_data = base64.b64encode(f.read()).decode('utf-8')
upload_data["data"]["fileBase64"] = base64_data
upload_data["data"]["fileName"] = reportname
token = get_head_auth_report()
upload_report_data(token, upload_data)
except TimeoutError as e:
print(f"请求超时: {e}")
def tansuanli_export_pdf(num_indicators=475,num_models=22, num_dayindicator=202,inputsize=5,dataset='dataset',y='电碳价格',end_time='2024-07-30',reportname='tansuanli.pdf'):
# 创建内容对应的空列表
content = list()
### 添加标题
content.append(Graphs.draw_title(f'{y}{end_time}预测报告'))
### 预测结果
content.append(Graphs.draw_little_title('一、预测结果:'))
content.append(Graphs.draw_img(os.path.join(dataset,'历史价格-预测值.png')))
# 取df中y列为空的行
from lib.dataread import loadcsv
df = loadcsv(os.path.join(dataset,'predict.csv'))
df_true = loadcsv(os.path.join(dataset,'指标数据添加时间特征.csv')) # 获取预测日期对应的真实值
df_true = df_true[['ds','y']]
eval_df = loadcsv(os.path.join(dataset,'model_evaluation.csv'))
# 按评估指标排序,取前五
fivemodels_list = eval_df['模型(Model)'].values[:5] # 列表形式,后面当作列名索引使用
# 取 fivemodels_list 和 ds 列
df = df[['ds'] + fivemodels_list.tolist() ]
# 拼接预测日期对应的真实值
df = pd.merge(df, df_true, on='ds', how='left')
# 删除全部为nan的列
df = df.dropna(how='all', axis=1)
# 选择除 'ds' 列外的数值列,并进行类型转换和四舍五入
num_cols = [col for col in df.columns if col!= 'ds' and pd.api.types.is_numeric_dtype(df[col])]
for col in num_cols:
df[col] = df[col].astype(float).round(2)
# 添加预测每日的最大值、最小值、平均值三列
df['平均值'] = df[num_cols].mean(axis=1).round(2)
df['最大值'] = df[num_cols].max(axis=1)
df['最小值'] = df[num_cols].min(axis=1)
# 添加模型预测周期内的最大值、最小值、平均值三行
# 计算列的统计值
mean_values = df[num_cols].mean(axis=0).round(2)
max_values = df[num_cols].max(axis=0)
min_values = df[num_cols].min(axis=0)
# 创建一个新的 DataFrame 来存储统计行
stats_row = pd.DataFrame([mean_values, max_values, min_values], index=[0,1,2])
stats_row['ds'] = ['平均值', '最大值', '最小值']
# 将统计行添加到原始 DataFrame
df = pd.concat([df, stats_row], axis=0)
# df替换nan 为 '--'
df = df.fillna('--')
# df转置
df = df.T
# df重置索引
df = df.reset_index()
# 添加预测值表格
data = df.values.tolist()
col_width = 500/len(df.columns)
content.append(Graphs.draw_table(col_width,*data))
content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:'))
df = loadcsv(os.path.join(dataset,'testandpredict_groupby.csv'))
df4 = df.copy() # 计算偏差率使用
# 计算模型偏差率
#计算各列对于y列的差值百分比
df3 = pd.DataFrame() # 存储偏差率
# 删除有null的行
df4 = df4.dropna()
df3['ds'] = df4['ds']
for col in df.columns:
if col not in ['y','ds','index']:
df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100,2)
# 取评估前五模型最近一个预测周期的偏差率
df3 = df3[['ds']+fivemodels_list.tolist()][-inputsize:]
# 找出上一预测区间的时间
stime = df3['ds'].iloc[0]
etime = df3['ds'].iloc[-1]
# 添加偏差率表格
fivemodels = '、'.join(eval_df['模型(Model)'].values[:5]) # 前五模型名称拼接成字符串,后面写入报告正文使用
content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime} 的偏差率(%)分别是:'))
# # 添加偏差率表格
df3 = df3.T
df3 = df3.reset_index()
df3 = df3.T
data = df3.values.tolist()
col_width = 500/len(df3.columns)
content.append(Graphs.draw_table(col_width,*data))
content.append(Graphs.draw_little_title('三、预测过程解析:'))
### 特征、模型、参数配置
content.append(Graphs.draw_text(f'本次预测使用了给定的28个指标(列名重复的排除后)作为特征,应用了专门用于时间序列预测的NeuralForecast库中的{num_models}个模型。'))
content.append(Graphs.draw_text(f'使用10天的数据预测未来{inputsize}天的数据。'))
content.append(Graphs.draw_little_title('指标情况:'))
content.append(Graphs.draw_text(' 指标频度包括'))
# 添加频度统计表格
pindu_df = loadcsv(os.path.join(dataset,'特征频度统计.csv'))
pindu_df.fillna('-', inplace=True)
pindu_df = pindu_df.T
pindu_df = pindu_df.reset_index()
pindu_df = pindu_df.T
data = pindu_df.values.tolist()
col_width = 500/len(pindu_df.columns)
content.append(Graphs.draw_table(col_width,*data))
content.append(Graphs.draw_text(f'从指标特征的频度信息来看,月度指标占比最高,而我们需要进行预测的指标为日度的,所以本数据集中月度和周度指标需要进行插值处理。'))
content.append(Graphs.draw_text(' 数据特征工程:'))
content.append(Graphs.draw_text('1. 数据日期排序,新日期在最后'))
content.append(Graphs.draw_text('2. 删除空列,特征数据列没有值,就删除'))
content.append(Graphs.draw_text('3. 周度、月度特征填充为日度数据,填充规则:'))
content.append(Graphs.draw_text(' -- 向后填充,举例:假设周五出现一个周度指标数据,那么在这之前的数据用上周五的数据'))
content.append(Graphs.draw_text(' -- 向前填充,举例:采集数据开始日期为2018年1月1日,而第一个周度数据可能出现在2018年1月3日,将3日的数据向前填充,使1日、2日都有数值'))
content.append(Graphs.draw_text(f'以上处理其实并不合理,但结合我们想要的结果,我们选择了这种处理方式。'))
content.append(Graphs.draw_text(f'一般来讲,指标数据的频度和预测列是一致的,我们可以考虑预测月度的目标列,不过这样的话,月度数据太少了,不足以用来训练模型。'))
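# 下面用一个小例子演示上述两种填充方向(假设性草稿,实际的填充逻辑不在本函数内):
#   weekly = pd.Series({pd.Timestamp('2018-01-03'): 10.0, pd.Timestamp('2018-01-10'): 12.0})  # 两期周度数据
#   daily_index = pd.date_range('2018-01-01', '2018-01-10', freq='D')
#   filled = weekly.reindex(daily_index).ffill().bfill()
#   # ffill(向后填充):发布日之后的日期沿用最近一期数据;bfill(向前填充):1月1日、2日用1月3日的首期数据回填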
### 特征工程
# 预测列分析
content.append(Graphs.draw_text(' 电碳价格自相关ACF和偏自相关PACF分析'))
content.append(Graphs.draw_img(os.path.join(dataset,'指标数据自相关图.png')))
content.append(Graphs.draw_img(os.path.join(dataset,'指标数据偏自相关图.png')))
content.append(Graphs.draw_text(' 解读:'))
content.append(Graphs.draw_text(' 自相关函数的取值范围为 [-1, 1]。正值表示信号在不同时间点之间具有正相关性,负值表示信号具有负相关性,而 0 表示信号在不同时间点之间不相关。 '))
content.append(Graphs.draw_text(' 偏自相关函数PACF则是在控制了中间的滞后项影响后特定滞后项与当前项的相关性。 '))
content.append(Graphs.draw_text(' 当前目标列表现出的 ACF 呈现出拖尾的特征,而 PACF 在1个滞后阶数后截尾,这说明目标值适合使用自回归(AR)模型 '))
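# 一种常见的 ACF/PACF 画法示意(假设性草稿,上面两张图的实际生成代码不在本函数内):
#   from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
#   series = loadcsv(os.path.join(dataset, '指标数据添加时间特征.csv'))[y].dropna()  # 假设以预测目标列为序列
#   plot_acf(series, lags=30)     # 自相关图:观察是否拖尾
#   plot_pacf(series, lags=30)    # 偏自相关图:观察在几阶后截尾
#   plt.savefig(os.path.join(dataset, '指标数据自相关图-示例.png'))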
content.append(Graphs.draw_text(' 数据特征可视化分析:'))
# 找出所有后缀为散点图.png的文件
import glob
scatter_files = glob.glob(os.path.join(dataset,'*散点图.png'))
for file in scatter_files:
content.append(Graphs.draw_img(file))
content.append(Graphs.draw_text(' 解读:'))
content.append(Graphs.draw_text(' 观察特征与目标列的散点图,我们可以直观的感受到特征与我们要预测的列没有明显的趋势相关,需要考虑选取的特征合理。 '))
content.append(Graphs.draw_text(' 数据特征相关性分析:'))
# 计算特征相关性
# 读取数据
from scipy.stats import spearmanr
data = loadcsv(os.path.join(dataset,'指标数据添加时间特征.csv'))
# 重命名预测列
data.rename(columns={y: 'y'}, inplace=True) # 修改
from lib.tools import dateConvert
data = dateConvert(data) # 修改
# 去掉ds列
data.drop(columns=['ds'], inplace=True)
# 创建一个空的 DataFrame 来保存相关系数
correlation_df = pd.DataFrame(columns=['Feature', 'Correlation'])
# 计算各特征与目标列的皮尔逊相关系数,并保存到新的 DataFrame 中
for col in data.columns:
if col!= 'y':
pearson_correlation = np.corrcoef(data[col], data['y'])[0, 1]
spearman_correlation, _ = spearmanr(data[col], data['y'])
new_row = {'Feature': col, 'Pearson_Correlation': round(pearson_correlation,3), 'Spearman_Correlation': round(spearman_correlation,2)}
correlation_df = correlation_df._append(new_row, ignore_index=True)
# 删除空列
correlation_df.drop('Correlation', axis=1, inplace=True)
correlation_df.dropna(inplace=True)
correlation_df.to_csv(os.path.join(dataset,'指标相关性分析.csv'), index=False)
data = correlation_df['Pearson_Correlation'].values.tolist()
# 生成 -1 到 1 的 20 个区间
bins = np.linspace(-1, 1, 21)
# 计算每个区间的统计数(这里是区间内数据的数量)
hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)]
#设置画布大小
plt.figure(figsize=(10, 6))
# 绘制直方图
plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0]))
# 添加标题和坐标轴标签
plt.title('皮尔逊相关系数分布图')
plt.xlabel('区间')
plt.ylabel('统计数')
plt.savefig(os.path.join(dataset, '皮尔逊相关性系数.png'))
plt.close()
#设置画布大小
plt.figure(figsize=(10, 6))
data = correlation_df['Spearman_Correlation'].values.tolist()
# 计算每个区间的统计数(这里是区间内数据的数量)
hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)]
# 绘制直方图
plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0]))
# 添加标题和坐标轴标签
plt.title('斯皮尔曼相关系数分布图')
plt.xlabel('区间')
plt.ylabel('统计数')
plt.savefig(os.path.join(dataset, '斯皮尔曼相关性系数.png'))
plt.close()
content.append(Graphs.draw_text(f'指标相关性分析--皮尔逊相关系数:'))
# 皮尔逊正相关 不相关 负相关 的表格
content.append(Graphs.draw_img(os.path.join(dataset,'皮尔逊相关性系数.png')))
content.append(Graphs.draw_text('''皮尔逊相关系数说明:'''))
content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。'''))
content.append(Graphs.draw_text('''
相关系数为1,表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。'''))
content.append(Graphs.draw_text('''当前特征中正相关前十的有:'''))
top10 = ','.join(correlation_df.sort_values(by='Pearson_Correlation',ascending=False).head(10)['Feature'])
content.append(Graphs.draw_text(f'''{top10}'''))
content.append(Graphs.draw_text('''相关系数为-1,表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的;'''))
content.append(Graphs.draw_text('''当前特征中负相关前十的有:'''))
top10 = ','.join(correlation_df.sort_values(by='Pearson_Correlation',ascending=True).head(10)['Feature'])
content.append(Graphs.draw_text(f'''{top10}'''))
content.append(Graphs.draw_text('''相关系数接近0,表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。'''))
content.append(Graphs.draw_text(f'指标相关性分析--斯皮尔曼相关系数:'))
# 皮尔逊正相关 不相关 负相关 的表格
content.append(Graphs.draw_img(os.path.join(dataset,'斯皮尔曼相关性系数.png')))
content.append(Graphs.draw_text('斯皮尔曼相关系数(Spearman’s rank correlation coefficient)是一种用于衡量两个变量之间的单调关系(不一定是线性关系)的统计指标。'))
content.append(Graphs.draw_text('它的计算基于变量的秩次(即变量值的排序位置)而非变量的原始值。'))
content.append(Graphs.draw_text('斯皮尔曼相关系数的取值范围在 -1 到 1 之间。'))
content.append(Graphs.draw_text('当系数为 1 时,表示两个变量之间存在完全正的单调关系;'))
content.append(Graphs.draw_text('''当前特征中正单调关系前十的有:'''))
top10 = ','.join(correlation_df.sort_values(by='Spearman_Correlation',ascending=False).head(10)['Feature'])
content.append(Graphs.draw_text(f'''{top10}'''))
content.append(Graphs.draw_text('当系数为 -1 时,表示存在完全负的单调关系;'))
content.append(Graphs.draw_text('''当前特征中负单调关系前十的有:'''))
top10 = ','.join(correlation_df.sort_values(by='Spearman_Correlation',ascending=True).head(10)['Feature'])
content.append(Graphs.draw_text(f'''{top10}'''))
content.append(Graphs.draw_text('当系数为 0 时,表示两个变量之间不存在单调关系。'))
content.append(Graphs.draw_text('与皮尔逊相关系数相比,斯皮尔曼相关系数对于数据中的异常值不敏感,更适用于处理非线性关系或存在极端值的数据。'))
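# 一个直观的小例子说明两种系数的差别(假设性草稿,正式计算见上文 correlation_df 部分):
#   x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
#   y_demo = x ** 3                          # 单调但非线性的关系
#   pearson = np.corrcoef(x, y_demo)[0, 1]   # 皮尔逊系数约 0.94,小于 1(只衡量线性关系)
#   spearman, _ = spearmanr(x, y_demo)       # 斯皮尔曼系数等于 1(只看单调秩次)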
content.append(Graphs.draw_little_title('模型选择:'))
content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练拟合,模型的简介如下:'))
### 读取模型简介
with open(os.path.join(dataset,'model_introduction.txt'), 'r', encoding='utf-8') as f:
for line in f:
line_split = line.strip().split('--')
# if line_split[0] in fivemodels_list:
for introduction in line_split:
content.append(Graphs.draw_text(introduction))
content.append(Graphs.draw_little_title('模型评估:'))
content.append(Graphs.draw_text('通过评估指标MAE从小到大排列,前5个模型的评估详情如下:'))
eval_df = loadcsv(os.path.join(dataset,'model_evaluation.csv'))
# 将评估表中除模型名称外的数值列转为 float 并保留三位小数
for col in eval_df.columns:
if col not in ['模型(Model)']:
eval_df[col] = eval_df[col].astype(float)
eval_df[col] = eval_df[col].round(3)
# 筛选 fivemodels_list.tolist() 的行
eval_df = eval_df[eval_df['模型(Model)'].isin(fivemodels_list)]
# df转置
eval_df = eval_df.T
# df重置索引
eval_df = eval_df.reset_index()
eval_df = eval_df.T
# # 添加表格
data = eval_df.values.tolist()
col_width = 500/len(eval_df.columns)
content.append(Graphs.draw_table(col_width,*data))
content.append(Graphs.draw_text('评估指标释义:'))
content.append(Graphs.draw_text('1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,先计算预测值与真实值的差值的平方,然后对这些平方差求平均值,最后取平均值的平方根。取值越小,误差越小,预测效果越好。'))
content.append(Graphs.draw_text('2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,对预测值与真实值之间差值的绝对值进行求和,然后除以样本数量。取值越小,误差越小,预测效果越好。'))
content.append(Graphs.draw_text('3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,先计算预测值与真实值之差的平方,然后对这些平方差求平均值。取值越小,误差越小,预测效果越好。'))
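# 用一个极小的数值例子说明三个指标的计算口径(假设性草稿;模块顶部已从 lib.tools 导入 mse/rmse/mae,此处仅手工演示):
#   y_true = np.array([70.0, 72.0, 71.0])
#   y_pred = np.array([69.0, 73.0, 71.5])
#   mae_v = np.mean(np.abs(y_pred - y_true))        # MAE = (1 + 1 + 0.5) / 3 ≈ 0.83
#   mse_v = np.mean((y_pred - y_true) ** 2)         # MSE = (1 + 1 + 0.25) / 3 = 0.75
#   rmse_v = np.sqrt(mse_v)                         # RMSE = sqrt(0.75) ≈ 0.87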
content.append(Graphs.draw_text('模型拟合:'))
# 添加图片
content.append(Graphs.draw_img(os.path.join(dataset,'预测值与真实值对比图.png')))
### 生成pdf文件
doc = SimpleDocTemplate(os.path.join(dataset,reportname), pagesize=letter)
doc.build(content)
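# 补充说明:示意性的辅助函数草稿(假设性示例,上面的函数仍各自内联处理上传报文的组装)。
# 作用是把生成的 pdf 读成 base64,写入上传报文中已有的 data.fileBase64 / data.fileName 字段。
def _fill_report_payload(dataset_dir, reportname, payload):
    '''示例:读取 pdf 文件,填充上传报文 payload(就地修改并返回)。'''
    import base64  # 局部导入,避免依赖模块级导入
    with open(os.path.join(dataset_dir, reportname), 'rb') as f:
        payload["data"]["fileBase64"] = base64.b64encode(f.read()).decode('utf-8')
    payload["data"]["fileName"] = reportname
    return payload
# 用法示意: upload_report_data(get_head_auth_report(), _fill_report_payload(dataset, reportname, upload_data))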