diff --git a/config_jingbo_zhoudu.py b/config_jingbo_zhoudu.py index b5750ec..ec55b8a 100644 --- a/config_jingbo_zhoudu.py +++ b/config_jingbo_zhoudu.py @@ -2,7 +2,7 @@ import logging import os import logging.handlers import datetime -from lib.tools import MySQLDB,SQLiteHandler +from lib.tools import MySQLDB, SQLiteHandler # eta 接口token @@ -10,66 +10,65 @@ APPID = "XNLDvxZHHugj7wJ7" SECRET = "iSeU4s6cKKBVbt94htVY1p0sqUMqb2xa" # eta 接口url -sourcelisturl = 'http://10.189.2.78:8108/v1/edb/source/list' +sourcelisturl = 'http://10.189.2.78:8108/v1/edb/source/list' classifylisturl = 'http://10.189.2.78:8108/v1/edb/classify/list?ClassifyType=' uniquecodedataurl = 'http://10.189.2.78:8108/v1/edb/data?UniqueCode=4991c37becba464609b409909fe4d992&StartDate=2024-02-01' classifyidlisturl = 'http://10.189.2.78:8108/v1/edb/list?ClassifyId=' edbcodedataurl = 'http://10.189.2.78:8108/v1/edb/data?EdbCode=' -edbdatapushurl = 'http://10.189.2.78:8108/v1/edb/push' -edbdeleteurl = 'http://10.189.2.78:8108/v1/edb/business/edb/del' -edbbusinessurl = 'http://10.189.2.78:8108/v1/edb/business/data/del' +edbdatapushurl = 'http://10.189.2.78:8108/v1/edb/push' +edbdeleteurl = 'http://10.189.2.78:8108/v1/edb/business/edb/del' +edbbusinessurl = 'http://10.189.2.78:8108/v1/edb/business/data/del' edbcodelist = ['CO1 Comdty', 'ovx index', 'C2404194834', 'C2404199738', 'dxy curncy', 'C2403128043', 'C2403150124', - 'DOESCRUD Index', 'WTRBM1 EEGC Index', 'FVHCM1 INDEX', 'doedtprd index', 'CFFDQMMN INDEX', - 'C2403083739', 'C2404167878', 'C2403250571', 'lmcads03 lme comdty', 'GC1 COMB Comdty', - 'C2404171822','C2404167855', - # 'W000825','W000826','G.IPE', # 美国汽柴油 - # 'S5131019','ID00135604','FSGAM1 Index','S5120408','ID00136724', # 新加坡汽柴油 - ] + 'DOESCRUD Index', 'WTRBM1 EEGC Index', 'FVHCM1 INDEX', 'doedtprd index', 'CFFDQMMN INDEX', + 'C2403083739', 'C2404167878', 'C2403250571', 'lmcads03 lme comdty', 'GC1 COMB Comdty', + 'C2404171822', 'C2404167855', + # 'W000825','W000826','G.IPE', # 美国汽柴油 + # 'S5131019','ID00135604','FSGAM1 Index','S5120408','ID00136724', # 新加坡汽柴油 + ] # 临时写死用指定的列,与上面的edbcode对应,后面更改 edbnamelist = [ - 'ds','y', - 'Brent c1-c6','Brent c1-c3','Brent-WTI','美国商业原油库存', - 'DFL','美国汽油裂解价差','ovx index','dxy curncy','lmcads03 lme comdty', - 'C2403128043','C2403150124','FVHCM1 INDEX','doedtprd index','CFFDQMMN INDEX', - 'C2403083739','C2404167878', - 'GC1 COMB Comdty','C2404167855', + 'ds', 'y', + 'Brent c1-c6', 'Brent c1-c3', 'Brent-WTI', '美国商业原油库存', + 'DFL', '美国汽油裂解价差', 'ovx index', 'dxy curncy', 'lmcads03 lme comdty', + 'C2403128043', 'C2403150124', 'FVHCM1 INDEX', 'doedtprd index', 'CFFDQMMN INDEX', + 'C2403083739', 'C2404167878', + 'GC1 COMB Comdty', 'C2404167855', # 'A汽油价格','W000826','ICE柴油价格', # '新加坡(含硫0.05%) 柴油现货价','柴油:10ppm:国际市场:FOB中间价:新加坡(日)','Bloomberg Commodity Fair Value Singapore Mogas 92 Swap Month 1','97#汽油FOB新加坡现货价','无铅汽油:97#:国际市场:FOB中间价:新加坡(日)' - ] - +] # eta自有数据指标编码 modelsindex = { - 'NHITS': 'SELF0000001', - 'Informer':'SELF0000057', - 'LSTM':'SELF0000058', - 'iTransformer':'SELF0000059', - 'TSMixer':'SELF0000060', - 'TSMixerx':'SELF0000061', - 'PatchTST':'SELF0000062', - 'RNN':'SELF0000063', - 'GRU':'SELF0000064', - 'TCN':'SELF0000065', - 'BiTCN':'SELF0000066', - 'DilatedRNN':'SELF0000067', - 'MLP':'SELF0000068', - 'DLinear':'SELF0000069', - 'NLinear':'SELF0000070', - 'TFT':'SELF0000071', - 'FEDformer':'SELF0000072', - 'StemGNN':'SELF0000073', - 'MLPMultivariate':'SELF0000074', - 'TiDE':'SELF0000075', - 'DeepNPTS':'SELF0000076' - } + 'NHITS': 'SELF0000001', + 'Informer': 'SELF0000057', + 'LSTM': 
'SELF0000058', + 'iTransformer': 'SELF0000059', + 'TSMixer': 'SELF0000060', + 'TSMixerx': 'SELF0000061', + 'PatchTST': 'SELF0000062', + 'RNN': 'SELF0000063', + 'GRU': 'SELF0000064', + 'TCN': 'SELF0000065', + 'BiTCN': 'SELF0000066', + 'DilatedRNN': 'SELF0000067', + 'MLP': 'SELF0000068', + 'DLinear': 'SELF0000069', + 'NLinear': 'SELF0000070', + 'TFT': 'SELF0000071', + 'FEDformer': 'SELF0000072', + 'StemGNN': 'SELF0000073', + 'MLPMultivariate': 'SELF0000074', + 'TiDE': 'SELF0000075', + 'DeepNPTS': 'SELF0000076' +} # eta 上传预测结果的请求体,后面发起请求的时候更改 model datalist 数据 data = { - "IndexCode": "", - "IndexName": "价格预测模型", - "Unit": "无", + "IndexCode": "", + "IndexName": "价格预测模型", + "Unit": "无", "Frequency": "日度", "SourceName": f"价格预测", "Remark": 'ddd', @@ -79,19 +78,18 @@ data = { "Value": 333444 } ] - } +} # eta 分类 # level:3才可以获取到数据,所以需要人工把能源化工下所有的level3级都找到 - # url = 'http://10.189.2.78:8108/v1/edb/list?ClassifyId=1214' - #ParentId ":1160, 能源化工 - # ClassifyId ":1214,原油 - #ParentId ":1214,",就是原油下所有的数据。 -ClassifyId = 1214 +# url = 'http://10.189.2.78:8108/v1/edb/list?ClassifyId=1214' +# ParentId ":1160, 能源化工 +# ClassifyId ":1214,原油 +# ParentId ":1214,",就是原油下所有的数据。 +ClassifyId = 1214 - -############################################################################################################### 变量定义--测试环境 +# 变量定义--测试环境 server_host = '192.168.100.53' login_pushreport_url = f"http://{server_host}:8080/jingbo-dev/api/server/login" @@ -103,7 +101,7 @@ login_data = { "data": { "account": "api_test", # "password": "MmVmNzNlOWI0MmY0ZDdjZGUwNzE3ZjFiMDJiZDZjZWU=", # Shihua@123456 - "password": "ZTEwYWRjMzk0OWJhNTlhYmJlNTZlMDU3ZjIwZjg4M2U=", # 123456 + "password": "ZTEwYWRjMzk0OWJhNTlhYmJlNTZlMDU3ZjIwZjg4M2U=", # 123456 "tenantHashCode": "8a4577dbd919675758d57999a1e891fe", "terminal": "API" }, @@ -112,39 +110,39 @@ login_data = { } upload_data = { - "funcModule":'研究报告信息', - "funcOperation":'上传原油价格预测报告', - "data":{ - "ownerAccount":'arui', #报告所属用户账号 - "reportType":'OIL_PRICE_FORECAST', # 报告类型,固定为OIL_PRICE_FORECAST - "fileName": '2000-40-5-50--100-原油指标数据.xlsx-Brent活跃合约--2024-09-06-15-01-29-预测报告.pdf', #文件名称 - "fileBase64": '' ,#文件内容base64 - "categoryNo":'yyjgycbg', # 研究报告分类编码 - "smartBusinessClassCode":'YCJGYCBG', #分析报告分类编码 - "reportEmployeeCode":"E40116", # 报告人 - "reportDeptCode" :"D0044" ,# 报告部门 - "productGroupCode":"RAW_MATERIAL" # 商品分类 - } + "funcModule": '研究报告信息', + "funcOperation": '上传原油价格预测报告', + "data": { + "ownerAccount": 'arui', # 报告所属用户账号 + "reportType": 'OIL_PRICE_FORECAST', # 报告类型,固定为OIL_PRICE_FORECAST + "fileName": '2000-40-5-50--100-原油指标数据.xlsx-Brent活跃合约--2024-09-06-15-01-29-预测报告.pdf', # 文件名称 + "fileBase64": '', # 文件内容base64 + "categoryNo": 'yyjgycbg', # 研究报告分类编码 + "smartBusinessClassCode": 'YCJGYCBG', # 分析报告分类编码 + "reportEmployeeCode": "E40116", # 报告人 + "reportDeptCode": "D0044", # 报告部门 + "productGroupCode": "RAW_MATERIAL" # 商品分类 + } } warning_data = { - "funcModule":'原油特征停更预警', - "funcOperation":'原油特征停更预警', - "data":{ - 'WARNING_TYPE_NAME':'特征数据停更预警', - 'WARNING_CONTENT':'', - 'WARNING_DATE':'' - } + "funcModule": '原油特征停更预警', + "funcOperation": '原油特征停更预警', + "data": { + 'WARNING_TYPE_NAME': '特征数据停更预警', + 'WARNING_CONTENT': '', + 'WARNING_DATE': '' + } } query_data_list_item_nos_data = { - "funcModule": "数据项", - "funcOperation": "查询", + "funcModule": "数据项", + "funcOperation": "查询", "data": { - "dateStart":"20200101", - "dateEnd":"20241231", - "dataItemNoList":["Brentzdj","Brentzgj"] # 数据项编码,代表 brent最低价和最高价 + "dateStart": "20200101", + "dateEnd": "20241231", + "dataItemNoList": ["Brentzdj", 
"Brentzgj"] # 数据项编码,代表 brent最低价和最高价 } } @@ -152,96 +150,96 @@ query_data_list_item_nos_data = { # 北京环境数据库 host = '192.168.101.27' port = 3306 -dbusername ='root' +dbusername = 'root' password = '123456' dbname = 'jingbo_test' table_name = 'v_tbl_crude_oil_warning' -### 开关 -is_train = False # 是否训练 -is_debug = False # 是否调试 -is_eta = False # 是否使用eta接口 -is_market = True # 是否通过市场信息平台获取特征 ,在is_eta 为true 的情况下生效 -is_timefurture = True # 是否使用时间特征 -is_fivemodels = False # 是否使用之前保存的最佳的5个模型 -is_edbcode = False # 特征使用edbcoding列表中的 -is_edbnamelist = False # 自定义特征,对应上面的edbnamelist -is_update_eta = False # 预测结果上传到eta -is_update_report = False # 是否上传报告 -is_update_warning_data = False # 是否上传预警数据 -is_del_corr = 0.6 # 是否删除相关性高的特征,取值为 0-1 ,0 为不删除,0.6 表示删除相关性小于0.6的特征 -is_del_tow_month = True # 是否删除两个月不更新的特征 - +# 开关 +is_train = False # 是否训练 +is_debug = False # 是否调试 +is_eta = False # 是否使用eta接口 +is_market = True # 是否通过市场信息平台获取特征 ,在is_eta 为true 的情况下生效 +is_timefurture = True # 是否使用时间特征 +is_fivemodels = False # 是否使用之前保存的最佳的5个模型 +is_edbcode = False # 特征使用edbcoding列表中的 +is_edbnamelist = False # 自定义特征,对应上面的edbnamelist +is_update_eta = False # 预测结果上传到eta +is_update_report = False # 是否上传报告 +is_update_warning_data = False # 是否上传预警数据 +is_del_corr = 0.6 # 是否删除相关性高的特征,取值为 0-1 ,0 为不删除,0.6 表示删除相关性小于0.6的特征 +is_del_tow_month = True # 是否删除两个月不更新的特征 # 连接到数据库 -db_mysql = MySQLDB(host=host, user=dbusername, password=password, database=dbname) +db_mysql = MySQLDB(host=host, user=dbusername, + password=password, database=dbname) db_mysql.connect() -print("数据库连接成功",host,dbname,dbusername) +print("数据库连接成功", host, dbname, dbusername) # 数据截取日期 -start_year = 2015 # 数据开始年份 -end_time = '' # 数据截取日期 +start_year = 2015 # 数据开始年份 +end_time = '' # 数据截取日期 freq = 'WW' # 时间频率,"D": 天 "W": 周"M": 月"Q": 季度"A": 年 "H": 小时 "T": 分钟 "S": 秒 "B": 工作日 "WW" 自定义周 -delweekenday = True if freq == 'B' else False # 是否删除周末数据 -is_corr = False # 特征是否参与滞后领先提升相关系数 -add_kdj = False # 是否添加kdj指标 +delweekenday = True if freq == 'B' else False # 是否删除周末数据 +is_corr = False # 特征是否参与滞后领先提升相关系数 +add_kdj = False # 是否添加kdj指标 if add_kdj and is_edbnamelist: - edbnamelist = edbnamelist+['K','D','J'] + edbnamelist = edbnamelist+['K', 'D', 'J'] -### 模型参数 -y = 'Brent连1合约价格' # 原油指标数据的目标变量 Brent连1合约价格 Brent活跃合约 -horizon =2 # 预测的步长 +# 模型参数 +y = 'Brent连1合约价格' # 原油指标数据的目标变量 Brent连1合约价格 Brent活跃合约 +horizon = 2 # 预测的步长 input_size = 12 # 输入序列长度 -train_steps = 50 if is_debug else 1000 # 训练步数,用来限定epoch次数 +train_steps = 50 if is_debug else 1000 # 训练步数,用来限定epoch次数 val_check_steps = 30 # 评估频率 -early_stop_patience_steps = 5 # 早停的耐心步数 +early_stop_patience_steps = 5 # 早停的耐心步数 # --- 交叉验证用的参数 test_size = 100 # 测试集大小,定义100,后面使用的时候重新赋值 -val_size = test_size # 验证集大小,同测试集大小 +val_size = test_size # 验证集大小,同测试集大小 -### 特征筛选用到的参数 -k = 100 # 特征筛选数量,如果是0或者值比特征数量大,代表全部特征 -corr_threshold = 0.6 # 相关性大于0.6的特征 -rote = 0.06 # 绘图上下界阈值 +# 特征筛选用到的参数 +k = 100 # 特征筛选数量,如果是0或者值比特征数量大,代表全部特征 +corr_threshold = 0.6 # 相关性大于0.6的特征 +rote = 0.06 # 绘图上下界阈值 -### 计算准确率 -weight_dict = [0.4,0.15,0.1,0.1,0.25] # 权重 +# 计算准确率 +weight_dict = [0.4, 0.15, 0.1, 0.1, 0.25] # 权重 -### 文件 -data_set = '原油指标数据.xlsx' # 数据集文件 -dataset = 'yuanyouzhoududataset' # 数据集文件夹 +# 文件 +data_set = '原油指标数据.xlsx' # 数据集文件 +dataset = 'yuanyouzhoududataset' # 数据集文件夹 # 数据库名称 -db_name = os.path.join(dataset,'jbsh_yuanyou_zhoudu.db') -sqlitedb = SQLiteHandler(db_name) +db_name = os.path.join(dataset, 'jbsh_yuanyou_zhoudu.db') +sqlitedb = SQLiteHandler(db_name) sqlitedb.connect() -settings = f'{input_size}-{horizon}-{train_steps}--{k}-{data_set}-{y}' +settings = 
f'{input_size}-{horizon}-{train_steps}--{k}-{data_set}-{y}' # 获取日期时间 # now = datetime.datetime.now().strftime('%Y%m%d%H%M%S') # 获取当前日期时间 -now = datetime.datetime.now().strftime('%Y-%m-%d') # 获取当前日期时间 -reportname = f'Brent原油大模型周度预测--{end_time}.pdf' # 报告文件名 -reportname = reportname.replace(':', '-') # 替换冒号 +now = datetime.datetime.now().strftime('%Y-%m-%d') # 获取当前日期时间 +reportname = f'Brent原油大模型周度预测--{end_time}.pdf' # 报告文件名 +reportname = reportname.replace(':', '-') # 替换冒号 if end_time == '': end_time = now -### 邮件配置 -username='1321340118@qq.com' -passwd='wgczgyhtyyyyjghi' +# 邮件配置 +username = '1321340118@qq.com' +passwd = 'wgczgyhtyyyyjghi' # recv=['liurui_test@163.com','52585119@qq.com'] -recv=['liurui_test@163.com','jin.wang@chambroad.com'] +recv = ['liurui_test@163.com', 'jin.wang@chambroad.com'] # recv=['liurui_test@163.com'] -title='reportname' -content='brent价格预测报告请看附件' -file=os.path.join(dataset,'reportname') +title = 'reportname' +content = 'brent价格预测报告请看附件' +file = os.path.join(dataset, 'reportname') # file=os.path.join(dataset,'14-7-50--100-原油指标数据.xlsx-Brent连1合约价格--20240731175936-预测报告.pdf') -ssl=True +ssl = True -### 日志配置 +# 日志配置 # 创建日志目录(如果不存在) log_dir = 'logs' @@ -253,8 +251,10 @@ logger = logging.getLogger('my_logger') logger.setLevel(logging.INFO) # 配置文件处理器,将日志记录到文件 -file_handler = logging.handlers.RotatingFileHandler(os.path.join(log_dir, 'pricepredict.log'), maxBytes=1024 * 1024, backupCount=5) -file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) +file_handler = logging.handlers.RotatingFileHandler(os.path.join( + log_dir, 'pricepredict.log'), maxBytes=1024 * 1024, backupCount=5) +file_handler.setFormatter(logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s')) # 配置控制台处理器,将日志打印到控制台 console_handler = logging.StreamHandler() @@ -265,4 +265,3 @@ logger.addHandler(file_handler) logger.addHandler(console_handler) # logger.info('当前配置:'+settings) - diff --git a/lib/dataread.py b/lib/dataread.py index 689c5ae..1393310 100644 --- a/lib/dataread.py +++ b/lib/dataread.py @@ -1,5 +1,19 @@ # 导入模块 +from config_jingbo_zhoudu import * +from reportlab.lib.units import cm # 单位:cm +from reportlab.graphics.shapes import Drawing # 绘图工具 +from reportlab.graphics.charts.legends import Legend # 图例类 +from reportlab.graphics.charts.barcharts import VerticalBarChart # 图表类 +from reportlab.lib import colors # 颜色模块 +from reportlab.lib.styles import getSampleStyleSheet # 文本样式 +from reportlab.lib.pagesizes import letter # 页面的标志尺寸(8.5*inch, 11*inch) +from reportlab.platypus import Table, SimpleDocTemplate, Paragraph, Image # 报告内容相关类 +from reportlab.pdfbase.ttfonts import TTFont # 字体类 +from reportlab.pdfbase import pdfmetrics # 注册字体 +from sklearn import metrics +from datetime import timedelta +import matplotlib.pyplot as plt import pandas as pd import numpy as np import datetime @@ -8,7 +22,7 @@ import base64 import requests import random import time -import re +import re import os import hmac import hashlib @@ -16,39 +30,26 @@ import json import math import torch torch.set_float32_matmul_precision("high") -import matplotlib.pyplot as plt -#设置plt显示中文 +# 设置plt显示中文 plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 -from datetime import timedelta -from sklearn import metrics -from reportlab.pdfbase import pdfmetrics # 注册字体 -from reportlab.pdfbase.ttfonts import TTFont # 字体类 -from reportlab.platypus import Table, SimpleDocTemplate, Paragraph, Image # 报告内容相关类 -from 
reportlab.lib.pagesizes import letter # 页面的标志尺寸(8.5*inch, 11*inch) -from reportlab.lib.styles import getSampleStyleSheet # 文本样式 -from reportlab.lib import colors # 颜色模块 -from reportlab.graphics.charts.barcharts import VerticalBarChart # 图表类 -from reportlab.graphics.charts.legends import Legend # 图例类 -from reportlab.graphics.shapes import Drawing # 绘图工具 -from reportlab.lib.units import cm # 单位:cm # 注册字体(提前准备好字体文件, 如果同一个文件需要多种字体可以注册多个) pdfmetrics.registerFont(TTFont('SimSun', 'SimSun.ttf')) -#设置plt显示中文 +# 设置plt显示中文 plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 # from config_jingbo_pro import * # from config_jingbo import * -from config_jingbo_zhoudu import * # from config_jingbo_yuedu import * # from config_yongan import * # from config_juxiting import * # from config_juxiting_zhoudu import * # from config_juxiting_pro import * +# from config_jingbo import logger # 定义函数 @@ -86,9 +87,9 @@ def dateConvert(df, datecol='ds'): """ # 将date列转换为datetime类型 try: - df[datecol] = pd.to_datetime(df[datecol],format=r'%Y-%m-%d') + df[datecol] = pd.to_datetime(df[datecol], format=r'%Y-%m-%d') except: - df[datecol] = pd.to_datetime(df[datecol],format=r'%Y/%m/%d') + df[datecol] = pd.to_datetime(df[datecol], format=r'%Y/%m/%d') return df @@ -101,14 +102,14 @@ def calculate_kdj(data, n=9): # 因为没有高开低价格,利用每日波动率模拟当天最高价和最低价 data['pctchange'] = data['y'].pct_change() # 收益为0的用0.01 - data['pctchange'] = data['pctchange'].replace(0,0.01) + data['pctchange'] = data['pctchange'].replace(0, 0.01) # 去除空值 data.dropna(inplace=True) # 重置索引 - data.reset_index(drop=True,inplace=True) + data.reset_index(drop=True, inplace=True) # 计算最高价和最低价 - data['high'] = data['y']* (1+abs(data['pctchange'])/2) - data['low'] = data['y']* (1-abs(data['pctchange'])/2) + data['high'] = data['y'] * (1+abs(data['pctchange'])/2) + data['low'] = data['y'] * (1-abs(data['pctchange'])/2) # 计算n日内最低价 low_list = data['y'].rolling(window=n, min_periods=1).min() # 计算n日内最高价 @@ -125,7 +126,7 @@ def calculate_kdj(data, n=9): d[i] = (2/3 * d[i - 1]) + (1/3 * k[i]) # 计算j值 j = 3 * k - 2 * d - + # 将k值、d值和j值添加到数据中 data['K'] = k data['D'] = d @@ -147,8 +148,9 @@ def get_head_auth_report(): logger.info("获取token中...") logger.info(f'url:{login_pushreport_url},login_data:{login_data}') # 发送 POST 请求到登录 URL,携带登录数据 - login_res = requests.post(url=login_pushreport_url, json=login_data, timeout=(3, 30)) - + login_res = requests.post(url=login_pushreport_url, + json=login_data, timeout=(3, 30)) + # 将响应内容转换为 JSON 格式 text = json.loads(login_res.text) logger.info(f'token接口响应:{text}') @@ -173,28 +175,29 @@ def upload_report_data(token, upload_data): """ # 直接使用传入的 upload_data upload_data = upload_data - + # 设置请求头部 headers = {"Authorization": token} - + # 打印日志,显示正在上传报告数据 logger.info("报告上传中...") - + # 打印日志,显示认证头部信息 logger.info(f"token:{token}") - + # 打印日志,显示要上传的报告数据 - logger.info(f"upload_data:{upload_data}" ) - + logger.info(f"upload_data:{upload_data}") + # 发送POST请求,上传报告数据 - upload_res = requests.post(url=upload_url, headers=headers, json=upload_data, timeout=(3, 15)) - + upload_res = requests.post( + url=upload_url, headers=headers, json=upload_data, timeout=(3, 15)) + # 将响应内容转换为 JSON 格式 upload_res = json.loads(upload_res.text) - + # 打印日志,显示响应内容 logger.info(upload_res) - + # 如果上传成功,返回响应对象 if upload_res: return upload_res @@ -216,25 +219,26 @@ def upload_warning_data(warning_data): """ # 获取认证头部信息 token = get_head_auth_report() - + # 设置请求头部 headers = {"Authorization": token} - + # 打印日志,显示正在上传预警数据 
logger.info("预警上传中...") - + # 打印日志,显示上传的URL logger.info(f"upload_warning_url:{upload_warning_url}") - + # 打印日志,显示认证头部信息 logger.info(f"token:{token}") - + # 打印日志,显示要上传的预警数据 logger.info(f"warning_data:{warning_data}") - + # 发送POST请求,上传预警数据 - upload_res = requests.post(url=upload_warning_url, headers=headers, json=warning_data, timeout=(3, 15)) - + upload_res = requests.post( + url=upload_warning_url, headers=headers, json=warning_data, timeout=(3, 15)) + # 如果上传成功,返回响应对象 if upload_res: return upload_res @@ -243,7 +247,6 @@ def upload_warning_data(warning_data): logger.info("预警上传失败") return None - def upload_warning_info(df_count): """ @@ -257,29 +260,28 @@ def upload_warning_info(df_count): """ # 打印日志,显示正在上传预警信息 logger.info(f'上传预警信息') - + try: # 获取当前日期 - warning_date = datetime.datetime.now().strftime('%Y-%m-%d') - warning_date2 = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') - + warning_date = datetime.datetime.now().strftime('%Y-%m-%d') + warning_date2 = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + # 构建预警内容 content = f'{warning_date}有{df_count}个停更' - + # 更新预警数据中的日期和内容 warning_data['data']['WARNING_DATE'] = warning_date2 - warning_data['data']['WARNING_CONTENT'] = content - + warning_data['data']['WARNING_CONTENT'] = content + # 调用 upload_warning_data 函数上传预警数据 upload_warning_data(warning_data) - + # 打印日志,显示上传预警信息成功 logger.info(f'上传预警信息成功') except Exception as e: # 打印日志,显示上传预警信息失败,并记录异常信息 logger.error(f'上传预警信息失败:{e}') - def create_feature_last_update_time(df): """ @@ -293,49 +295,57 @@ def create_feature_last_update_time(df): df1 = df.copy() # 找到每列的最后更新时间 df1.set_index('ds', inplace=True) - last_update_times = df1.apply(lambda x: x.dropna().index.max().strftime('%Y-%m-%d') if not x.dropna().empty else None) - + last_update_times = df1.apply(lambda x: x.dropna().index.max().strftime( + '%Y-%m-%d') if not x.dropna().empty else None) + # 保存每列的最后更新时间到文件 - last_update_times_df = pd.DataFrame(columns = ['feature', 'last_update_time','is_value','update_period','warning_date','stop_update_period']) - + last_update_times_df = pd.DataFrame(columns=[ + 'feature', 'last_update_time', 'is_value', 'update_period', 'warning_date', 'stop_update_period']) + # 打印每列的最后更新时间 for column, last_update_time in last_update_times.items(): values = [] # 判断是不是常数值 if df1[column].tail(20).nunique() == 1: - values = values + [column, last_update_time,1] + values = values + [column, last_update_time, 1] else: - values = values + [column, last_update_time,0] + values = values + [column, last_update_time, 0] # 计算特征数据值的时间差 try: # 计算预警日期 - time_diff = (df1[column].dropna().index.to_series().diff().mode()[0]).total_seconds() / 3600 / 24 - last_update_time_datetime = datetime.datetime.strptime(last_update_time, '%Y-%m-%d') + time_diff = (df1[column].dropna().index.to_series().diff().mode()[ + 0]).total_seconds() / 3600 / 24 + last_update_time_datetime = datetime.datetime.strptime( + last_update_time, '%Y-%m-%d') last_update_date = end_time if end_time != '' else datetime.datetime.now().strftime('%Y-%m-%d') - end_time_datetime = datetime.datetime.strptime(last_update_date, '%Y-%m-%d') - early_warning_date = last_update_time_datetime + timedelta(days=time_diff)*2 + timedelta(days=1) - stop_update_period = int(math.ceil((end_time_datetime-last_update_time_datetime).days / time_diff)) + end_time_datetime = datetime.datetime.strptime( + last_update_date, '%Y-%m-%d') + early_warning_date = last_update_time_datetime + \ + timedelta(days=time_diff)*2 + timedelta(days=1) + stop_update_period = int( + 
math.ceil((end_time_datetime-last_update_time_datetime).days / time_diff)) early_warning_date = early_warning_date.strftime('%Y-%m-%d') except KeyError: time_diff = 0 early_warning_date = end_time continue - values = values + [time_diff,early_warning_date,stop_update_period] + values = values + [time_diff, early_warning_date, stop_update_period] last_update_times_df.loc[len(last_update_times_df)] = values logger.info(f"Column {column} was last updated at {last_update_time}") - y_last_update_time = last_update_times_df[last_update_times_df['feature']=='y']['warning_date'].values[0] - last_update_times_df.to_csv(os.path.join(dataset,'last_update_times.csv'), index=False) + y_last_update_time = last_update_times_df[last_update_times_df['feature'] + == 'y']['warning_date'].values[0] + last_update_times_df.to_csv(os.path.join( + dataset, 'last_update_times.csv'), index=False) logger.info('特征停更信息保存到文件:last_update_times.csv') - return last_update_times_df,y_last_update_time - + return last_update_times_df, y_last_update_time # 统计特征频度 def featurePindu(dataset): # 读取文件 - df = loadcsv(os.path.join(dataset,'未填充的特征数据.csv')) + df = loadcsv(os.path.join(dataset, '未填充的特征数据.csv')) df['ds'] = pd.to_datetime(df['ds']) # 按ds正序排序,重置索引 df = df.sort_values(by='ds', ascending=True).reset_index(drop=True) @@ -347,55 +357,56 @@ def featurePindu(dataset): count_dict = {} for column in columns: # 获取每列时间间隔 - values = df[[column,'ds']] - values.dropna(inplace=True,axis=0) - values=values.reset_index(drop=True) + values = df[[column, 'ds']] + values.dropna(inplace=True, axis=0) + values = values.reset_index(drop=True) # 抽取20%个值 value = values.sample(frac=0.2) index = value.index next_index = index + 1 count = [] - for i,j in zip(index, next_index): - #通过索引计算日期差 + for i, j in zip(index, next_index): + # 通过索引计算日期差 try: - count.append((values.loc[j,'ds'] - values.loc[i,'ds']).days) + count.append((values.loc[j, 'ds'] - values.loc[i, 'ds']).days) except: pass - # 把31 换成 30 + # 把31 换成 30 count = [30 if i == 31 else i for i in count] # 保留count中出现次数最多的数 try: count = max(set(count), key=count.count) - except ValueError : + except ValueError: logger.info(f'{column}列数据为空') continue # 存储到字典中 count_dict[column] = count - - df = pd.DataFrame(count_dict,index=['count']).T + + df = pd.DataFrame(count_dict, index=['count']).T pindu_dfs = pd.DataFrame() # 根据count分组 # 输出特征频度统计 - pindudict = {'1':'日度','3':'日度','7':'周度','30':'月度','90':'季度','180':'半年度','365':'年度'} + pindudict = {'1': '日度', '3': '日度', '7': '周度', + '30': '月度', '90': '季度', '180': '半年度', '365': '年度'} for i in df.groupby('count'): # 获取 i[1] 的索引值 index = i[1].index pindu_df = pd.DataFrame() try: pindu_df[pindudict[str(i[0])]+f'({len(i[1])})'] = index - except KeyError : + except KeyError: pindu_df[str(i[0])+f'天({len(i[1])})'] = index # 合并到pindu_dfs - pindu_dfs = pd.concat([pindu_dfs,pindu_df],axis=1) + pindu_dfs = pd.concat([pindu_dfs, pindu_df], axis=1) # nan替换为 ' ' pindu_dfs = pindu_dfs.fillna('') - pindu_dfs.to_csv(os.path.join(dataset,'特征频度统计.csv'),index=False) + pindu_dfs.to_csv(os.path.join(dataset, '特征频度统计.csv'), index=False) logger.info(pindu_dfs) featureInfo = f'特征信息:总共有{len(columns)-2}个' for i in pindu_dfs.columns: featureInfo += f',{i}' - + featureInfo += ', 详看 附1、特征列表' featureInfo += ''' @@ -409,12 +420,12 @@ def featurePindu(dataset): 数据特征相关性分析: ''' logger.info(featureInfo) - with open(os.path.join(dataset,'特征频度统计.txt'), 'w', encoding='utf-8') as f: + with open(os.path.join(dataset, '特征频度统计.txt'), 'w', encoding='utf-8') as f: f.write(featureInfo) 
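To make the frequency-detection step above easier to follow, here is a minimal standalone sketch (not part of this diff) of what `featurePindu` does per column: take the modal day-gap between a feature's observation dates and map it to the same 日度/周度/月度 labels. The `detect_frequency` helper and the demo series are illustrative only.

import pandas as pd

# Mapping from modal day-gap to frequency label, mirroring pindudict above.
PINDU_MAP = {1: '日度', 3: '日度', 7: '周度', 30: '月度', 90: '季度', 180: '半年度', 365: '年度'}

def detect_frequency(dates: pd.Series) -> str:
    """Label a feature's frequency from the modal gap (in days) between its observation dates."""
    gaps = dates.sort_values().diff().dt.days.dropna()
    gaps = gaps.replace(31, 30)  # treat 31-day gaps as monthly, as in the "把31 换成 30" step
    if gaps.empty:
        return '未知'
    modal_gap = int(gaps.mode().iloc[0])
    return PINDU_MAP.get(modal_gap, f'{modal_gap}天')

if __name__ == '__main__':
    # Hypothetical weekly series: the modal gap is 7 days, so the label is '周度'.
    ds = pd.Series(pd.date_range('2024-01-05', periods=10, freq='7D'))
    print(detect_frequency(ds))  # 周度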
logger.info('*'*200) -def featureAnalysis(df,dataset,y): +def featureAnalysis(df, dataset, y): # 特征筛选 import matplotlib.pyplot as plt # 选择特征和标签列 @@ -424,13 +435,13 @@ def featureAnalysis(df,dataset,y): # 标签集自相关函数分析 from statsmodels.graphics.tsaplots import plot_acf plot_acf(yy, lags=30) - plt.savefig(os.path.join(dataset,'指标数据自相关图.png')) + plt.savefig(os.path.join(dataset, '指标数据自相关图.png')) plt.close() # 标签集偏自相关函数分析 from statsmodels.graphics.tsaplots import plot_pacf plot_pacf(yy, lags=30) - plt.savefig(os.path.join(dataset,'指标数据偏自相关图.png')) + plt.savefig(os.path.join(dataset, '指标数据偏自相关图.png')) plt.close() # 画 特征与价格散点图 @@ -439,7 +450,7 @@ def featureAnalysis(df,dataset,y): if file.endswith("散点图.png"): os.remove(os.path.join(dataset, file)) plt.rcParams['font.sans-serif'] = ['SimHei'] - plt.rcParams['axes.unicode_minus'] = False + plt.rcParams['axes.unicode_minus'] = False plt.figure(figsize=(10, 10)) # # 遍历X每一列,和yy画散点图 , # for i, col in enumerate(X.columns): @@ -453,7 +464,6 @@ def featureAnalysis(df,dataset,y): # plt.savefig(os.path.join(dataset,f'{i}指标数据特征与价格散点图.png')) # plt.close() - def corr_feature(df): # 重新命名列名,列名排序,y在第一个 @@ -467,19 +477,20 @@ def corr_feature(df): df_test_noscaler = df_test.copy() # 滞后处理备份 df_noscaler = df_test.copy() # 画出相关性热力图 - df_test.to_csv(os.path.join(dataset,'同步相关性.csv')) + df_test.to_csv(os.path.join(dataset, '同步相关性.csv')) corr = df_test.corr() # 保存相关系数 - corr.to_csv(os.path.join(dataset,'同步相关性系数.csv')) + corr.to_csv(os.path.join(dataset, '同步相关性系数.csv')) # plt.figure(figsize=(10, 10)) # sns.heatmap(corr, annot=True, cmap='coolwarm') # plt.savefig('dataset/同步相关性热力图.png') # plt.show() # 读取滞后周期文件,更改特征 - characteristic_period = pd.read_csv('dataset/特征滞后周期.csv',encoding='utf-8') + characteristic_period = pd.read_csv('dataset/特征滞后周期.csv', encoding='utf-8') # 去掉周期为0的行 - characteristic_period = characteristic_period.drop(characteristic_period[characteristic_period['滞后周期'] == 0].index) + characteristic_period = characteristic_period.drop( + characteristic_period[characteristic_period['滞后周期'] == 0].index) for col in df.columns: # 跳过y列 if col in ['y']: @@ -487,11 +498,11 @@ def corr_feature(df): # 特征滞后n个周期,计算与y的相关性 if col in characteristic_period['特征'].values: # 获取特征对应的周期 - period = characteristic_period[characteristic_period['特征'] == col]['滞后周期'].values[0] + period = characteristic_period[characteristic_period['特征'] + == col]['滞后周期'].values[0] # 滞后处理 df[col] = df[col].shift(period) - df.to_csv(os.path.join(dataset,'滞后处理后的数据集.csv')) - + df.to_csv(os.path.join(dataset, '滞后处理后的数据集.csv')) # corr_feture_noscaler = {} # 保存相关性最大的周期 # 遍历df_test的每一列,计算相关性 @@ -500,35 +511,35 @@ def corr_feature(df): # if col in ['y']: # continue # logger.info('特征:', col) - # # 特征滞后n个周期,计算与y的相关性 - # corr_dict = {} - # try: - # for i in range(0, 200): - # if i == 0: - # df_noscaler[col+'_'+str(i)] = df_noscaler[col] - # else: - # df_noscaler[col+'_'+str(i)] = df_noscaler[col].shift(i) - # corr_dict[col+'_'+str(i)] = abs(df_noscaler[col+'_'+str(i)].corr(df_noscaler['y'])) - # except : - # logger.info('特征:', col, '滑动错误,请查看') - # continue - # 输出相关性最大的特征 - # logger.info(max(corr_dict, key=corr_dict.get), corr_dict[max(corr_dict, key=corr_dict.get)]) - # corr_feture_noscaler[col] = max(corr_dict, key=corr_dict.get).split('_')[-1] - # 画出最相关性最大的特征和y的折线图 - # plt.figure(figsize=(10, 5)) - # plt.plot(df_noscaler[max(corr_dict, key=corr_dict.get)], label=max(corr_dict, key=corr_dict.get)) - # # 设置双坐标轴 - # ax1 = plt.gca() - # ax2 = ax1.twinx() - # ax2.plot(df_noscaler['y'], color='r', label='y') - # 
plt.legend() - # try: - # plt.savefig('dataset/特征与y的折线图_'+max(corr_dict, key=corr_dict.get)+'.png') - # except : - # # :替换成_ - # plt.savefig('dataset/特征与y的折线图_'+max(corr_dict, key=corr_dict.get).replace(':','_').replace('/','_').replace('(','_').replace(')','_')+'.png') - # plt.close() + # # 特征滞后n个周期,计算与y的相关性 + # corr_dict = {} + # try: + # for i in range(0, 200): + # if i == 0: + # df_noscaler[col+'_'+str(i)] = df_noscaler[col] + # else: + # df_noscaler[col+'_'+str(i)] = df_noscaler[col].shift(i) + # corr_dict[col+'_'+str(i)] = abs(df_noscaler[col+'_'+str(i)].corr(df_noscaler['y'])) + # except : + # logger.info('特征:', col, '滑动错误,请查看') + # continue + # 输出相关性最大的特征 + # logger.info(max(corr_dict, key=corr_dict.get), corr_dict[max(corr_dict, key=corr_dict.get)]) + # corr_feture_noscaler[col] = max(corr_dict, key=corr_dict.get).split('_')[-1] + # 画出最相关性最大的特征和y的折线图 + # plt.figure(figsize=(10, 5)) + # plt.plot(df_noscaler[max(corr_dict, key=corr_dict.get)], label=max(corr_dict, key=corr_dict.get)) + # # 设置双坐标轴 + # ax1 = plt.gca() + # ax2 = ax1.twinx() + # ax2.plot(df_noscaler['y'], color='r', label='y') + # plt.legend() + # try: + # plt.savefig('dataset/特征与y的折线图_'+max(corr_dict, key=corr_dict.get)+'.png') + # except : + # # :替换成_ + # plt.savefig('dataset/特征与y的折线图_'+max(corr_dict, key=corr_dict.get).replace(':','_').replace('/','_').replace('(','_').replace(')','_')+'.png') + # plt.close() # 结果保存到txt文件 # logger.info('不参与标准化的特征滞后相关性写入txt文件') # with open('dataset/不参与标准化的特征滞后相关性.txt', 'w') as f: @@ -581,7 +592,6 @@ def corr_feature(df): # # 输出相关性最大的特征 # logger.info(max(corr_dict, key=corr_dict.get), corr_dict[max(corr_dict, key=corr_dict.get)]) # corr_feture[col] = max(corr_dict, key=corr_dict.get).split('_')[-1] - # # 结果保存到txt文件 # with open('dataset/标准化的特征滞后相关性.txt', 'w') as f: @@ -625,12 +635,12 @@ def calculate_kdj(data, n=9): # 因为没有高开低价格,利用每日波动率模拟当天最高价和最低价 data['pctchange'] = data['y'].pct_change() # 收益为0的用0.01 - data['pctchange'] = data['pctchange'].replace(0,0.01) + data['pctchange'] = data['pctchange'].replace(0, 0.01) data.dropna(inplace=True) # 重置索引 - data.reset_index(drop=True,inplace=True) - data['high'] = data['y']* (1+abs(data['pctchange'])/2) - data['low'] = data['y']* (1-abs(data['pctchange'])/2) + data.reset_index(drop=True, inplace=True) + data['high'] = data['y'] * (1+abs(data['pctchange'])/2) + data['low'] = data['y'] * (1-abs(data['pctchange'])/2) low_list = data['y'].rolling(window=n, min_periods=1).min() high_list = data['y'].rolling(window=n, min_periods=1).max() rsv = ((data['y'] - low_list) / (high_list - low_list)) * 100 @@ -640,7 +650,7 @@ def calculate_kdj(data, n=9): k[i] = (2/3 * k[i - 1]) + (1/3 * rsv[i]) d[i] = (2/3 * d[i - 1]) + (1/3 * k[i]) j = 3 * k - 2 * d - + data['K'] = k data['D'] = d data['J'] = j @@ -649,39 +659,41 @@ def calculate_kdj(data, n=9): # data = data.dropna() return data -def check_column(df,col_name,two_months_ago): - ''' - 检查列是否需要删除。 - 该函数会检查列是否为空值列、180天没有更新的列或常数值列。 - 参数: - col_name (str): 列名。 - df (DataFrame): 包含列的 DataFrame。 - 返回: - bool: 如果列需要删除,返回 True;否则,返回 False。 - ''' - if 'ds' in col_name or 'y' in col_name: - return False - df_check_column = df[['ds',col_name,'y']] - df_check_column = df_check_column.dropna() - if len(df_check_column) == 0: - print(f'空值列:{col_name}') - return True - # 判断是不是常数列 - if df_check_column[(df_check_column['ds']>= two_months_ago)].groupby(col_name).ngroups < 2: - print(f'180没有更新:{col_name}') - return True - - # 判断相关系数大于0.6 - if is_del_corr > 0: - if abs(df_check_column[col_name].corr(df_check_column['y'])) < 
is_del_corr: - print(f'相关系数小于0.6:{col_name}') - return True - - corresponding_date = df_check_column.iloc[-1]['ds'] - return corresponding_date < two_months_ago +def check_column(df, col_name, two_months_ago): + ''' + 检查列是否需要删除。 + 该函数会检查列是否为空值列、180天没有更新的列或常数值列。 + 参数: + col_name (str): 列名。 + df (DataFrame): 包含列的 DataFrame。 + 返回: + bool: 如果列需要删除,返回 True;否则,返回 False。 + ''' + if 'ds' in col_name or 'y' in col_name: + return False + df_check_column = df[['ds', col_name, 'y']] + df_check_column = df_check_column.dropna() -def datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y',dataset='dataset',delweekenday=False,add_kdj=False,is_timefurture=False): + if len(df_check_column) == 0: + print(f'空值列:{col_name}') + return True + # 判断是不是常数列 + if df_check_column[(df_check_column['ds'] >= two_months_ago)].groupby(col_name).ngroups < 2: + print(f'180没有更新:{col_name}') + return True + + # 判断相关系数大于0.6 + if is_del_corr > 0: + if abs(df_check_column[col_name].corr(df_check_column['y'])) < is_del_corr: + print(f'相关系数小于0.6:{col_name}') + return True + + corresponding_date = df_check_column.iloc[-1]['ds'] + return corresponding_date < two_months_ago + + +def datachuli(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_time='', y='y', dataset='dataset', delweekenday=False, add_kdj=False, is_timefurture=False): ''' 原油特征数据处理函数, 接收的是两个df,一个是指标数据,一个是指标列表 @@ -692,12 +704,12 @@ def datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y' if end_time == '': end_time = datetime.datetime.now().strftime('%Y-%m-%d') # 重命名时间列,预测列 - df.rename(columns={datecol:'ds'},inplace=True) - df.rename(columns={y:'y'},inplace=True) + df.rename(columns={datecol: 'ds'}, inplace=True) + df.rename(columns={y: 'y'}, inplace=True) # 按时间顺序排列 - df.sort_values(by='ds',inplace=True) + df.sort_values(by='ds', inplace=True) df['ds'] = pd.to_datetime(df['ds']) - # 获取start_year年到end_time的数据 + # 获取start_year年到end_time的数据 df = df[df['ds'].dt.year >= start_year] df = df[df['ds'] <= end_time] # last_update_times_df,y_last_update_time = create_feature_last_update_time(df) @@ -714,16 +726,16 @@ def datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y' logger.info(f'删除两月不更新特征前数据量:{df.shape}') columns_to_drop = [] for clo in df.columns: - if check_column(df,clo,two_months_ago): + if check_column(df, clo, two_months_ago): columns_to_drop.append(clo) df = df.drop(columns=columns_to_drop) logger.info(f'删除两月不更新特征后数据量:{df.shape}') - + # 衍生时间特征 if is_timefurture: - df = addtimecharacteristics(df=df,dataset=dataset) - + df = addtimecharacteristics(df=df, dataset=dataset) + if freq == 'WW': # 自定义周数据 # 按weekofmothe分组取均值得到新的数据 @@ -742,10 +754,12 @@ def datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y' logger.info(f'删除预测列为空值的行后数据量:{df.shape}') df = df.dropna(axis=1, how='all') logger.info(f'删除全为空值的列后数据量:{df.shape}') - df.to_csv(os.path.join(dataset,'未填充的特征数据.csv'),index=False) + df.to_csv(os.path.join(dataset, '未填充的特征数据.csv'), index=False) # 去掉指标列表中的columns_to_drop的行 - df_zhibiaoliebiao = df_zhibiaoliebiao[df_zhibiaoliebiao['指标名称'].isin(df.columns.tolist())] - df_zhibiaoliebiao.to_csv(os.path.join(dataset,'特征处理后的指标名称及分类.csv'),index=False) + df_zhibiaoliebiao = df_zhibiaoliebiao[df_zhibiaoliebiao['指标名称'].isin( + df.columns.tolist())] + df_zhibiaoliebiao.to_csv(os.path.join( + dataset, '特征处理后的指标名称及分类.csv'), index=False) # 数据频度分析 featurePindu(dataset=dataset) # 向上填充 @@ -756,16 +770,17 @@ def datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y' # 
删除周六日的数据 if delweekenday: df = df[df['ds'].dt.weekday < 5] - + # kdj指标 if add_kdj: df = calculate_kdj(df) - + # 特征分析 - featureAnalysis(df,dataset=dataset,y=y) + featureAnalysis(df, dataset=dataset, y=y) return df -def zhoududatachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y',dataset='dataset',delweekenday=False,add_kdj=False,is_timefurture=False): + +def zhoududatachuli(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_time='', y='y', dataset='dataset', delweekenday=False, add_kdj=False, is_timefurture=False): ''' 原油特征周度数据处理函数, 接收的是两个df,一个是指标数据,一个是指标列表 @@ -776,12 +791,12 @@ def zhoududatachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='' if end_time == '': end_time = datetime.datetime.now().strftime('%Y-%m-%d') # 重命名时间列,预测列 - df.rename(columns={datecol:'ds'},inplace=True) - df.rename(columns={y:'y'},inplace=True) + df.rename(columns={datecol: 'ds'}, inplace=True) + df.rename(columns={y: 'y'}, inplace=True) # 按时间顺序排列 - df.sort_values(by='ds',inplace=True) + df.sort_values(by='ds', inplace=True) df['ds'] = pd.to_datetime(df['ds']) - # 获取start_year年到end_time的数据 + # 获取start_year年到end_time的数据 df = df[df['ds'].dt.year >= start_year] df = df[df['ds'] <= end_time] # last_update_times_df,y_last_update_time = create_feature_last_update_time(df) @@ -798,12 +813,12 @@ def zhoududatachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='' logger.info(f'删除两月不更新特征前数据量:{df.shape}') columns_to_drop = [] for clo in df.columns: - if check_column(df,clo,two_months_ago): + if check_column(df, clo, two_months_ago): columns_to_drop.append(clo) df = df.drop(columns=columns_to_drop) logger.info(f'删除两月不更新特征后数据量:{df.shape}') - + if freq == 'W': # 按周取样 df = df.resample('W', on='ds').mean().reset_index() @@ -816,10 +831,12 @@ def zhoududatachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='' logger.info(f'删除预测列为空值的行后数据量:{df.shape}') df = df.dropna(axis=1, how='all') logger.info(f'删除全为空值的列后数据量:{df.shape}') - df.to_csv(os.path.join(dataset,'未填充的特征数据.csv'),index=False) + df.to_csv(os.path.join(dataset, '未填充的特征数据.csv'), index=False) # 去掉指标列表中的columns_to_drop的行 - df_zhibiaoliebiao = df_zhibiaoliebiao[df_zhibiaoliebiao['指标名称'].isin(df.columns.tolist())] - df_zhibiaoliebiao.to_csv(os.path.join(dataset,'特征处理后的指标名称及分类.csv'),index=False) + df_zhibiaoliebiao = df_zhibiaoliebiao[df_zhibiaoliebiao['指标名称'].isin( + df.columns.tolist())] + df_zhibiaoliebiao.to_csv(os.path.join( + dataset, '特征处理后的指标名称及分类.csv'), index=False) # 数据频度分析 featurePindu(dataset=dataset) # 向上填充 @@ -830,21 +847,19 @@ def zhoududatachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='' # 删除周六日的数据 if delweekenday: df = df[df['ds'].dt.weekday < 5] - + # kdj指标 if add_kdj: df = calculate_kdj(df) # 衍生时间特征 if is_timefurture: - df = addtimecharacteristics(df=df,dataset=dataset) + df = addtimecharacteristics(df=df, dataset=dataset) # 特征分析 - featureAnalysis(df,dataset=dataset,y=y) + featureAnalysis(df, dataset=dataset, y=y) return df - - -def datachuli_juxiting(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y',dataset='dataset',delweekenday=False,add_kdj=False,is_timefurture=False): +def datachuli_juxiting(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_time='', y='y', dataset='dataset', delweekenday=False, add_kdj=False, is_timefurture=False): ''' 聚烯烃特征数据处理函数, 接收的是两个df,一个是指标数据,一个是指标列表 @@ -854,8 +869,8 @@ def datachuli_juxiting(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time if end_time == '': end_time = datetime.datetime.now().strftime('%Y-%m-%d') # 
date转为pddate - df.rename(columns={datecol:'ds'},inplace=True) - + df.rename(columns={datecol: 'ds'}, inplace=True) + # 指定列统一减少数值 df[offsite_col] = df[offsite_col]-offsite # 预测列为avg_cols的均值 @@ -864,11 +879,11 @@ def datachuli_juxiting(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time df = df.drop(columns=avg_cols) # 重命名预测列 - df.rename(columns={y:'y'},inplace=True) + df.rename(columns={y: 'y'}, inplace=True) # 按时间顺序排列 - df.sort_values(by='ds',inplace=True) + df.sort_values(by='ds', inplace=True) df['ds'] = pd.to_datetime(df['ds']) - # 获取2018年到当前日期的数据 + # 获取2018年到当前日期的数据 df = df[df['ds'].dt.year >= 2018] # 获取小于等于当前日期的数据 df = df[df['ds'] <= end_time] @@ -876,32 +891,35 @@ def datachuli_juxiting(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time # 去掉近最后数据对应的日期在两月以前的列,删除近2月的数据是常数的列 current_date = datetime.datetime.now() two_months_ago = current_date - timedelta(days=40) - # 检查两月不更新的特征 + # 检查两月不更新的特征 + def check_column(col_name): if 'ds' in col_name or 'y' in col_name: return False - df_check_column = df[['ds',col_name]] + df_check_column = df[['ds', col_name]] df_check_column = df_check_column.dropna() if len(df_check_column) == 0: return True - if df_check_column[(df_check_column['ds']>= two_months_ago)].groupby(col_name).ngroups < 2: + if df_check_column[(df_check_column['ds'] >= two_months_ago)].groupby(col_name).ngroups < 2: return True corresponding_date = df_check_column.iloc[-1]['ds'] return corresponding_date < two_months_ago columns_to_drop = df.columns[df.columns.map(check_column)].tolist() - df = df.drop(columns = columns_to_drop) - + df = df.drop(columns=columns_to_drop) + logger.info(f'删除两月不更新特征后数据量:{df.shape}') - + # 删除预测列空值的行 df = df.dropna(subset=['y']) logger.info(f'删除预测列为空值的行后数据量:{df.shape}') df = df.dropna(axis=1, how='all') logger.info(f'删除全为空值的列后数据量:{df.shape}') - df.to_csv(os.path.join(dataset,'未填充的特征数据.csv'),index=False) + df.to_csv(os.path.join(dataset, '未填充的特征数据.csv'), index=False) # 去掉指标列表中的columns_to_drop的行 - df_zhibiaoliebiao = df_zhibiaoliebiao[df_zhibiaoliebiao['指标名称'].isin(df.columns.tolist())] - df_zhibiaoliebiao.to_csv(os.path.join(dataset,'特征处理后的指标名称及分类.csv'),index=False) + df_zhibiaoliebiao = df_zhibiaoliebiao[df_zhibiaoliebiao['指标名称'].isin( + df.columns.tolist())] + df_zhibiaoliebiao.to_csv(os.path.join( + dataset, '特征处理后的指标名称及分类.csv'), index=False) # 频度分析 featurePindu(dataset=dataset) # 向上填充 @@ -912,80 +930,83 @@ def datachuli_juxiting(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time # 删除周六日的数据 if delweekenday: df = df[df['ds'].dt.weekday < 5] - + if add_kdj: df = calculate_kdj(df) - - if is_timefurture: - df = addtimecharacteristics(df=df,dataset=dataset) - featureAnalysis(df,dataset=dataset,y=y) + if is_timefurture: + df = addtimecharacteristics(df=df, dataset=dataset) + + featureAnalysis(df, dataset=dataset, y=y) return df -def getdata(filename, datecol='date',y='y',dataset='',add_kdj=False,is_timefurture=False,end_time=''): + +def getdata(filename, datecol='date', y='y', dataset='', add_kdj=False, is_timefurture=False, end_time=''): logger.info('getdata接收:'+filename+' '+datecol+' '+end_time) # 判断后缀名 csv或excel if filename.endswith('.csv'): df = loadcsv(filename) else: # 读取excel 指标数据 - df_zhibiaoshuju = pd.read_excel(filename,sheet_name='指标数据') - df_zhibiaoliebiao = pd.read_excel(filename,sheet_name='指标列表') + df_zhibiaoshuju = pd.read_excel(filename, sheet_name='指标数据') + df_zhibiaoliebiao = pd.read_excel(filename, sheet_name='指标列表') - # 日期字符串转为datatime - df = datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol,y = 
y,dataset=dataset,add_kdj=add_kdj,is_timefurture=is_timefurture,end_time=end_time) + df = datachuli(df_zhibiaoshuju, df_zhibiaoliebiao, datecol, y=y, dataset=dataset, + add_kdj=add_kdj, is_timefurture=is_timefurture, end_time=end_time) - return df,df_zhibiaoliebiao + return df, df_zhibiaoliebiao -def getzhoududata(filename, datecol='date',y='y',dataset='',add_kdj=False,is_timefurture=False,end_time=''): + +def getzhoududata(filename, datecol='date', y='y', dataset='', add_kdj=False, is_timefurture=False, end_time=''): logger.info('getdata接收:'+filename+' '+datecol+' '+end_time) # 判断后缀名 csv或excel if filename.endswith('.csv'): df = loadcsv(filename) else: # 读取excel 指标数据 - df_zhibiaoshuju = pd.read_excel(filename,sheet_name='指标数据') - df_zhibiaoliebiao = pd.read_excel(filename,sheet_name='指标列表') + df_zhibiaoshuju = pd.read_excel(filename, sheet_name='指标数据') + df_zhibiaoliebiao = pd.read_excel(filename, sheet_name='指标列表') - # 日期字符串转为datatime - df = zhoududatachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol,y = y,dataset=dataset,add_kdj=add_kdj,is_timefurture=is_timefurture,end_time=end_time) + df = zhoududatachuli(df_zhibiaoshuju, df_zhibiaoliebiao, datecol, y=y, dataset=dataset, + add_kdj=add_kdj, is_timefurture=is_timefurture, end_time=end_time) - return df,df_zhibiaoliebiao + return df, df_zhibiaoliebiao - - -def getdata_juxiting(filename, datecol='date',y='y',dataset='',add_kdj=False,is_timefurture=False,end_time=''): +def getdata_juxiting(filename, datecol='date', y='y', dataset='', add_kdj=False, is_timefurture=False, end_time=''): logger.info('getdata接收:'+filename+' '+datecol+' '+end_time) # 判断后缀名 csv或excel if filename.endswith('.csv'): df = loadcsv(filename) else: # 读取excel 指标数据 - df_zhibiaoshuju = pd.read_excel(filename,sheet_name='指标数据') - df_zhibiaoliebiao = pd.read_excel(filename,sheet_name='指标列表') - + df_zhibiaoshuju = pd.read_excel(filename, sheet_name='指标数据') + df_zhibiaoliebiao = pd.read_excel(filename, sheet_name='指标列表') + # 日期字符串转为datatime - df = datachuli_juxiting(df_zhibiaoshuju,df_zhibiaoliebiao,datecol,y = y,dataset=dataset,add_kdj=add_kdj,is_timefurture=is_timefurture,end_time=end_time) + df = datachuli_juxiting(df_zhibiaoshuju, df_zhibiaoliebiao, datecol, y=y, dataset=dataset, + add_kdj=add_kdj, is_timefurture=is_timefurture, end_time=end_time) - return df,df_zhibiaoliebiao + return df, df_zhibiaoliebiao -def sanitize_filename(filename): - # 使用正则表达式替换不合规的字符 - # 这里我们替换为下划线'_',但你可以根据需要选择其他字符 - sanitized = re.sub(r'[\\/*?:"<>|\s]', '_', filename) - # 移除开头的点(在某些系统中,以点开头的文件可能是隐藏的) - sanitized = re.sub(r'^\.', '', sanitized) - # 如果需要,可以添加更多替换规则 - return sanitized +def sanitize_filename(filename): + # 使用正则表达式替换不合规的字符 + # 这里我们替换为下划线'_',但你可以根据需要选择其他字符 + sanitized = re.sub(r'[\\/*?:"<>|\s]', '_', filename) + # 移除开头的点(在某些系统中,以点开头的文件可能是隐藏的) + sanitized = re.sub(r'^\.', '', sanitized) + # 如果需要,可以添加更多替换规则 + return sanitized + class BinanceAPI: ''' 获取 Binance API 请求头签名 ''' + def __init__(self, APPID, SECRET): self.APPID = APPID self.SECRET = SECRET @@ -993,7 +1014,8 @@ class BinanceAPI: # 生成随机字符串作为 nonce def generate_nonce(self, length=32): - self.nonce = ''.join(random.choices(string.ascii_letters + string.digits, k=length)) + self.nonce = ''.join(random.choices( + string.ascii_letters + string.digits, k=length)) return self.nonce # 获取当前时间戳(秒) @@ -1016,6 +1038,7 @@ class BinanceAPI: self.signature = self.calculate_signature(self.SECRET, self.sign_str) # return self.signature + class Graphs: # 绘制标题 @staticmethod @@ -1033,7 +1056,7 @@ class Graphs: ct.bold = True # 创建标题对应的段落,并且返回 return 
Paragraph(title, ct) - + # 绘制小标题 @staticmethod def draw_little_title(title: str): @@ -1129,16 +1152,17 @@ class Graphs: return img # 定义样式函数 + + def style_row(row): if '周' in row['频度']: - return ['background-color: yellow'] * len(row) + return ['background-color: yellow'] * len(row) else: - return ['background-color: gray'] * len(row) - + return ['background-color: gray'] * len(row) class EtaReader(): - def __init__(self,signature,classifylisturl,classifyidlisturl,edbcodedataurl,edbcodelist,edbdatapushurl,edbdeleteurl,edbbusinessurl): + def __init__(self, signature, classifylisturl, classifyidlisturl, edbcodedataurl, edbcodelist, edbdatapushurl, edbdeleteurl, edbbusinessurl): ''' 初始化 EtaReader 类的实例。 @@ -1158,40 +1182,39 @@ class EtaReader(): self.signature = signature self.classifylisturl = classifylisturl self.classifyidlisturl = classifyidlisturl - self.edbcodedataurl = edbcodedataurl + self.edbcodedataurl = edbcodedataurl self.edbdatapushurl = edbdatapushurl - self.edbcodelist = edbcodelist - self.edbdeleteurl = edbdeleteurl + self.edbcodelist = edbcodelist + self.edbdeleteurl = edbdeleteurl self.edbbusinessurl = edbbusinessurl - - def filter_yuanyou_data(self,ClassifyName,data): + def filter_yuanyou_data(self, ClassifyName, data): ''' 指标名称保留规则 ''' - + # 包含 关键词 去除, 返回flase - if any(keyword in data for keyword in ['运费','检修','波动率','地缘政治','股价', - '同比','环比','环差','裂差','4WMA','变频','道琼斯','标普500','纳斯达克', - '四周均值','名占比','残差','DMA', - '连7-连9','4周平均','4周均值','滚动相关性','日本']): + if any(keyword in data for keyword in ['运费', '检修', '波动率', '地缘政治', '股价', + '同比', '环比', '环差', '裂差', '4WMA', '变频', '道琼斯', '标普500', '纳斯达克', + '四周均值', '名占比', '残差', 'DMA', + '连7-连9', '4周平均', '4周均值', '滚动相关性', '日本']): return False - - # 检查需要的特征 + + # 检查需要的特征 # 去掉 分析 分类下的数据 if ClassifyName == '分析': return False - + # 保留 库存中特殊关键词 if ClassifyName == '库存': - if any(keyword in data for keyword in ['原油' , '美国' ,'全球' ,'中国' ,'富查伊拉','ARA' ]): + if any(keyword in data for keyword in ['原油', '美国', '全球', '中国', '富查伊拉', 'ARA']): return True else: pass else: pass - - # 去掉 持仓中不是基金的数据 + + # 去掉 持仓中不是基金的数据 if ClassifyName == '持仓': if '基金' not in data: return False @@ -1199,11 +1222,11 @@ class EtaReader(): pass else: pass - - # 去掉 航班中不是中国、美国 的数据 + + # 去掉 航班中不是中国、美国 的数据 if ClassifyName == '需求': - if '航班' in data : - if '中国' in data or '美国' in data : + if '航班' in data: + if '中国' in data or '美国' in data: return True else: return False @@ -1211,7 +1234,7 @@ class EtaReader(): pass else: pass - + # 分类为 期货市场,同质性数据取第一个 if ClassifyName == '期货市场': # 去掉c1-9 以后的 @@ -1220,33 +1243,33 @@ class EtaReader(): c = int(data.split('c1-c')[1]) except: return False - if c > 9 : + if c > 9: return False else: pass - + else: pass - + # 判断 同质性数据, 字符串开头 - strstartdict = {'ICE Brent c':"ICE Brent c14", - 'NYMWX WTI c':"NYMWX WTI c5", - 'INE SC c':"INE SC c1", - 'EFS c':"EFS c", - 'Dubai Swap c':"Dubai Swap c1", - 'Oman Swap c':"Oman Swap c1", - 'DME Oman c':"DME Oman c1", - 'Murban Futures c':"Murban Futures c1", - 'Dubai连合约价格':'Dubai连1合约价格', - '美国RBOB期货月份合约价格':'美国RBOB期货2309月份合约价格', - 'Brent连合约价格':'Brent连1合约价格', - 'WTI连合约价格':'WTI连1合约价格', - '布伦特连合约价格':'Brent连1合约价格', - 'Brent 连合约价格':'Brent连1合约价格', - 'Dubai连合约价格':'Dubai连1合约价格', - 'Brent连':'Brent连1合约价格', - 'brent连':'Brent连1合约价格', - } + strstartdict = {'ICE Brent c': "ICE Brent c14", + 'NYMWX WTI c': "NYMWX WTI c5", + 'INE SC c': "INE SC c1", + 'EFS c': "EFS c", + 'Dubai Swap c': "Dubai Swap c1", + 'Oman Swap c': "Oman Swap c1", + 'DME Oman c': "DME Oman c1", + 'Murban Futures c': "Murban Futures c1", + 'Dubai连合约价格': 'Dubai连1合约价格', + 
'美国RBOB期货月份合约价格': '美国RBOB期货2309月份合约价格', + 'Brent连合约价格': 'Brent连1合约价格', + 'WTI连合约价格': 'WTI连1合约价格', + '布伦特连合约价格': 'Brent连1合约价格', + 'Brent 连合约价格': 'Brent连1合约价格', + 'Dubai连合约价格': 'Dubai连1合约价格', + 'Brent连': 'Brent连1合约价格', + 'brent连': 'Brent连1合约价格', + } # 判断名称字符串开头是否在 strstartdict.keys中 match = re.match(r'([a-zA-Z\s]+)(\d+)', data) if match: @@ -1260,11 +1283,11 @@ class EtaReader(): # data = 'Brent 连7合约价格' # 判断名称字符串去掉数字后是否在 strstartdict.keys中 match = re.findall(r'\D+', data) - if match : + if match: if len(match) == 2: part1 = match[0] part2 = match[1] - if part1+part2 in [i for i in strstartdict.keys()]: + if part1+part2 in [i for i in strstartdict.keys()]: if data == strstartdict[part1+part2]: return True else: @@ -1274,8 +1297,8 @@ class EtaReader(): elif len(match) == 1: match = re.findall(r'\D+', data) part1 = match[0] - - if part1 in [i for i in strstartdict.keys()]: + + if part1 in [i for i in strstartdict.keys()]: if data == strstartdict[part1]: return True else: @@ -1291,11 +1314,11 @@ class EtaReader(): return True - def filter_pp_data(self,ClassifyName,data): + def filter_pp_data(self, ClassifyName, data): ''' 指标名称保留规则 ''' - + # 包含 关键词 去除, 返回flase # if any(keyword in data for keyword in ['运费','检修','波动率','地缘政治','股价', # '同比','环比','环差','裂差','4WMA','变频','道琼斯','标普500','纳斯达克', @@ -1305,16 +1328,14 @@ class EtaReader(): # 包含 关键词 保留, 返回True if any(keyword in data for keyword in ['拉丝']): return True - - - # 检查需要的特征 + # 检查需要的特征 # 去掉 期货市场 分类下的数据 if ClassifyName == '期货市场': return False else: pass - + # 保留 库存 下所有指标 if ClassifyName == '库存': return True @@ -1338,7 +1359,6 @@ class EtaReader(): return True else: pass - # 保留 需求 下所有指标 if ClassifyName == '需求': @@ -1346,22 +1366,22 @@ class EtaReader(): else: pass - return True # 通过edbcode 获取指标数据 - def edbcodegetdata(self,df,EdbCode,EdbName): + def edbcodegetdata(self, df, EdbCode, EdbName): # 根据指标id,获取指标数据 url = self.edbcodedataurl+str(EdbCode) - # 发送GET请求 - response = requests.get(url, headers=self.headers) + # 发送GET请求 + response = requests.get(url, headers=self.headers) - # 检查响应状态码 - if response.status_code == 200: + # 检查响应状态码 + if response.status_code == 200: data = response.json() # 假设接口返回的是JSON数据 all_data_items = data.get('Data') # 列表转换为DataFrame - df3 = pd.DataFrame(all_data_items, columns=['DataTime', 'Value', 'UpdateTime']) + df3 = pd.DataFrame(all_data_items, columns=[ + 'DataTime', 'Value', 'UpdateTime']) # df3 = pd.read_json(all_data_items, orient='records') # 去掉UpdateTime 列 @@ -1369,18 +1389,19 @@ class EtaReader(): # df3.set_index('DataTime') df3.rename(columns={'Value': EdbName}, inplace=True) # 将数据存储df1 - df = pd.merge(df, df3, how='outer',on='DataTime',suffixes= ('', '_y')) + df = pd.merge(df, df3, how='outer', + on='DataTime', suffixes=('', '_y')) # 按时间排序 df = df.sort_values(by='DataTime', ascending=True) return df - + else: # 请求失败,打印错误信息 logger.info(f'Error: {response.status_code}, {response.text}') # 主动抛出异常 raise Exception(f'Error: {response.status_code}, {response.text}') - def get_eta_api_yuanyou_data(self,data_set,dataset=''): + def get_eta_api_yuanyou_data(self, data_set, dataset=''): ''' 从ETA API获取原油数据 @@ -1392,11 +1413,12 @@ class EtaReader(): None ''' today = datetime.date.today().strftime("%Y-%m-%d") - - # 定义你的headers,这里可以包含多个参数 - self.headers = { - 'nonce': self.signature.nonce, # 例如,一个认证令牌 - 'timestamp': str(self.signature.timestamp), # 自定义的header参数 + + # 定义你的headers,这里可以包含多个参数 + self.headers = { + 'nonce': self.signature.nonce, # 例如,一个认证令牌 + # 自定义的header参数 + 'timestamp': str(self.signature.timestamp), 'appid': 
self.signature.APPID, # 另一个自定义的header参数 'signature': self.signature.signature } @@ -1410,122 +1432,133 @@ class EtaReader(): ''' # 构建新的DataFrame df df1 - df = pd.DataFrame(columns=['指标分类', '指标名称', '指标id', '频度','指标来源','来源id','最后更新时间','更新周期','预警日期','停更周期']) + df = pd.DataFrame(columns=[ + '指标分类', '指标名称', '指标id', '频度', '指标来源', '来源id', '最后更新时间', '更新周期', '预警日期', '停更周期']) df1 = pd.DataFrame(columns=['DataTime']) - # 外网环境无法访问,请确认是否为内网环境 try: # 发送GET请求 获取指标分类列表 - response = requests.get(self.classifylisturl, headers=self.headers) + response = requests.get(self.classifylisturl, headers=self.headers) except requests.exceptions.RequestException as e: - raise Exception(f"请求失败,请确认是否为内网环境: {e}","\033[0m") + raise Exception(f"请求失败,请确认是否为内网环境: {e}", "\033[0m") - # 检查响应状态码 - if response.status_code == 200: + # 检查响应状态码 + if response.status_code == 200: # 获取成功, 处理响应内容 - data = response.json() # 假设接口返回的是JSON数据 - - # 请求成功,处理响应内容 - # logger.info(data.get('Data')) - # 定义你想要保留的固定值 - fixed_value = 1214 - - # 遍历列表,只保留那些'category' key的值为固定值的数据项 - filtered_data = [item for item in data.get('Data') if item.get('ParentId') == fixed_value] + data = response.json() # 假设接口返回的是JSON数据 - #然后循环filtered_data去获取list数据,才能获取到想要获取的ClassifyId + # 请求成功,处理响应内容 + # logger.info(data.get('Data')) + # 定义你想要保留的固定值 + fixed_value = 1214 + + # 遍历列表,只保留那些'category' key的值为固定值的数据项 + filtered_data = [item for item in data.get( + 'Data') if item.get('ParentId') == fixed_value] + + # 然后循环filtered_data去获取list数据,才能获取到想要获取的ClassifyId n = 0 for item in filtered_data: - n+= 1 + n += 1 # if n>50: # break - ClassifyId = item["ClassifyId"] #分类id,分类下的指标列表接口的请求参数 - ClassifyName = item["ClassifyName"] #分类名称,要保存到df的指标分类列 + ClassifyId = item["ClassifyId"] # 分类id,分类下的指标列表接口的请求参数 + ClassifyName = item["ClassifyName"] # 分类名称,要保存到df的指标分类列 # 根据分类id,获取指标列表 url = self.classifyidlisturl+str(ClassifyId) response = requests.get(url, headers=self.headers) - if response.status_code == 200: + if response.status_code == 200: # logger.info(response.text) data2 = response.json() Data = data2.get('Data') for i in Data: # s+= 1 EdbCode = i.get('EdbCode') - EdbName = i.get('EdbName') # 指标名称,要保存到df2的指标名称列,df的指标名称列 + # 指标名称,要保存到df2的指标名称列,df的指标名称列 + EdbName = i.get('EdbName') Frequency = i.get('Frequency') # 频度,要保存到df的频度列 SourceName = i.get('SourceName') # 来源名称,要保存到df的频度列 Source = i.get('Source') # 来源ID,要保存到df的频度列 Unit = i.get('Unit') # 单位,要保存到df的单位列 # 频度不是 日 或者 周的 跳过 - if Frequency not in ['日度','周度','日','周']: + if Frequency not in ['日度', '周度', '日', '周']: continue - + # 只保留手工数据中,名称带有 海运出口 海运进口 if Source == 9 and not ('海运出口' in EdbName or '海运进口' in EdbName): continue - + # 不要wind数据 if Source == 2: continue - # 判断名称是否需要保存 - isSave = self.filter_yuanyou_data(ClassifyName,EdbName) + isSave = self.filter_yuanyou_data( + ClassifyName, EdbName) if isSave: # 保存到df - df1 = self.edbcodegetdata(df1,EdbCode,EdbName) + df1 = self.edbcodegetdata(df1, EdbCode, EdbName) # 取df1所有行最后一列 - edbname_df = df1[['DataTime',f'{EdbName}']] + edbname_df = df1[['DataTime', f'{EdbName}']] edbname_df = edbname_df.dropna() - + if len(edbname_df) == 0: logger.info(f'指标名称:{EdbName} 没有数据') continue try: - time_sequence = edbname_df['DataTime'].values.tolist()[-10:] + time_sequence = edbname_df['DataTime'].values.tolist( + )[-10:] except IndexError: - time_sequence = edbname_df['DataTime'].values.tolist() + time_sequence = edbname_df['DataTime'].values.tolist( + ) # 使用Counter来统计每个星期几的出现次数 from collections import Counter - weekday_counter = Counter(datetime.datetime.strptime(time_str, "%Y-%m-%d").strftime('%A') for 
time_str in time_sequence) + weekday_counter = Counter(datetime.datetime.strptime( + time_str, "%Y-%m-%d").strftime('%A') for time_str in time_sequence) # 打印出现次数最多的星期几 try: - most_common_weekday = weekday_counter.most_common(1)[0][0] + most_common_weekday = weekday_counter.most_common(1)[ + 0][0] # 计算两周后的日期 - warning_date = (datetime.datetime.strptime(time_sequence[-1], "%Y-%m-%d") + datetime.timedelta(weeks=2)).strftime("%Y-%m-%d") - stop_update_period = (datetime.datetime.strptime(today, "%Y-%m-%d") - datetime.datetime.strptime(time_sequence[-1], "%Y-%m-%d")).days // 7 + warning_date = (datetime.datetime.strptime( + time_sequence[-1], "%Y-%m-%d") + datetime.timedelta(weeks=2)).strftime("%Y-%m-%d") + stop_update_period = (datetime.datetime.strptime( + today, "%Y-%m-%d") - datetime.datetime.strptime(time_sequence[-1], "%Y-%m-%d")).days // 7 except IndexError: most_common_weekday = '其他' stop_update_period = 0 if '日' in Frequency: most_common_weekday = '每天' - warning_date = (datetime.datetime.strptime(time_sequence[-1], "%Y-%m-%d") + datetime.timedelta(days=3)).strftime("%Y-%m-%d") - stop_update_period = (datetime.datetime.strptime(today, "%Y-%m-%d") - datetime.datetime.strptime(time_sequence[-1], "%Y-%m-%d")).days + warning_date = (datetime.datetime.strptime( + time_sequence[-1], "%Y-%m-%d") + datetime.timedelta(days=3)).strftime("%Y-%m-%d") + stop_update_period = (datetime.datetime.strptime( + today, "%Y-%m-%d") - datetime.datetime.strptime(time_sequence[-1], "%Y-%m-%d")).days # 保存频度 指标名称 分类 指标id 到 df - df2 = pd.DataFrame({'指标分类': ClassifyName, - '指标名称': EdbName, - '指标id': EdbCode, + df2 = pd.DataFrame({'指标分类': ClassifyName, + '指标名称': EdbName, + '指标id': EdbCode, '单位': Unit, '频度': Frequency, - '指标来源':SourceName, - '来源id':Source, - '最后更新时间':edbname_df['DataTime'].values[-1], - '更新周期':most_common_weekday, - '预警日期':warning_date, - '停更周期':stop_update_period},index=[0], + '指标来源': SourceName, + '来源id': Source, + '最后更新时间': edbname_df['DataTime'].values[-1], + '更新周期': most_common_weekday, + '预警日期': warning_date, + '停更周期': stop_update_period}, index=[0], ) - - # df = pd.merge(df, df2, how='outer') + + # df = pd.merge(df, df2, how='outer') df = pd.concat([df, df2]) else: logger.info(f'跳过指标 {EdbName}') # 找到列表中不在指标列中的指标id,保存成新的list - new_list = [item for item in self.edbcodelist if item not in df['指标id'].tolist()] + new_list = [ + item for item in self.edbcodelist if item not in df['指标id'].tolist()] logger.info(new_list) # 遍历new_list,获取指标数据,保存到df1 for item in new_list: @@ -1536,12 +1569,13 @@ class EtaReader(): except: itemname = item - df1 = self.edbcodegetdata(df1,item,itemname) - df = pd.concat([df, pd.DataFrame({'指标分类': '其他', '指标名称': itemname, '指标id': item, '频度': '其他','指标来源':'其他','来源id':'其他'},index=[0])]) + df1 = self.edbcodegetdata(df1, item, itemname) + df = pd.concat([df, pd.DataFrame( + {'指标分类': '其他', '指标名称': itemname, '指标id': item, '频度': '其他', '指标来源': '其他', '来源id': '其他'}, index=[0])]) # 按时间排序 - df1.sort_values('DataTime',inplace=True,ascending=False) - df1.rename(columns={'DataTime': 'date'},inplace=True) + df1.sort_values('DataTime', inplace=True, ascending=False) + df1.rename(columns={'DataTime': 'date'}, inplace=True) # df1.dropna(inplace=True) # 去掉大于今天日期的行 df1 = df1[df1['date'] <= datetime.datetime.now().strftime('%Y-%m-%d')] @@ -1550,16 +1584,17 @@ class EtaReader(): df_zhibiaoshuju = df1.copy() df_zhibiaoliebiao = df.copy() - return df_zhibiaoshuju,df_zhibiaoliebiao + return df_zhibiaoshuju, df_zhibiaoliebiao - def get_eta_api_pp_data(self,data_set,dataset=''): + def 
get_eta_api_pp_data(self, data_set, dataset=''): global ClassifyId today = datetime.date.today().strftime("%Y-%m-%d") - - # 定义你的headers,这里可以包含多个参数 - self.headers = { - 'nonce': self.signature.nonce, # 例如,一个认证令牌 - 'timestamp': str(self.signature.timestamp), # 自定义的header参数 + + # 定义你的headers,这里可以包含多个参数 + self.headers = { + 'nonce': self.signature.nonce, # 例如,一个认证令牌 + # 自定义的header参数 + 'timestamp': str(self.signature.timestamp), 'appid': self.signature.APPID, # 另一个自定义的header参数 'signature': self.signature.signature } @@ -1576,66 +1611,69 @@ class EtaReader(): df = pd.DataFrame(columns=['指标分类', '指标名称', '指标id', '频度']) df1 = pd.DataFrame(columns=['DataTime']) - # 外网环境无法访问,请确认是否为内网环境 try: # 发送GET请求 获取指标分类列表 - response = requests.get(self.classifylisturl, headers=self.headers) + response = requests.get(self.classifylisturl, headers=self.headers) except requests.exceptions.RequestException as e: - raise Exception(f"请求失败,请确认是否为内网环境: {e}","\033[0m") + raise Exception(f"请求失败,请确认是否为内网环境: {e}", "\033[0m") - # 检查响应状态码 - if response.status_code == 200: + # 检查响应状态码 + if response.status_code == 200: # 获取成功, 处理响应内容 - data = response.json() # 假设接口返回的是JSON数据 - - # 请求成功,处理响应内容 - # logger.info(data.get('Data')) - # 定义你想要保留的固定值 - fixed_value = ClassifyId - - # 遍历列表,只保留那些'category' key的值为固定值的数据项 - filtered_data = [item for item in data.get('Data') if item.get('ParentId') == fixed_value] + data = response.json() # 假设接口返回的是JSON数据 - #然后循环filtered_data去获取list数据,才能获取到想要获取的ClassifyId + # 请求成功,处理响应内容 + # logger.info(data.get('Data')) + # 定义你想要保留的固定值 + fixed_value = ClassifyId + + # 遍历列表,只保留那些'category' key的值为固定值的数据项 + filtered_data = [item for item in data.get( + 'Data') if item.get('ParentId') == fixed_value] + + # 然后循环filtered_data去获取list数据,才能获取到想要获取的ClassifyId n = 0 for item in filtered_data: - n+= 1 + n += 1 # if n>50: # break - ClassifyId = item["ClassifyId"] #分类id,分类下的指标列表接口的请求参数 - ClassifyName = item["ClassifyName"] #分类名称,要保存到df的指标分类列 + ClassifyId = item["ClassifyId"] # 分类id,分类下的指标列表接口的请求参数 + ClassifyName = item["ClassifyName"] # 分类名称,要保存到df的指标分类列 # 根据分类id,获取指标列表 url = self.classifyidlisturl+str(ClassifyId) response = requests.get(url, headers=self.headers) - if response.status_code == 200: + if response.status_code == 200: # logger.info(response.text) data2 = response.json() Data = data2.get('Data') for i in Data: # s+= 1 EdbCode = i.get('EdbCode') - EdbName = i.get('EdbName') # 指标名称,要保存到df2的指标名称列,df的指标名称列 + # 指标名称,要保存到df2的指标名称列,df的指标名称列 + EdbName = i.get('EdbName') Frequency = i.get('Frequency') # 频度,要保存到df的频度列 # 频度不是 日 或者 周的 跳过 - if Frequency not in ['日度','周度','日','周']: + if Frequency not in ['日度', '周度', '日', '周']: continue - + # 判断名称是否需要保存 - isSave = self.filter_pp_data(ClassifyName,EdbName) + isSave = self.filter_pp_data(ClassifyName, EdbName) if isSave: # 保存到df # 保存频度 指标名称 分类 指标id 到 df - df2 = pd.DataFrame({'指标分类': ClassifyName, '指标名称': EdbName, '指标id': EdbCode, '频度': Frequency},index=[0]) - - # df = pd.merge(df, df2, how='outer') + df2 = pd.DataFrame( + {'指标分类': ClassifyName, '指标名称': EdbName, '指标id': EdbCode, '频度': Frequency}, index=[0]) + + # df = pd.merge(df, df2, how='outer') df = pd.concat([df, df2]) - df1 = self.edbcodegetdata(df1,EdbCode,EdbName) + df1 = self.edbcodegetdata(df1, EdbCode, EdbName) else: logger.info(f'跳过指标 {EdbName}') # 找到列表中不在指标列中的指标id,保存成新的list - new_list = [item for item in self.edbcodelist if item not in df['指标id'].tolist()] + new_list = [ + item for item in self.edbcodelist if item not in df['指标id'].tolist()] logger.info(new_list) # 遍历new_list,获取指标数据,保存到df1 for item in new_list: @@ 
-1646,85 +1684,89 @@ class EtaReader(): except: itemname = item - df1 = self.edbcodegetdata(df1,item,itemname) - df = pd.concat([df, pd.DataFrame({'指标分类': '其他', '指标名称': itemname, '指标id': item, '频度': '其他'},index=[0])]) + df1 = self.edbcodegetdata(df1, item, itemname) + df = pd.concat([df, pd.DataFrame( + {'指标分类': '其他', '指标名称': itemname, '指标id': item, '频度': '其他'}, index=[0])]) # 按时间排序 - df1.sort_values('DataTime',inplace=True,ascending=False) - df1.rename(columns={'DataTime': 'date'},inplace=True) + df1.sort_values('DataTime', inplace=True, ascending=False) + df1.rename(columns={'DataTime': 'date'}, inplace=True) # df1.dropna(inplace=True) # 去掉大于今天日期的行 df1 = df1[df1['date'] <= datetime.datetime.now().strftime('%Y-%m-%d')] logger.info(df1.head()) # logger.info(f'{df1.head()}') # 保存到xlsx文件的sheet表 - with pd.ExcelWriter(os.path.join(dataset,data_set)) as file: + with pd.ExcelWriter(os.path.join(dataset, data_set)) as file: df1.to_excel(file, sheet_name='指标数据', index=False) df.to_excel(file, sheet_name='指标列表', index=False) df_zhibiaoshuju = df1.copy() df_zhibiaoliebiao = df.copy() - return df_zhibiaoshuju,df_zhibiaoliebiao + return df_zhibiaoshuju, df_zhibiaoliebiao - def push_data(self,data): + def push_data(self, data): today = datetime.date.today().strftime("%Y-%m-%d") - - # 定义你的headers,这里可以包含多个参数 - self.headers = { - 'nonce': self.signature.nonce, # 例如,一个认证令牌 - 'timestamp': str(self.signature.timestamp), # 自定义的header参数 + + # 定义你的headers,这里可以包含多个参数 + self.headers = { + 'nonce': self.signature.nonce, # 例如,一个认证令牌 + # 自定义的header参数 + 'timestamp': str(self.signature.timestamp), 'appid': self.signature.APPID, # 另一个自定义的header参数 'signature': self.signature.signature } # 发送post请求 上传数据 logger.info(f'请求参数:{data}') - response = requests.post(self.edbdatapushurl, headers=self.headers,data=json.dumps(data)) + response = requests.post( + self.edbdatapushurl, headers=self.headers, data=json.dumps(data)) - # 检查响应状态码 - if response.status_code == 200: + # 检查响应状态码 + if response.status_code == 200: data = response.json() # 假设接口返回的是JSON数据 - + logger.info(f'上传成功,响应为:{data}') - + else: # 请求失败,打印错误信息 logger.info(f'Error: {response.status_code}, {response.text}') # 主动抛出异常 raise Exception(f'Error: {response.status_code}, {response.text}') - def del_zhibiao(self,IndexCodeList): + def del_zhibiao(self, IndexCodeList): today = datetime.date.today().strftime("%Y-%m-%d") - - # 定义你的headers,这里可以包含多个参数 - self.headers = { - 'nonce': self.signature.nonce, # 例如,一个认证令牌 - 'timestamp': str(self.signature.timestamp), # 自定义的header参数 + + # 定义你的headers,这里可以包含多个参数 + self.headers = { + 'nonce': self.signature.nonce, # 例如,一个认证令牌 + # 自定义的header参数 + 'timestamp': str(self.signature.timestamp), 'appid': self.signature.APPID, # 另一个自定义的header参数 'signature': self.signature.signature } data = { - "IndexCodeList": IndexCodeList #指标编码列表 - } + "IndexCodeList": IndexCodeList # 指标编码列表 + } # 发送post请求 上传数据 - response = requests.post(self.edbdeleteurl, headers=self.headers,data=json.dumps(data)) + response = requests.post( + self.edbdeleteurl, headers=self.headers, data=json.dumps(data)) - - # 检查响应状态码 - if response.status_code == 200: + # 检查响应状态码 + if response.status_code == 200: data = response.json() # 假设接口返回的是JSON数据 - + logger.info('删除成功,响应为:', data) - + else: # 请求失败,打印错误信息 logger.info(f'Error: {response.status_code}, {response.text}') # 主动抛出异常 raise Exception(f'Error: {response.status_code}, {response.text}') - def del_business(self,data): + def del_business(self, data): '''' 接口地址 
https://console-docs.apipost.cn/preview/fce869601d0be1d9/9a637c2f9ed0c589?target_id=d3cafcbf-a68c-42b3-b105-7bbd0e95a9cd @@ -1737,26 +1779,26 @@ class EtaReader(): } ''' today = datetime.date.today().strftime("%Y-%m-%d") - - # 定义你的headers,这里可以包含多个参数 - self.headers = { - 'nonce': self.signature.nonce, # 例如,一个认证令牌 - 'timestamp': str(self.signature.timestamp), # 自定义的header参数 + + # 定义你的headers,这里可以包含多个参数 + self.headers = { + 'nonce': self.signature.nonce, # 例如,一个认证令牌 + # 自定义的header参数 + 'timestamp': str(self.signature.timestamp), 'appid': self.signature.APPID, # 另一个自定义的header参数 'signature': self.signature.signature } - # 发送post请求 上传数据 - response = requests.post(self.edbbusinessurl, headers=self.headers,data=json.dumps(data)) + response = requests.post( + self.edbbusinessurl, headers=self.headers, data=json.dumps(data)) - - # 检查响应状态码 - if response.status_code == 200: + # 检查响应状态码 + if response.status_code == 200: data = response.json() # 假设接口返回的是JSON数据 - + logger.info('删除成功,响应为:', data) - + else: # 请求失败,打印错误信息 logger.info(f'Error: {response.status_code}, {response.text}') @@ -1764,18 +1806,20 @@ class EtaReader(): raise Exception(f'Error: {response.status_code}, {response.text}') -def get_market_data(end_time,df): +def get_market_data(end_time, df): """ 获取市场数据,拼接到df中 """ # 获取token token = get_head_auth_report() # 定义请求参数 - query_data_list_item_nos_data['data']['dateEnd'] = end_time.replace('-','') + query_data_list_item_nos_data['data']['dateEnd'] = end_time.replace( + '-', '') # 发送请求 headers = {"Authorization": token} logger.info('获取数据中...') - items_res = requests.post(url=query_data_list_item_nos_url, headers=headers, json=query_data_list_item_nos_data, timeout=(3, 35)) + items_res = requests.post(url=query_data_list_item_nos_url, headers=headers, + json=query_data_list_item_nos_data, timeout=(3, 35)) json_data = json.loads(items_res.text) logger.info(f"获取到的数据:{json_data}") df3 = pd.DataFrame(json_data['data']) @@ -1793,20 +1837,21 @@ def get_market_data(end_time,df): # 20240101 转换为 2024-01-01 df2['date'] = pd.to_datetime(df2['date'], format='%Y%m%d') df2['date'] = df2['date'].dt.strftime('%Y-%m-%d') - df = pd.merge(df, df2, how='left',on='date') + df = pd.merge(df, df2, how='left', on='date') return df def get_high_low_data(df): # 读取excel 从第五行开始 - df1 = pd.read_excel(os.path.join(dataset,'数据项下载.xls'),header=5, names=['numid','date', 'Brentzdj', 'Brentzgj']) + df1 = pd.read_excel(os.path.join(dataset, '数据项下载.xls'), header=5, names=[ + 'numid', 'date', 'Brentzdj', 'Brentzgj']) # 合并数据 - df = pd.merge(df, df1, how='left',on='date') + df = pd.merge(df, df1, how='left', on='date') return df - + # 时间特征,年,月,一年的多少天,周几,第几周,第几季度,每月的第几天, 每季度的第几天,是否每月的第一天,是否每月的最后一天,是否每季度的第一天,是否每季度的最后一天,是否每年的第一天,是否每年的最后一天 -def addtimecharacteristics(df,dataset): +def addtimecharacteristics(df, dataset): """ 为输入的 DataFrame 添加日期相关信息列 @@ -1823,12 +1868,12 @@ def addtimecharacteristics(df,dataset): df['weekofyear'] = df['ds'].dt.isocalendar().week df['dayofyear'] = df['ds'].dt.dayofyear df['quarternum'] = df['ds'].dt.quarter - # 将ds列转换为季度Period对象 - df['quarter'] = df['ds'].dt.to_period('Q') - # 获取每个季度的开始日期 - df['quarter_start'] = df['quarter'].dt.to_timestamp('s') - # 计算每个日期是所在季度的第几天 - df['dayofquarter'] = (df['ds'] - df['quarter_start']).dt.days + 1 + # 将ds列转换为季度Period对象 + df['quarter'] = df['ds'].dt.to_period('Q') + # 获取每个季度的开始日期 + df['quarter_start'] = df['quarter'].dt.to_timestamp('s') + # 计算每个日期是所在季度的第几天 + df['dayofquarter'] = (df['ds'] - df['quarter_start']).dt.days + 1 # 是否月初 df['is_month_start'] = 
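# --- Illustrative sketch (editor's note, not repository code) -------------------
# addtimecharacteristics above derives "day of quarter" by converting ds to a
# quarterly Period, taking that quarter's start date, and counting days from it.
# The same trick in isolation, on throwaway example dates:
import pandas as pd

dates = pd.Series(pd.to_datetime(['2024-02-14', '2024-09-06']))
quarter_start = dates.dt.to_period('Q').dt.to_timestamp()   # first calendar day of each quarter
day_of_quarter = (dates - quarter_start).dt.days + 1        # 1-based position inside the quarter
print(list(day_of_quarter))                                 # [45, 68]
# --------------------------------------------------------------------------------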
df['ds'].dt.is_month_start.astype(int) # 是否月末 @@ -1843,14 +1888,18 @@ def addtimecharacteristics(df,dataset): df['is_year_end'] = df['ds'].dt.is_year_end.astype(int) # 添加月度第几周(周一到周日为一周,每月1日所在的周为第一周) # 计算当前日期所在周的周一 - df['current_monday'] = df['ds'] - pd.to_timedelta(df['ds'].dt.dayofweek, unit='D') + df['current_monday'] = df['ds'] - \ + pd.to_timedelta(df['ds'].dt.dayofweek, unit='D') # 计算当月1日所在周的周一 - df['first_monday'] = df['ds'].dt.to_period('M').dt.start_time - pd.to_timedelta(df['ds'].dt.to_period('M').dt.start_time.dt.dayofweek, unit='D') + df['first_monday'] = df['ds'].dt.to_period('M').dt.start_time - pd.to_timedelta( + df['ds'].dt.to_period('M').dt.start_time.dt.dayofweek, unit='D') # 计算周数差并+1得到周数 - df['weekofmonth'] = ((df['current_monday'] - df['first_monday']).dt.days // 7) + 1 - df['yearmonthweeks'] = df['year'].astype(str) + df['month'].astype(str) + df['weekofmonth'].astype(str) + df['weekofmonth'] = ( + (df['current_monday'] - df['first_monday']).dt.days // 7) + 1 + df['yearmonthweeks'] = df['year'].astype( + str) + df['month'].astype(str) + df['weekofmonth'].astype(str) df.drop(columns=['current_monday', 'first_monday'], inplace=True) - # 去掉 quarter_start quarter - df.drop(columns=['quarter_start','quarter'],inplace=True) - df.to_csv(os.path.join(dataset,'指标数据添加时间特征.csv'), index=False) + # 去掉 quarter_start quarter + df.drop(columns=['quarter_start', 'quarter'], inplace=True) + df.to_csv(os.path.join(dataset, '指标数据添加时间特征.csv'), index=False) return df diff --git a/main_yuanyou_zhoudu.py b/main_yuanyou_zhoudu.py index f5fc47b..441b236 100644 --- a/main_yuanyou_zhoudu.py +++ b/main_yuanyou_zhoudu.py @@ -1,14 +1,15 @@ # 读取配置 + from lib.dataread import * -from lib.tools import SendMail,exception_logger -from models.nerulforcastmodels import ex_Model,model_losss,model_losss_juxiting,brent_export_pdf,tansuanli_export_pdf,pp_export_pdf,model_losss_juxiting +# from config_jingbo_zhoudu import * +from lib.tools import SendMail, exception_logger +from models.nerulforcastmodels import ex_Model, model_losss, model_losss_juxiting, brent_export_pdf, tansuanli_export_pdf, pp_export_pdf, model_losss_juxiting import glob import torch torch.set_float32_matmul_precision("high") - def predict_main(): """ 主预测函数,用于从 ETA 获取数据、处理数据、训练模型并进行预测。 @@ -72,7 +73,8 @@ def predict_main(): edbdeleteurl=edbdeleteurl, edbbusinessurl=edbbusinessurl, ) - df_zhibiaoshuju, df_zhibiaoliebiao = etadata.get_eta_api_yuanyou_data(data_set=data_set, dataset=dataset) # 原始数据,未处理 + df_zhibiaoshuju, df_zhibiaoliebiao = etadata.get_eta_api_yuanyou_data( + data_set=data_set, dataset=dataset) # 原始数据,未处理 if is_market: logger.info('从市场信息平台获取数据...') @@ -83,26 +85,26 @@ def predict_main(): df_zhibiaoshuju = get_high_low_data(df_zhibiaoshuju) else: logger.info('从市场信息平台获取数据') - df_zhibiaoshuju = get_market_data(end_time,df_zhibiaoshuju) - - except : + df_zhibiaoshuju = get_market_data( + end_time, df_zhibiaoshuju) + + except: logger.info('最高最低价拼接失败') - + # 保存到xlsx文件的sheet表 - with pd.ExcelWriter(os.path.join(dataset,data_set)) as file: + with pd.ExcelWriter(os.path.join(dataset, data_set)) as file: df_zhibiaoshuju.to_excel(file, sheet_name='指标数据', index=False) df_zhibiaoliebiao.to_excel(file, sheet_name='指标列表', index=False) - - + # 数据处理 df = datachuli(df_zhibiaoshuju, df_zhibiaoliebiao, y=y, dataset=dataset, add_kdj=add_kdj, is_timefurture=is_timefurture, - end_time=end_time) + end_time=end_time) else: # 读取数据 logger.info('读取本地数据:' + os.path.join(dataset, data_set)) - df,df_zhibiaoliebiao = getdata(filename=os.path.join(dataset, 
data_set), y=y, dataset=dataset, add_kdj=add_kdj, - is_timefurture=is_timefurture, end_time=end_time) # 原始数据,未处理 + df, df_zhibiaoliebiao = getdata(filename=os.path.join(dataset, data_set), y=y, dataset=dataset, add_kdj=add_kdj, + is_timefurture=is_timefurture, end_time=end_time) # 原始数据,未处理 # 更改预测列名称 df.rename(columns={y: 'y'}, inplace=True) @@ -126,31 +128,37 @@ def predict_main(): row_dict = row._asdict() # row_dict['ds'] = row_dict['ds'].strftime('%Y-%m-%d') # row_dict['ds'] = row_dict['ds'].strftime('%Y-%m-%d %H:%M:%S') - check_query = sqlitedb.select_data('trueandpredict', where_condition=f"ds = '{row.ds}'") + check_query = sqlitedb.select_data( + 'trueandpredict', where_condition=f"ds = '{row.ds}'") if len(check_query) > 0: - set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()]) - sqlitedb.update_data('trueandpredict', set_clause, where_condition=f"ds = '{row.ds}'") + set_clause = ", ".join( + [f"{key} = '{value}'" for key, value in row_dict.items()]) + sqlitedb.update_data( + 'trueandpredict', set_clause, where_condition=f"ds = '{row.ds}'") continue - sqlitedb.insert_data('trueandpredict', tuple(row_dict.values()), columns=row_dict.keys()) + sqlitedb.insert_data('trueandpredict', tuple( + row_dict.values()), columns=row_dict.keys()) # 更新accuracy表的y值 if not sqlitedb.check_table_exists('accuracy'): pass else: - update_y = sqlitedb.select_data('accuracy',where_condition="y is null") + update_y = sqlitedb.select_data( + 'accuracy', where_condition="y is null") if len(update_y) > 0: logger.info('更新accuracy表的y值') # 找到update_y 中ds且df中的y的行 - update_y = update_y[update_y['ds']<=end_time] + update_y = update_y[update_y['ds'] <= end_time] logger.info(f'要更新y的信息:{update_y}') # try: for row in update_y.itertuples(index=False): try: - row_dict = row._asdict() - yy = df[df['ds']==row_dict['ds']]['y'].values[0] - LOW = df[df['ds']==row_dict['ds']]['Brentzdj'].values[0] - HIGH = df[df['ds']==row_dict['ds']]['Brentzgj'].values[0] - sqlitedb.update_data('accuracy', f"y = {yy},LOW_PRICE = {LOW},HIGH_PRICE = {HIGH}", where_condition=f"ds = '{row_dict['ds']}'") + row_dict = row._asdict() + yy = df[df['ds'] == row_dict['ds']]['y'].values[0] + LOW = df[df['ds'] == row_dict['ds']]['Brentzdj'].values[0] + HIGH = df[df['ds'] == row_dict['ds']]['Brentzgj'].values[0] + sqlitedb.update_data( + 'accuracy', f"y = {yy},LOW_PRICE = {LOW},HIGH_PRICE = {HIGH}", where_condition=f"ds = '{row_dict['ds']}'") except: logger.info(f'更新accuracy表的y值失败:{row_dict}') # except Exception as e: @@ -162,10 +170,12 @@ def predict_main(): if is_weekday: logger.info('今天是周一,更新预测模型') # 计算最近60天预测残差最低的模型名称 - model_results = sqlitedb.select_data('trueandpredict', order_by="ds DESC", limit="60") + model_results = sqlitedb.select_data( + 'trueandpredict', order_by="ds DESC", limit="60") # 删除空值率为90%以上的列 if len(model_results) > 10: - model_results = model_results.dropna(thresh=len(model_results)*0.1,axis=1) + model_results = model_results.dropna( + thresh=len(model_results)*0.1, axis=1) # 删除空行 model_results = model_results.dropna() modelnames = model_results.columns.to_list()[2:-1] @@ -173,47 +183,59 @@ def predict_main(): model_results[col] = model_results[col].astype(np.float32) # 计算每个预测值与真实值之间的偏差率 for model in modelnames: - model_results[f'{model}_abs_error_rate'] = abs(model_results['y'] - model_results[model]) / model_results['y'] + model_results[f'{model}_abs_error_rate'] = abs( + model_results['y'] - model_results[model]) / model_results['y'] # 获取每行对应的最小偏差率值 - min_abs_error_rate_values = 
model_results.apply(lambda row: row[[f'{model}_abs_error_rate' for model in modelnames]].min(), axis=1) + min_abs_error_rate_values = model_results.apply( + lambda row: row[[f'{model}_abs_error_rate' for model in modelnames]].min(), axis=1) # 获取每行对应的最小偏差率值对应的列名 - min_abs_error_rate_column_name = model_results.apply(lambda row: row[[f'{model}_abs_error_rate' for model in modelnames]].idxmin(), axis=1) + min_abs_error_rate_column_name = model_results.apply( + lambda row: row[[f'{model}_abs_error_rate' for model in modelnames]].idxmin(), axis=1) # 将列名索引转换为列名 - min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0]) + min_abs_error_rate_column_name = min_abs_error_rate_column_name.map( + lambda x: x.split('_')[0]) # 取出现次数最多的模型名称 most_common_model = min_abs_error_rate_column_name.value_counts().idxmax() logger.info(f"最近60天预测残差最低的模型名称:{most_common_model}") # 保存结果到数据库 if not sqlitedb.check_table_exists('most_model'): - sqlitedb.create_table('most_model', columns="ds datetime, most_common_model TEXT") - sqlitedb.insert_data('most_model', (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), most_common_model,), columns=('ds', 'most_common_model',)) + sqlitedb.create_table( + 'most_model', columns="ds datetime, most_common_model TEXT") + sqlitedb.insert_data('most_model', (datetime.datetime.now().strftime( + '%Y-%m-%d %H:%M:%S'), most_common_model,), columns=('ds', 'most_common_model',)) try: if is_weekday: - # if True: + # if True: logger.info('今天是周一,发送特征预警') # 上传预警信息到数据库 warning_data_df = df_zhibiaoliebiao.copy() - warning_data_df = warning_data_df[warning_data_df['停更周期']> 3 ][['指标名称', '指标id', '频度','更新周期','指标来源','最后更新时间','停更周期']] + warning_data_df = warning_data_df[warning_data_df['停更周期'] > 3][[ + '指标名称', '指标id', '频度', '更新周期', '指标来源', '最后更新时间', '停更周期']] # 重命名列名 - warning_data_df = warning_data_df.rename(columns={'指标名称': 'INDICATOR_NAME', '指标id': 'INDICATOR_ID', '频度': 'FREQUENCY', '更新周期': 'UPDATE_FREQUENCY', '指标来源': 'DATA_SOURCE', '最后更新时间': 'LAST_UPDATE_DATE', '停更周期': 'UPDATE_SUSPENSION_CYCLE'}) + warning_data_df = warning_data_df.rename(columns={'指标名称': 'INDICATOR_NAME', '指标id': 'INDICATOR_ID', '频度': 'FREQUENCY', + '更新周期': 'UPDATE_FREQUENCY', '指标来源': 'DATA_SOURCE', '最后更新时间': 'LAST_UPDATE_DATE', '停更周期': 'UPDATE_SUSPENSION_CYCLE'}) from sqlalchemy import create_engine import urllib global password if '@' in password: password = urllib.parse.quote_plus(password) - engine = create_engine(f'mysql+pymysql://{dbusername}:{password}@{host}:{port}/{dbname}') - warning_data_df['WARNING_DATE'] = datetime.date.today().strftime("%Y-%m-%d %H:%M:%S") - warning_data_df['TENANT_CODE'] = 'T0004' + engine = create_engine( + f'mysql+pymysql://{dbusername}:{password}@{host}:{port}/{dbname}') + warning_data_df['WARNING_DATE'] = datetime.date.today().strftime( + "%Y-%m-%d %H:%M:%S") + warning_data_df['TENANT_CODE'] = 'T0004' # 插入数据之前查询表数据然后新增id列 existing_data = pd.read_sql(f"SELECT * FROM {table_name}", engine) if not existing_data.empty: max_id = existing_data['ID'].astype(int).max() - warning_data_df['ID'] = range(max_id + 1, max_id + 1 + len(warning_data_df)) + warning_data_df['ID'] = range( + max_id + 1, max_id + 1 + len(warning_data_df)) else: warning_data_df['ID'] = range(1, 1 + len(warning_data_df)) - warning_data_df.to_sql(table_name, con=engine, if_exists='append', index=False) + warning_data_df.to_sql( + table_name, con=engine, if_exists='append', index=False) if is_update_warning_data: upload_warning_info(len(warning_data_df)) except: @@ -248,30 +270,29 @@ def 
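# --- Illustrative sketch (editor's note, not repository code) -------------------
# The 60-day model-selection block above works per row: compute each model's relative
# error against the realised y, find the column with the smallest error for every day,
# then keep the model that wins most often. Equivalent pandas logic on a toy frame
# (column names and numbers are made up for the example):
import pandas as pd

def most_common_best_model(results: pd.DataFrame, model_cols: list) -> str:
    errors = results[model_cols].sub(results['y'], axis=0).abs().div(results['y'], axis=0)
    daily_winner = errors.idxmin(axis=1)          # best model per day
    return daily_winner.value_counts().idxmax()   # model that wins the most days

demo = pd.DataFrame({'y':     [75.0, 76.0, 77.0],
                     'NHITS': [74.8, 76.5, 77.1],
                     'LSTM':  [75.9, 76.1, 78.0]})
print(most_common_best_model(demo, ['NHITS', 'LSTM']))   # -> 'NHITS'
# --------------------------------------------------------------------------------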
predict_main(): end_time=end_time, ) - logger.info('模型训练完成') - + logger.info('训练数据绘图ing') - model_results3 = model_losss(sqlitedb,end_time=end_time) + model_results3 = model_losss(sqlitedb, end_time=end_time) logger.info('训练数据绘图end') - + # # 模型报告 logger.info('制作报告ing') - title = f'{settings}--{end_time}-预测报告' # 报告标题 - reportname = f'Brent原油大模型周度预测--{end_time}.pdf' # 报告文件名 - reportname = reportname.replace(':', '-') # 替换冒号 - brent_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time, - reportname=reportname,sqlitedb=sqlitedb), + title = f'{settings}--{end_time}-预测报告' # 报告标题 + reportname = f'Brent原油大模型周度预测--{end_time}.pdf' # 报告文件名 + reportname = reportname.replace(':', '-') # 替换冒号 + brent_export_pdf(dataset=dataset, num_models=5 if is_fivemodels else 22, time=end_time, + reportname=reportname, sqlitedb=sqlitedb), logger.info('制作报告end') logger.info('模型训练完成') # # LSTM 单变量模型 # ex_Lstm(df,input_seq_len=input_size,output_seq_len=horizon,is_debug=is_debug,dataset=dataset) - + # # lstm 多变量模型 # ex_Lstm_M(df,n_days=input_size,out_days=horizon,is_debug=is_debug,datasetpath=dataset) - + # # GRU 模型 # # ex_GRU(df) @@ -285,7 +306,7 @@ def predict_main(): # file=max(glob.glob(os.path.join(dataset,'*.pdf')), key=os.path.getctime), # ssl=ssl, # ) - # m.send_mail() + # m.send_mail() if __name__ == '__main__': @@ -295,4 +316,4 @@ if __name__ == '__main__': # end_time = i_time.strftime('%Y-%m-%d') # predict_main() - predict_main() \ No newline at end of file + predict_main() diff --git a/models/nerulforcastmodels.py b/models/nerulforcastmodels.py index 90f5c92..621a0b3 100644 --- a/models/nerulforcastmodels.py +++ b/models/nerulforcastmodels.py @@ -6,13 +6,13 @@ import seaborn as sns import matplotlib.pyplot as plt import matplotlib.dates as mdates import datetime -from lib.tools import Graphs,mse,rmse,mae,exception_logger -from lib.tools import save_to_database,get_week_date +from lib.tools import Graphs, mse, rmse, mae, exception_logger +from lib.tools import save_to_database, get_week_date from lib.dataread import * from neuralforecast import NeuralForecast -from neuralforecast.models import NHITS,Informer, NBEATSx,LSTM,PatchTST, iTransformer, TSMixer +from neuralforecast.models import NHITS, Informer, NBEATSx, LSTM, PatchTST, iTransformer, TSMixer from neuralforecast.models import RNN, GRU, TCN, DeepAR, DilatedRNN, MLP, NBEATS, DLinear, NLinear, TFT, VanillaTransformer -from neuralforecast.models import Autoformer, PatchTST, FEDformer, StemGNN, HINT, TSMixer, TSMixerx, MLPMultivariate, BiTCN, TiDE, DeepNPTS +from neuralforecast.models import Autoformer, PatchTST, FEDformer, StemGNN, HINT, TSMixer, TSMixerx, MLPMultivariate, BiTCN, TiDE, DeepNPTS from tensorflow.keras.losses import MAE from scipy.stats import spearmanr from sklearn.preprocessing import MinMaxScaler @@ -24,7 +24,7 @@ from lib.duojinchengpredict import testSetPredict from reportlab.platypus import Table, SimpleDocTemplate, Paragraph, Image # 报告内容相关类 from reportlab.lib.pagesizes import letter # 页面的标志尺寸(8.5*inch, 11*inch) from reportlab.pdfbase import pdfmetrics # 注册字体 -from reportlab.pdfbase.ttfonts import TTFont # 字体类 +from reportlab.pdfbase.ttfonts import TTFont # 字体类 from reportlab.platypus import Table, SimpleDocTemplate, Paragraph, Image # 报告内容相关类 from reportlab.lib.pagesizes import letter # 页面的标志尺寸(8.5*inch, 11*inch) from reportlab.lib.styles import getSampleStyleSheet # 文本样式 @@ -38,9 +38,9 @@ pdfmetrics.registerFont(TTFont('SimSun', 'SimSun.ttf')) @exception_logger -def 
ex_Model(df,horizon,input_size,train_steps,val_check_steps,early_stop_patience_steps, - is_debug,dataset,is_train,is_fivemodels,val_size,test_size,settings,now, - etadata,modelsindex,data,is_eta,end_time): +def ex_Model(df, horizon, input_size, train_steps, val_check_steps, early_stop_patience_steps, + is_debug, dataset, is_train, is_fivemodels, val_size, test_size, settings, now, + etadata, modelsindex, data, is_eta, end_time): ''' 模型训练与预测 :param df: 数据集 @@ -83,10 +83,11 @@ def ex_Model(df,horizon,input_size,train_steps,val_check_steps,early_stop_patien # 'DeepNPT', # ] - df= df.replace(',', '', regex=True) + df = df.replace(',', '', regex=True) df = df.rename(columns={'date': 'ds'}) df['y'] = pd.to_numeric(df['y'], errors='coerce') - df['ds'] = pd.to_datetime(df['ds'], errors='coerce') # 使用errors='coerce'来处理无效日期 + # 使用errors='coerce'来处理无效日期 + df['ds'] = pd.to_datetime(df['ds'], errors='coerce') # df 数值列转为 float32 for col in df.select_dtypes(include=['int']).columns: df[col] = df[col].astype(np.float32) @@ -95,8 +96,6 @@ def ex_Model(df,horizon,input_size,train_steps,val_check_steps,early_stop_patien plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 - - # 不筛选特征用下面的 df_reg = df df_reg.sort_values('ds', inplace=True) @@ -104,7 +103,7 @@ def ex_Model(df,horizon,input_size,train_steps,val_check_steps,early_stop_patien df_reg = df_reg[-1000:-1] # 计算训练集的结束索引,占总数据的90% - split_index = int(0.9* len(df_reg)) + split_index = int(0.9 * len(df_reg)) # 按照时间顺序划分训练集和测试集 df_train = df_reg[:split_index] @@ -119,46 +118,66 @@ def ex_Model(df,horizon,input_size,train_steps,val_check_steps,early_stop_patien logger.info("\nTesting set head:") logger.info(df_test.head()) - models = [ - NHITS (h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', activation='ReLU', early_stop_patience_steps=early_stop_patience_steps), - Informer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps ), - LSTM(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - iTransformer(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - TSMixer(h=horizon, input_size=input_size, n_series = 1, max_steps=train_steps, early_stop_patience_steps=early_stop_patience_steps), - TSMixerx(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, early_stop_patience_steps=early_stop_patience_steps), - PatchTST(h=horizon, input_size=input_size, max_steps=train_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - RNN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - GRU(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - TCN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + NHITS(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', activation='ReLU', 
early_stop_patience_steps=early_stop_patience_steps), + Informer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + LSTM(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + iTransformer(h=horizon, input_size=input_size, n_series=1, max_steps=train_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + TSMixer(h=horizon, input_size=input_size, n_series=1, max_steps=train_steps, + early_stop_patience_steps=early_stop_patience_steps), + TSMixerx(h=horizon, input_size=input_size, n_series=1, max_steps=train_steps, + early_stop_patience_steps=early_stop_patience_steps), + PatchTST(h=horizon, input_size=input_size, max_steps=train_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + RNN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + GRU(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + TCN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), # DeepAR(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - BiTCN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - DilatedRNN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - MLP(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - DLinear(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - NLinear(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - TFT(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - FEDformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - StemGNN(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - MLPMultivariate(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - TiDE(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - DeepNPTS(h=horizon, input_size=input_size, 
max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - + BiTCN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + DilatedRNN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + MLP(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + DLinear(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + NLinear(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + TFT(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + FEDformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + StemGNN(h=horizon, input_size=input_size, n_series=1, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + MLPMultivariate(h=horizon, input_size=input_size, n_series=1, max_steps=train_steps, + val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + TiDE(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + DeepNPTS(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + # VanillaTransformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', ), //报错了 # Autoformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', ), //报错了 # NBEATS(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', ), # NBEATSx (h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard',activation='ReLU', ), //报错 # HINT(h=horizon), - + ] if is_fivemodels: # 获取之前存好的最好的五个模型 - with open(os.path.join(dataset,'best_modelnames.txt'), 'r',encoding='utf-8') as f: + with open(os.path.join(dataset, 'best_modelnames.txt'), 'r', encoding='utf-8') as f: best_modelnames = f.readlines()[0] logger.info(f'获取本地最佳模型名称:{best_modelnames}') - - # 重新拼接models + + # 重新拼接models all_models = models models = [] for model in all_models: @@ -171,45 +190,51 @@ def ex_Model(df,horizon,input_size,train_steps,val_check_steps,early_stop_patien from joblib import dump, load if is_train: # 模型交叉验证 - nf_preds = nf.cross_validation(df=df_train, val_size=val_size, test_size=test_size, n_windows=None) - nf_preds.to_csv(os.path.join(dataset,"cross_validation.csv"),index=False) - + nf_preds = nf.cross_validation( + df=df_train, val_size=val_size, test_size=test_size, n_windows=None) + 
nf_preds.to_csv(os.path.join( + dataset, "cross_validation.csv"), index=False) + nf_preds = nf_preds.reset_index() # 保存模型 # 生成文件名,按时间 精确到分 filename = f'{settings}--{now}.joblib' - #文件名去掉冒号 - filename = filename.replace(':', '-') # 替换冒号 - dump(nf, os.path.join(dataset,filename)) + # 文件名去掉冒号 + filename = filename.replace(':', '-') # 替换冒号 + dump(nf, os.path.join(dataset, filename)) else: # glob获取dataset下最新的joblib文件 import glob - filename = max(glob.glob(os.path.join(dataset,'*.joblib')), key=os.path.getctime) - logger.info('读取模型:'+ filename) + filename = max(glob.glob(os.path.join( + dataset, '*.joblib')), key=os.path.getctime) + logger.info('读取模型:' + filename) nf = load(filename) # 测试集预测 - nf_test_preds = nf.cross_validation(df=df_test, val_size=val_size, test_size=test_size, n_windows=None) + nf_test_preds = nf.cross_validation( + df=df_test, val_size=val_size, test_size=test_size, n_windows=None) # 测试集预测结果保存 - nf_test_preds.to_csv(os.path.join(dataset,"cross_validation.csv"),index=False) + nf_test_preds.to_csv(os.path.join( + dataset, "cross_validation.csv"), index=False) df_test['ds'] = pd.to_datetime(df_test['ds'], errors='coerce') - #进行未来时间预测 - df_predict=nf.predict(df_test).reset_index() + # 进行未来时间预测 + df_predict = nf.predict(df_test).reset_index() # 去掉index列 if 'index' in df_predict.columns: df_predict.drop(columns=['index'], inplace=True) - df_predict.astype({col: 'float32' for col in df_predict.columns if col not in ['ds'] }) - + df_predict.astype( + {col: 'float32' for col in df_predict.columns if col not in ['ds']}) + # 添加预测时间 df_predict['created_dt'] = end_time - + # 保存预测值 - df_predict.to_csv(os.path.join(dataset,"predict.csv"),index=False) + df_predict.to_csv(os.path.join(dataset, "predict.csv"), index=False) # 将预测结果保存到数据库 - save_to_database(sqlitedb,df_predict,'predict',end_time) - + save_to_database(sqlitedb, df_predict, 'predict', end_time) + # 把预测值上传到eta if is_update_eta: df_predict['ds'] = pd.to_datetime(df_predict['ds']) @@ -217,22 +242,22 @@ def ex_Model(df,horizon,input_size,train_steps,val_check_steps,early_stop_patien for m in modelsindex.keys(): list = [] - for date,value in zip(dates,df_predict[m].round(2)): - list.append({'Date':date,'Value':value}) + for date, value in zip(dates, df_predict[m].round(2)): + list.append({'Date': date, 'Value': value}) data['DataList'] = list data['IndexCode'] = modelsindex[m] data['IndexName'] = f'价格预测{m}模型' data['Remark'] = m - etadata.push_data(data) - + etadata.push_data(data) # return nf_test_preds - return + return + @exception_logger -def ex_Model_Juxiting(df,horizon,input_size,train_steps,val_check_steps,early_stop_patience_steps, - is_debug,dataset,is_train,is_fivemodels,val_size,test_size,settings,now, - etadata,modelsindex,data,is_eta,end_time): +def ex_Model_Juxiting(df, horizon, input_size, train_steps, val_check_steps, early_stop_patience_steps, + is_debug, dataset, is_train, is_fivemodels, val_size, test_size, settings, now, + etadata, modelsindex, data, is_eta, end_time): ''' 模型训练与预测 :param df: 数据集 @@ -275,10 +300,11 @@ def ex_Model_Juxiting(df,horizon,input_size,train_steps,val_check_steps,early_st # 'DeepNPT', # ] - df= df.replace(',', '', regex=True) + df = df.replace(',', '', regex=True) df = df.rename(columns={'date': 'ds'}) df['y'] = pd.to_numeric(df['y'], errors='coerce') - df['ds'] = pd.to_datetime(df['ds'], errors='coerce') # 使用errors='coerce'来处理无效日期 + # 使用errors='coerce'来处理无效日期 + df['ds'] = pd.to_datetime(df['ds'], errors='coerce') # df 数值列转为 float32 for col in df.select_dtypes(include=['int']).columns: 
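# --- Illustrative sketch (editor's note, not repository code) -------------------
# The ex_Model flow above builds the model list, cross-validates on the training
# split, persists the fitted NeuralForecast object with joblib, and then predicts
# `horizon` steps ahead. A minimal two-model version of that flow; the synthetic
# series, freq='B', the small step counts and file name are assumptions for the
# example, not values from the repository:
import numpy as np
import pandas as pd
from joblib import dump, load
from neuralforecast import NeuralForecast
from neuralforecast.models import NHITS, DLinear

df = pd.DataFrame({'unique_id': 'Brent',
                   'ds': pd.date_range('2024-01-01', periods=200, freq='B'),
                   'y': np.linspace(70, 90, 200) + np.random.randn(200)})

horizon, input_size = 5, 40
models = [NHITS(h=horizon, input_size=input_size, max_steps=100),
          DLinear(h=horizon, input_size=input_size, max_steps=100)]
nf = NeuralForecast(models=models, freq='B')

cv = nf.cross_validation(df=df, val_size=20, test_size=20, n_windows=None)  # backtest frame
dump(nf, 'nf_ensemble.joblib')     # persist the fitted ensemble, as the code above does
nf = load('nf_ensemble.joblib')
future = nf.predict(df=df)         # one forecast column per model
print(future.head())
# --------------------------------------------------------------------------------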
df[col] = df[col].astype(np.float32) @@ -287,8 +313,6 @@ def ex_Model_Juxiting(df,horizon,input_size,train_steps,val_check_steps,early_st plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 - - # 不筛选特征用下面的 df_reg = df df_reg.sort_values('ds', inplace=True) @@ -296,7 +320,7 @@ def ex_Model_Juxiting(df,horizon,input_size,train_steps,val_check_steps,early_st df_reg = df_reg[-1000:-1] # 计算训练集的结束索引,占总数据的90% - split_index = int(0.9* len(df_reg)) + split_index = int(0.9 * len(df_reg)) # 按照时间顺序划分训练集和测试集 df_train = df_reg[:split_index] @@ -311,46 +335,66 @@ def ex_Model_Juxiting(df,horizon,input_size,train_steps,val_check_steps,early_st logger.info("\nTesting set head:") logger.info(df_test.head()) - models = [ - NHITS (h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', activation='ReLU', early_stop_patience_steps=early_stop_patience_steps), - Informer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps ), - LSTM(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - iTransformer(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - TSMixer(h=horizon, input_size=input_size, n_series = 1, max_steps=train_steps, early_stop_patience_steps=early_stop_patience_steps), - TSMixerx(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, early_stop_patience_steps=early_stop_patience_steps), - PatchTST(h=horizon, input_size=input_size, max_steps=train_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - RNN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - GRU(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - TCN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + NHITS(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', activation='ReLU', early_stop_patience_steps=early_stop_patience_steps), + Informer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + LSTM(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + iTransformer(h=horizon, input_size=input_size, n_series=1, max_steps=train_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + TSMixer(h=horizon, input_size=input_size, n_series=1, max_steps=train_steps, + early_stop_patience_steps=early_stop_patience_steps), + TSMixerx(h=horizon, input_size=input_size, n_series=1, max_steps=train_steps, + early_stop_patience_steps=early_stop_patience_steps), + PatchTST(h=horizon, input_size=input_size, max_steps=train_steps, + scaler_type='standard', 
early_stop_patience_steps=early_stop_patience_steps), + RNN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + GRU(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + TCN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), # DeepAR(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - BiTCN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - DilatedRNN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - MLP(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - DLinear(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - NLinear(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - TFT(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - FEDformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - StemGNN(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - MLPMultivariate(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - TiDE(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - DeepNPTS(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), - + BiTCN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + DilatedRNN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + MLP(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + DLinear(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + NLinear(h=horizon, input_size=input_size, max_steps=train_steps, 
val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + TFT(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + FEDformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + StemGNN(h=horizon, input_size=input_size, n_series=1, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + MLPMultivariate(h=horizon, input_size=input_size, n_series=1, max_steps=train_steps, + val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + TiDE(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + DeepNPTS(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, + scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + # VanillaTransformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', ), //报错了 # Autoformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', ), //报错了 # NBEATS(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', ), # NBEATSx (h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard',activation='ReLU', ), //报错 # HINT(h=horizon), - + ] if is_fivemodels: # 获取之前存好的最好的五个模型 - with open(os.path.join(dataset,'best_modelnames.txt'), 'r',encoding='utf-8') as f: + with open(os.path.join(dataset, 'best_modelnames.txt'), 'r', encoding='utf-8') as f: best_modelnames = f.readlines()[0] logger.info(f'获取本地最佳模型名称:{best_modelnames}') - - # 重新拼接models + + # 重新拼接models all_models = models models = [] for model in all_models: @@ -363,48 +407,54 @@ def ex_Model_Juxiting(df,horizon,input_size,train_steps,val_check_steps,early_st from joblib import dump, load if is_train: # 模型交叉验证 - nf_preds = nf.cross_validation(df=df_train, val_size=val_size, test_size=test_size, n_windows=None) - nf_preds.to_csv(os.path.join(dataset,"cross_validation.csv"),index=False) - + nf_preds = nf.cross_validation( + df=df_train, val_size=val_size, test_size=test_size, n_windows=None) + nf_preds.to_csv(os.path.join( + dataset, "cross_validation.csv"), index=False) + nf_preds = nf_preds.reset_index() # 保存模型 # 生成文件名,按时间 精确到分 filename = f'{settings}--{now}.joblib' - #文件名去掉冒号 - filename = filename.replace(':', '-') # 替换冒号 - dump(nf, os.path.join(dataset,filename)) + # 文件名去掉冒号 + filename = filename.replace(':', '-') # 替换冒号 + dump(nf, os.path.join(dataset, filename)) else: # glob获取dataset下最新的joblib文件 import glob - filename = max(glob.glob(os.path.join(dataset,'*.joblib')), key=os.path.getctime) - logger.info('读取模型:'+ filename) + filename = max(glob.glob(os.path.join( + dataset, '*.joblib')), key=os.path.getctime) + logger.info('读取模型:' + filename) nf = load(filename) # 测试集预测 - nf_test_preds = nf.cross_validation(df=df_test, val_size=val_size, test_size=test_size, n_windows=None) + nf_test_preds = nf.cross_validation( + df=df_test, val_size=val_size, test_size=test_size, n_windows=None) # 测试集预测结果保存 - 
nf_test_preds.to_csv(os.path.join(dataset,"cross_validation.csv"),index=False) + nf_test_preds.to_csv(os.path.join( + dataset, "cross_validation.csv"), index=False) df_test['ds'] = pd.to_datetime(df_test['ds'], errors='coerce') - #进行未来时间预测 - df_predict=nf.predict(df_test).reset_index() + # 进行未来时间预测 + df_predict = nf.predict(df_test).reset_index() # 去掉index列 if 'index' in df_predict.columns: df_predict.drop(columns=['index'], inplace=True) # 处理非有限值(NA 或 inf),将其替换为 0 df_predict = df_predict.fillna(0) df_predict = df_predict.replace([np.inf, -np.inf], 0) - df_predict.astype({col: 'int' for col in df_predict.columns if col not in ['ds'] }) - + df_predict.astype( + {col: 'int' for col in df_predict.columns if col not in ['ds']}) + # 添加预测时间 df_predict['created_dt'] = end_time - + # 保存预测值 - df_predict.to_csv(os.path.join(dataset,"predict.csv"),index=False) + df_predict.to_csv(os.path.join(dataset, "predict.csv"), index=False) # 将预测结果保存到数据库 - save_to_database(sqlitedb,df_predict,'predict',end_time) - + save_to_database(sqlitedb, df_predict, 'predict', end_time) + # 把预测值上传到eta if is_update_eta: df_predict['ds'] = pd.to_datetime(df_predict['ds']) @@ -412,59 +462,63 @@ def ex_Model_Juxiting(df,horizon,input_size,train_steps,val_check_steps,early_st for m in modelsindex.keys(): list = [] - for date,value in zip(dates,df_predict[m]): - list.append({'Date':date,'Value':value}) + for date, value in zip(dates, df_predict[m]): + list.append({'Date': date, 'Value': value}) data['DataList'] = [list[-1]] data['IndexCode'] = modelsindex[m] data['IndexName'] = f'聚烯烃价格预测{m}模型' data['Remark'] = m - etadata.push_data(data=data) - + etadata.push_data(data=data) # return nf_test_preds - return + return # 雍安环境预测评估指数 @exception_logger -def model_losss_yongan(sqlitedb,end_time,table_name_prefix): +def model_losss_yongan(sqlitedb, end_time, table_name_prefix): global dataset global rote - most_model = [sqlitedb.select_data('most_model',columns=['most_common_model'],order_by='ds desc',limit=1).values[0][0]] + most_model = [sqlitedb.select_data('most_model', columns=[ + 'most_common_model'], order_by='ds desc', limit=1).values[0][0]] most_model_name = most_model[0] # 预测数据处理 predict - # df_combined = loadcsv(os.path.join(dataset,"cross_validation.csv")) + # df_combined = loadcsv(os.path.join(dataset,"cross_validation.csv")) # df_combined = dateConvert(df_combined) - df_combined = sqlitedb.select_data('accuracy',where_condition=f"created_dt <= '{end_time}'") + df_combined = sqlitedb.select_data( + 'accuracy', where_condition=f"created_dt <= '{end_time}'") df_combined4 = df_combined.copy() # 备份df_combined,后面画图需要 # 删除缺失值大于80%的列 logger.info(df_combined.shape) df_combined = df_combined.loc[:, df_combined.isnull().mean() < 0.8] logger.info(df_combined.shape) - # 删除缺失值 - df_combined.dropna(inplace=True) + # 删除缺失值 + df_combined.dropna(inplace=True) logger.info(df_combined.shape) # 其他列转为数值类型 - df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in ['CREAT_DATE','ds','created_dt'] }) + df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in [ + 'CREAT_DATE', 'ds', 'created_dt']}) # 使用 groupby 和 transform 结合 lambda 函数来获取每个分组中 cutoff 的最小值,并创建一个新的列来存储这个最大值 - df_combined['max_cutoff'] = df_combined.groupby('ds')['CREAT_DATE'].transform('max') + df_combined['max_cutoff'] = df_combined.groupby( + 'ds')['CREAT_DATE'].transform('max') # 然后筛选出那些 cutoff 等于 max_cutoff 的行,这样就得到了每个分组中 cutoff 最大的行,并保留了其他列 - df_combined = df_combined[df_combined['CREAT_DATE'] == 
df_combined['max_cutoff']] + df_combined = df_combined[df_combined['CREAT_DATE'] + == df_combined['max_cutoff']] # 删除模型生成的cutoff列 - df_combined.drop(columns=['CREAT_DATE', 'max_cutoff','created_dt','min_within_quantile','max_within_quantile','id','min_price','max_price','LOW_PRICE','HIGH_PRICE','mean']) + df_combined.drop(columns=['CREAT_DATE', 'max_cutoff', 'created_dt', 'min_within_quantile', + 'max_within_quantile', 'id', 'min_price', 'max_price', 'LOW_PRICE', 'HIGH_PRICE', 'mean']) # 获取模型名称 - modelnames = df_combined.columns.to_list()[1:] + modelnames = df_combined.columns.to_list()[1:] if 'y' in modelnames: modelnames.remove('y') df_combined3 = df_combined.copy() # 备份df_combined,后面画图需要 - # 空的列表存储每个模型的MSE、RMSE、MAE、MAPE、SMAPE cellText = [] - # 遍历模型名称,计算模型评估指标 + # 遍历模型名称,计算模型评估指标 for model in modelnames: modelmse = mse(df_combined['y'], df_combined[model]) modelrmse = rmse(df_combined['y'], df_combined[model]) @@ -472,12 +526,16 @@ def model_losss_yongan(sqlitedb,end_time,table_name_prefix): # modelmape = mape(df_combined['y'], df_combined[model]) # modelsmape = smape(df_combined['y'], df_combined[model]) # modelr2 = r2_score(df_combined['y'], df_combined[model]) - cellText.append([model,round(modelmse, 3), round(modelrmse, 3), round(modelmae, 3)]) - - model_results3 = pd.DataFrame(cellText,columns=['模型(Model)','平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)']) + cellText.append([model, round(modelmse, 3), round( + modelrmse, 3), round(modelmae, 3)]) + + model_results3 = pd.DataFrame( + cellText, columns=['模型(Model)', '平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)']) # 按MSE降序排列 - model_results3 = model_results3.sort_values(by='平均平方误差(MSE)', ascending=True) - model_results3.to_csv(os.path.join(dataset,"model_evaluation.csv"),index=False) + model_results3 = model_results3.sort_values( + by='平均平方误差(MSE)', ascending=True) + model_results3.to_csv(os.path.join( + dataset, "model_evaluation.csv"), index=False) modelnames = model_results3['模型(Model)'].tolist() allmodelnames = modelnames.copy() # 保存5个最佳模型的名称 @@ -486,13 +544,13 @@ def model_losss_yongan(sqlitedb,end_time,table_name_prefix): if is_fivemodels: pass else: - with open(os.path.join(dataset,"best_modelnames.txt"), 'w') as f: + with open(os.path.join(dataset, "best_modelnames.txt"), 'w') as f: f.write(','.join(modelnames) + '\n') # 预测值与真实值对比图 plt.rcParams['font.sans-serif'] = ['SimHei'] plt.figure(figsize=(15, 10)) - for n,model in enumerate(modelnames[:5]): + for n, model in enumerate(modelnames[:5]): plt.subplot(3, 2, n+1) plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值') plt.plot(df_combined3['ds'], df_combined3[model], label=model) @@ -501,53 +559,60 @@ def model_losss_yongan(sqlitedb,end_time,table_name_prefix): plt.ylabel('价格') plt.title(model+'拟合') plt.subplots_adjust(hspace=0.5) - plt.savefig(os.path.join(dataset,'预测值与真实值对比图.png'), bbox_inches='tight') + plt.savefig(os.path.join(dataset, '预测值与真实值对比图.png'), bbox_inches='tight') plt.close() - - + # # 历史数据+预测数据 # # 拼接未来时间预测 - df_predict = pd.read_csv(os.path.join(dataset,'predict.csv')) - df_predict.drop('unique_id',inplace=True,axis=1) - df_predict.dropna(axis=1,inplace=True) + df_predict = pd.read_csv(os.path.join(dataset, 'predict.csv')) + df_predict.drop('unique_id', inplace=True, axis=1) + df_predict.dropna(axis=1, inplace=True) try: - df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y-%m-%d') - except ValueError : - df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y/%m/%d') - + df_predict['ds'] = pd.to_datetime(df_predict['ds'], format=r'%Y-%m-%d') + 
except ValueError: + df_predict['ds'] = pd.to_datetime(df_predict['ds'], format=r'%Y/%m/%d') + def first_row_to_database(df): # # 取第一行数据存储到数据库中 first_row = df.head(1) first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00') # 将预测结果保存到数据库 if not sqlitedb.check_table_exists('trueandpredict'): - first_row.to_sql('trueandpredict',sqlitedb.connection,index=False) + first_row.to_sql('trueandpredict', + sqlitedb.connection, index=False) else: for col in first_row.columns: - sqlitedb.add_column_if_not_exists('trueandpredict',col,'TEXT') + sqlitedb.add_column_if_not_exists( + 'trueandpredict', col, 'TEXT') for row in first_row.itertuples(index=False): row_dict = row._asdict() - columns=row_dict.keys() - check_query = sqlitedb.select_data('trueandpredict',where_condition = f"ds = '{row.ds}'") + columns = row_dict.keys() + check_query = sqlitedb.select_data( + 'trueandpredict', where_condition=f"ds = '{row.ds}'") if len(check_query) > 0: - set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()]) - sqlitedb.update_data('trueandpredict',set_clause,where_condition = f"ds = '{row.ds}'") + set_clause = ", ".join( + [f"{key} = '{value}'" for key, value in row_dict.items()]) + sqlitedb.update_data( + 'trueandpredict', set_clause, where_condition=f"ds = '{row.ds}'") continue - sqlitedb.insert_data('trueandpredict',tuple(row_dict.values()),columns=columns) + sqlitedb.insert_data('trueandpredict', tuple( + row_dict.values()), columns=columns) first_row_to_database(df_predict) - + df_combined3 = pd.concat([df_combined3, df_predict]).reset_index(drop=True) # 计算每个模型与最佳模型的绝对误差比例,根据设置的阈值rote筛选预测值显示最大最小值 names = [] names_df = df_combined3.copy() for col in allmodelnames: - names_df[f'{col}-{most_model_name}-误差比例'] = abs(names_df[col] - names_df[most_model_name]) / names_df[most_model_name] + names_df[f'{col}-{most_model_name}-误差比例'] = abs( + names_df[col] - names_df[most_model_name]) / names_df[most_model_name] names.append(f'{col}-{most_model_name}-误差比例') names_df = names_df[names] + def add_rote_column(row): columns = [] for r in names_df.columns: @@ -555,21 +620,22 @@ def model_losss_yongan(sqlitedb,end_time,table_name_prefix): columns.append(r.split('-')[0]) return pd.Series([columns], index=['columns']) names_df['columns'] = names_df.apply(add_rote_column, axis=1) - + def add_upper_lower_bound(row): print(row['columns']) print(type(row['columns'])) # 计算上边界值 - upper_bound = df_combined3.loc[row.name,row['columns']].max() + upper_bound = df_combined3.loc[row.name, row['columns']].max() # 计算下边界值 - lower_bound = df_combined3.loc[row.name,row['columns']].min() + lower_bound = df_combined3.loc[row.name, row['columns']].min() return pd.Series([lower_bound, upper_bound], index=['min_within_quantile', 'max_within_quantile']) - df_combined3[['min_within_quantile','max_within_quantile']] = names_df.apply(add_upper_lower_bound, axis=1) - + df_combined3[['min_within_quantile', 'max_within_quantile'] + ] = names_df.apply(add_upper_lower_bound, axis=1) + def find_closest_values(row): x = row.y if x is None or np.isnan(x): - return pd.Series([None, None], index=['min_price','max_price']) + return pd.Series([None, None], index=['min_price', 'max_price']) # row = row.drop('ds') row = row.values.tolist() row.sort() @@ -577,26 +643,28 @@ def model_losss_yongan(sqlitedb,end_time,table_name_prefix): # x 在row中的索引 index = row.index(x) if index == 0: - return pd.Series([row[index+1], row[index+2]], index=['min_price','max_price']) + return pd.Series([row[index+1], row[index+2]], index=['min_price', 
'max_price']) elif index == len(row)-1: - return pd.Series([row[index-2], row[index-1]], index=['min_price','max_price']) + return pd.Series([row[index-2], row[index-1]], index=['min_price', 'max_price']) else: - return pd.Series([row[index-1], row[index+1]], index=['min_price','max_price']) + return pd.Series([row[index-1], row[index+1]], index=['min_price', 'max_price']) def find_most_common_model(): # 最多频率的模型名称 - min_model_max_frequency_model = df_combined3['min_model'].tail(60).value_counts().idxmax() - max_model_max_frequency_model = df_combined3['max_model'].tail(60).value_counts().idxmax() + min_model_max_frequency_model = df_combined3['min_model'].tail( + 60).value_counts().idxmax() + max_model_max_frequency_model = df_combined3['max_model'].tail( + 60).value_counts().idxmax() if min_model_max_frequency_model == max_model_max_frequency_model: # 取60天第二多的模型 - max_model_max_frequency_model = df_combined3['max_model'].tail(60).value_counts().nlargest(2).index[1] + max_model_max_frequency_model = df_combined3['max_model'].tail( + 60).value_counts().nlargest(2).index[1] df_predict['min_model'] = min_model_max_frequency_model df_predict['max_model'] = max_model_max_frequency_model df_predict['min_within_quantile'] = df_predict[min_model_max_frequency_model] df_predict['max_within_quantile'] = df_predict[max_model_max_frequency_model] - # find_most_common_model() df_combined3['ds'] = pd.to_datetime(df_combined3['ds']) @@ -605,47 +673,49 @@ def model_losss_yongan(sqlitedb,end_time,table_name_prefix): # 保存到数据库 if not sqlitedb.check_table_exists(f'{table_name_prefix}accuracy'): - columns = ','.join(df_combined3.columns.to_list()+['id','CREAT_DATE','min_price','max_price','LOW_PRICE','HIGH_PRICE','mean']) - sqlitedb.create_table('accuracy',columns=columns) - existing_data = sqlitedb.select_data(table_name = "accuracy") + columns = ','.join(df_combined3.columns.to_list( + )+['id', 'CREAT_DATE', 'min_price', 'max_price', 'LOW_PRICE', 'HIGH_PRICE', 'mean']) + sqlitedb.create_table('accuracy', columns=columns) + existing_data = sqlitedb.select_data(table_name="accuracy") - if not existing_data.empty: + if not existing_data.empty: max_id = existing_data['id'].astype(int).max() df_predict2['id'] = range(max_id + 1, max_id + 1 + len(df_predict2)) else: df_predict2['id'] = range(1, 1 + len(df_predict2)) df_predict2['CREAT_DATE'] = end_time - save_to_database(sqlitedb,df_predict2,"accuracy",end_time) + save_to_database(sqlitedb, df_predict2, "accuracy", end_time) # 上周准确率计算 - accuracy_df = sqlitedb.select_data(table_name = "accuracy") + accuracy_df = sqlitedb.select_data(table_name="accuracy") predict_y = accuracy_df.copy() # ids = predict_y[predict_y['min_price'].isnull()]['id'].tolist() ids = predict_y['id'].tolist() # 准确率基准与绘图上下界逻辑一致 # predict_y[['min_price','max_price']] = predict_y[['min_within_quantile','max_within_quantile']] - # 模型评估前五均值 + # 模型评估前五均值 # predict_y['min_price'] = predict_y[modelnames].mean(axis=1) -1 # predict_y['max_price'] = predict_y[modelnames].mean(axis=1) +1 - # 模型评估前十均值 - predict_y['min_price'] = predict_y[allmodelnames[0:10]].mean(axis=1) -1.5 + # 模型评估前十均值 + predict_y['min_price'] = predict_y[allmodelnames[0:10]].mean(axis=1) - 1.5 predict_y['mean'] = predict_y[allmodelnames[0:10]].mean(axis=1) - predict_y['max_price'] = predict_y[allmodelnames[0:10]].mean(axis=1) +1.5 + predict_y['max_price'] = predict_y[allmodelnames[0:10]].mean(axis=1) + 1.5 # 模型评估前十最大最小 # allmodelnames 和 predict_y 列 重复的 # allmodelnames = [col for col in allmodelnames if col in predict_y.columns] - # 
predict_y['min_price'] = predict_y[allmodelnames[0:10]].min(axis=1) + # predict_y['min_price'] = predict_y[allmodelnames[0:10]].min(axis=1) # predict_y['max_price'] = predict_y[allmodelnames[0:10]].max(axis=1) for id in ids: row = predict_y[predict_y['id'] == id] try: - sqlitedb.update_data('accuracy',f"min_price = {row['min_price'].values[0]},max_price = {row['max_price'].values[0]},mean={row['mean'].values[0]}",f"id = {id}") + sqlitedb.update_data( + 'accuracy', f"min_price = {row['min_price'].values[0]},max_price = {row['max_price'].values[0]},mean={row['mean'].values[0]}", f"id = {id}") except: logger.error(f'更新accuracy表中的min_price,max_price,mean值失败,row={row}') - + df = accuracy_df.copy() - df['ds'] = pd.to_datetime(df['ds']) + df['ds'] = pd.to_datetime(df['ds']) df = df.reindex() # 判断预测值在不在布伦特最高最低价范围内,准确率为1,否则为0 @@ -661,53 +731,62 @@ def model_losss_yongan(sqlitedb,end_time,table_name_prefix): # 比较真实最高最低,和预测最高最低 计算准确率 # 全子集情况: if (row['max_price'] >= row['HIGH_PRICE'] and row['min_price'] <= row['LOW_PRICE']) or \ - (row['max_price'] <= row['HIGH_PRICE'] and row['min_price'] >= row['LOW_PRICE']): - return 1 + (row['max_price'] <= row['HIGH_PRICE'] and row['min_price'] >= row['LOW_PRICE']): + return 1 # 无交集情况: if row['max_price'] < row['LOW_PRICE'] or \ - row['min_price'] > row['HIGH_PRICE']: + row['min_price'] > row['HIGH_PRICE']: return 0 # 有交集情况: else: - sorted_prices = sorted([row['LOW_PRICE'], row['min_price'], row['max_price'], row['HIGH_PRICE']]) + sorted_prices = sorted( + [row['LOW_PRICE'], row['min_price'], row['max_price'], row['HIGH_PRICE']]) middle_diff = sorted_prices[2] - sorted_prices[1] price_range = row['HIGH_PRICE'] - row['LOW_PRICE'] accuracy = middle_diff / price_range return accuracy - columns = ['HIGH_PRICE','LOW_PRICE','min_price','max_price'] + columns = ['HIGH_PRICE', 'LOW_PRICE', 'min_price', 'max_price'] df[columns] = df[columns].astype(float) df['ACCURACY'] = df.apply(calculate_accuracy, axis=1) # df['ACCURACY'] = df.apply(is_within_range, axis=1) # 计算准确率并保存结果 - def _get_accuracy_rate(df,create_dates,ds_dates,endtime): + def _get_accuracy_rate(df, create_dates, ds_dates, endtime): df3 = df.copy() df3 = df3[df3['CREAT_DATE'].isin(create_dates)] df3 = df3[df3['ds'].isin(ds_dates)] accuracy_rote = 0 - for i,group in df3.groupby('CREAT_DATE'): - accuracy_rote += (group['ACCURACY'].sum()/len(group))*weight_dict[len(group)-1] - accuracy_rote = round(accuracy_rote,2) - df4 = pd.DataFrame(columns=['开始日期','结束日期','准确率']) - df4.loc[len(df4)] = {'开始日期':ds_dates[0],'结束日期':ds_dates[-1],'准确率':accuracy_rote} - df4.to_sql("accuracy_rote", con=sqlitedb.connection, if_exists='append', index=False) - create_dates,ds_dates = get_week_date(end_time) - _get_accuracy_rate(df,create_dates,ds_dates,end_time) - + for i, group in df3.groupby('CREAT_DATE'): + accuracy_rote += (group['ACCURACY'].sum() / + len(group))*weight_dict[len(group)-1] + accuracy_rote = round(accuracy_rote, 2) + df4 = pd.DataFrame(columns=['开始日期', '结束日期', '准确率']) + df4.loc[len(df4)] = {'开始日期': ds_dates[0], + '结束日期': ds_dates[-1], '准确率': accuracy_rote} + df4.to_sql("accuracy_rote", con=sqlitedb.connection, + if_exists='append', index=False) + create_dates, ds_dates = get_week_date(end_time) + _get_accuracy_rate(df, create_dates, ds_dates, end_time) + def _add_abs_error_rate(): # 计算每个预测值与真实值之间的偏差率 for model in allmodelnames: - df_combined3[f'{model}_abs_error_rate'] = abs(df_combined3['y'] - df_combined3[model]) / df_combined3['y'] + df_combined3[f'{model}_abs_error_rate'] = abs( + df_combined3['y'] - 
df_combined3[model]) / df_combined3['y'] # 获取每行对应的最小偏差率值 - min_abs_error_rate_values = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1) + min_abs_error_rate_values = df_combined3.apply( + lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1) # 获取每行对应的最小偏差率值对应的列名 - min_abs_error_rate_column_name = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1) + min_abs_error_rate_column_name = df_combined3.apply( + lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1) # 将列名索引转换为列名 - min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0]) + min_abs_error_rate_column_name = min_abs_error_rate_column_name.map( + lambda x: x.split('_')[0]) # 获取最小偏差率对应的模型的预测值 - min_abs_error_rate_predictions = df_combined3.apply(lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1) + min_abs_error_rate_predictions = df_combined3.apply( + lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1) # 将最小偏差率对应的模型的预测值添加到DataFrame中 df_combined3['min_abs_error_rate_prediction'] = min_abs_error_rate_predictions df_combined3['min_abs_error_rate_column_name'] = min_abs_error_rate_column_name @@ -721,25 +800,28 @@ def model_losss_yongan(sqlitedb,end_time,table_name_prefix): df_combined3[col] = df_combined3[col].round(2) except ValueError: pass - df_combined3.to_csv(os.path.join(dataset,"testandpredict_groupby.csv"),index=False) - - - # 历史价格+预测价格 + df_combined3.to_csv(os.path.join( + dataset, "testandpredict_groupby.csv"), index=False) + + # 历史价格+预测价格 sqlitedb.drop_table('testandpredict_groupby') - df_combined3.to_sql('testandpredict_groupby',sqlitedb.connection,index=False) + df_combined3.to_sql('testandpredict_groupby', + sqlitedb.connection, index=False) def _plt_predict_ture(df): lens = df.shape[0] if df.shape[0] < 180 else 90 - df = df[-lens:] # 取180个数据点画图 + df = df[-lens:] # 取180个数据点画图 # 历史价格 plt.figure(figsize=(20, 10)) plt.plot(df['ds'], df['y'], label='真实值') # 颜色填充 - plt.fill_between(df['ds'], df['max_within_quantile'], df['min_within_quantile'], alpha=0.2) + plt.fill_between(df['ds'], df['max_within_quantile'], + df['min_within_quantile'], alpha=0.2) markers = ['o', 's', '^', 'D', 'v', '*', 'p', 'h', 'H', '+', 'x', 'd'] random_marker = random.choice(markers) for model in modelnames: - plt.plot(df['ds'][-horizon:], df[model][-horizon:], label=model,marker=random_marker) + plt.plot(df['ds'][-horizon:], df[model][-horizon:], + label=model, marker=random_marker) # plt.plot(df_combined3['ds'], df_combined3['min_abs_error_rate_prediction'], label='最小绝对误差', linestyle='--', color='orange') # 网格 plt.grid(True) @@ -752,21 +834,21 @@ def model_losss_yongan(sqlitedb,end_time,table_name_prefix): plt.legend() plt.xlabel('日期') plt.ylabel('价格') - - plt.savefig(os.path.join(dataset,'历史价格-预测值.png'), bbox_inches='tight') + + plt.savefig(os.path.join(dataset, '历史价格-预测值.png'), bbox_inches='tight') plt.close() - def _plt_modeltopten_predict_ture(df): df['max_cutoff'] = df.groupby('ds')['CREAT_DATE'].transform('max') df = df[df['CREAT_DATE'] == df['max_cutoff']] df['mean'] = df['mean'].astype(float) lens = df.shape[0] if df.shape[0] < 180 else 180 - df = df[-lens:] # 取180个数据点画图 + df = df[-lens:] # 取180个数据点画图 # 历史价格 plt.figure(figsize=(20, 10)) plt.plot(df['ds'], df['y'], label='真实值') - plt.plot(df['ds'], df['mean'], label='模型前十均值', linestyle='--', color='orange') + plt.plot(df['ds'], df['mean'], 
label='模型前十均值', + linestyle='--', color='orange') # 颜色填充 plt.fill_between(df['ds'], df['max_price'], df['min_price'], alpha=0.2) # markers = ['o', 's', '^', 'D', 'v', '*', 'p', 'h', 'H', '+', 'x', 'd'] @@ -786,42 +868,44 @@ def model_losss_yongan(sqlitedb,end_time,table_name_prefix): plt.legend() plt.xlabel('日期') plt.ylabel('价格') - - plt.savefig(os.path.join(dataset,'历史价格-预测值1.png'), bbox_inches='tight') + + plt.savefig(os.path.join(dataset, '历史价格-预测值1.png'), + bbox_inches='tight') plt.close() - - def _plt_predict_table(df): + def _plt_predict_table(df): # 预测值表格 fig, ax = plt.subplots(figsize=(20, 6)) ax.axis('off') # 关闭坐标轴 # 数值保留2位小数 df = df.round(2) df = df[-horizon:] - df['Day'] = [f'Day_{i}' for i in range(1,horizon+1)] + df['Day'] = [f'Day_{i}' for i in range(1, horizon+1)] # Day列放到最前面 df = df[['Day'] + list(df.columns[:-1])] - table = ax.table(cellText=df.values, colLabels=df.columns, loc='center') - #加宽表格 - table.auto_set_font_size(False) - table.set_fontsize(10) - - # 设置表格样式,列数据最小的用绿色标识 - plt.savefig(os.path.join(dataset,'预测值表格.png'), bbox_inches='tight') - plt.close() - - def _plt_model_results3(): - # 可视化评估结果 - plt.rcParams['font.sans-serif'] = ['SimHei'] - fig, ax = plt.subplots(figsize=(20, 10)) - ax.axis('off') # 关闭坐标轴 - table = ax.table(cellText=model_results3.values, colLabels=model_results3.columns, loc='center') + table = ax.table(cellText=df.values, + colLabels=df.columns, loc='center') # 加宽表格 table.auto_set_font_size(False) table.set_fontsize(10) # 设置表格样式,列数据最小的用绿色标识 - plt.savefig(os.path.join(dataset,'模型评估.png'), bbox_inches='tight') + plt.savefig(os.path.join(dataset, '预测值表格.png'), bbox_inches='tight') + plt.close() + + def _plt_model_results3(): + # 可视化评估结果 + plt.rcParams['font.sans-serif'] = ['SimHei'] + fig, ax = plt.subplots(figsize=(20, 10)) + ax.axis('off') # 关闭坐标轴 + table = ax.table(cellText=model_results3.values, + colLabels=model_results3.columns, loc='center') + # 加宽表格 + table.auto_set_font_size(False) + table.set_fontsize(10) + + # 设置表格样式,列数据最小的用绿色标识 + plt.savefig(os.path.join(dataset, '模型评估.png'), bbox_inches='tight') plt.close() _plt_predict_ture(df_combined3) @@ -830,28 +914,30 @@ def model_losss_yongan(sqlitedb,end_time,table_name_prefix): _plt_model_results3() return model_results3 - + # 原油计算预测评估指数 @exception_logger -def model_losss(sqlitedb,end_time): +def model_losss(sqlitedb, end_time): global dataset global rote # 从数据库取最佳模型,如果没有表,先自定义空,后面根据模型评估取第一个 try: - most_model = [sqlitedb.select_data('most_model',columns=['most_common_model'],order_by='ds desc',limit=1).values[0][0]] + most_model = [sqlitedb.select_data('most_model', columns=[ + 'most_common_model'], order_by='ds desc', limit=1).values[0][0]] most_model_name = most_model[0] except: most_model_name = '' # 预测数据处理 predict - + try: - df_combined = sqlitedb.select_data('accuracy',where_condition=f"created_dt <= '{end_time}'") + df_combined = sqlitedb.select_data( + 'accuracy', where_condition=f"created_dt <= '{end_time}'") if len(df_combined) < 10: len(df_combined) + '' except: - df_combined = loadcsv(os.path.join(dataset,"cross_validation.csv")) + df_combined = loadcsv(os.path.join(dataset, "cross_validation.csv")) df_combined = dateConvert(df_combined) df_combined['CREAT_DATE'] = df_combined['cutoff'] df_combined4 = df_combined.copy() # 备份df_combined,后面画图需要 @@ -859,31 +945,34 @@ def model_losss(sqlitedb,end_time): logger.info(df_combined.shape) df_combined = df_combined.loc[:, df_combined.isnull().mean() < 0.8] logger.info(df_combined.shape) - # 删除缺失值 - df_combined.dropna(inplace=True) + # 删除缺失值 
+ df_combined.dropna(inplace=True) logger.info(df_combined.shape) # 其他列转为数值类型 - df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in ['CREAT_DATE','ds','created_dt','cutoff'] }) + df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in [ + 'CREAT_DATE', 'ds', 'created_dt', 'cutoff']}) # 使用 groupby 和 transform 结合 lambda 函数来获取每个分组中 cutoff 的最小值,并创建一个新的列来存储这个最大值 - df_combined['max_cutoff'] = df_combined.groupby('ds')['CREAT_DATE'].transform('max') + df_combined['max_cutoff'] = df_combined.groupby( + 'ds')['CREAT_DATE'].transform('max') # 然后筛选出那些 cutoff 等于 max_cutoff 的行,这样就得到了每个分组中 cutoff 最大的行,并保留了其他列 - df_combined = df_combined[df_combined['CREAT_DATE'] == df_combined['max_cutoff']] + df_combined = df_combined[df_combined['CREAT_DATE'] + == df_combined['max_cutoff']] # 删除模型生成的cutoff列 - df_combined.drop(columns=['CREAT_DATE', 'max_cutoff','created_dt','min_within_quantile','max_within_quantile','id','min_price','max_price','LOW_PRICE','HIGH_PRICE','mean','cutoff'], inplace=True,errors='ignore') + df_combined.drop(columns=['CREAT_DATE', 'max_cutoff', 'created_dt', 'min_within_quantile', 'max_within_quantile', + 'id', 'min_price', 'max_price', 'LOW_PRICE', 'HIGH_PRICE', 'mean', 'cutoff'], inplace=True, errors='ignore') # 获取模型名称 - modelnames = df_combined.columns.to_list()[1:] + modelnames = df_combined.columns.to_list()[1:] if 'y' in modelnames: modelnames.remove('y') if 'cutoff' in modelnames: modelnames.remove('cutoff') df_combined3 = df_combined.copy() # 备份df_combined,后面画图需要 - # 空的列表存储每个模型的MSE、RMSE、MAE、MAPE、SMAPE cellText = [] - # 遍历模型名称,计算模型评估指标 + # 遍历模型名称,计算模型评估指标 for model in modelnames: modelmse = mse(df_combined['y'], df_combined[model]) modelrmse = rmse(df_combined['y'], df_combined[model]) @@ -891,12 +980,16 @@ def model_losss(sqlitedb,end_time): # modelmape = mape(df_combined['y'], df_combined[model]) # modelsmape = smape(df_combined['y'], df_combined[model]) # modelr2 = r2_score(df_combined['y'], df_combined[model]) - cellText.append([model,round(modelmse, 3), round(modelrmse, 3), round(modelmae, 3)]) - - model_results3 = pd.DataFrame(cellText,columns=['模型(Model)','平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)']) + cellText.append([model, round(modelmse, 3), round( + modelrmse, 3), round(modelmae, 3)]) + + model_results3 = pd.DataFrame( + cellText, columns=['模型(Model)', '平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)']) # 按MSE降序排列 - model_results3 = model_results3.sort_values(by='平均平方误差(MSE)', ascending=True) - model_results3.to_csv(os.path.join(dataset,"model_evaluation.csv"),index=False) + model_results3 = model_results3.sort_values( + by='平均平方误差(MSE)', ascending=True) + model_results3.to_csv(os.path.join( + dataset, "model_evaluation.csv"), index=False) modelnames = model_results3['模型(Model)'].tolist() most_model_name = modelnames[0] allmodelnames = modelnames.copy() @@ -906,13 +999,13 @@ def model_losss(sqlitedb,end_time): if is_fivemodels: pass else: - with open(os.path.join(dataset,"best_modelnames.txt"), 'w') as f: + with open(os.path.join(dataset, "best_modelnames.txt"), 'w') as f: f.write(','.join(modelnames) + '\n') # 预测值与真实值对比图 plt.rcParams['font.sans-serif'] = ['SimHei'] plt.figure(figsize=(15, 10)) - for n,model in enumerate(modelnames[:5]): + for n, model in enumerate(modelnames[:5]): plt.subplot(3, 2, n+1) plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值') plt.plot(df_combined3['ds'], df_combined3[model], label=model) @@ -921,53 +1014,60 @@ def model_losss(sqlitedb,end_time): 
plt.ylabel('价格') plt.title(model+'拟合') plt.subplots_adjust(hspace=0.5) - plt.savefig(os.path.join(dataset,'预测值与真实值对比图.png'), bbox_inches='tight') + plt.savefig(os.path.join(dataset, '预测值与真实值对比图.png'), bbox_inches='tight') plt.close() - - + # # 历史数据+预测数据 # # 拼接未来时间预测 - df_predict = pd.read_csv(os.path.join(dataset,'predict.csv')) - df_predict.drop('unique_id',inplace=True,axis=1) - df_predict.dropna(axis=1,inplace=True) + df_predict = pd.read_csv(os.path.join(dataset, 'predict.csv')) + df_predict.drop('unique_id', inplace=True, axis=1) + df_predict.dropna(axis=1, inplace=True) try: - df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y-%m-%d') - except ValueError : - df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y/%m/%d') - + df_predict['ds'] = pd.to_datetime(df_predict['ds'], format=r'%Y-%m-%d') + except ValueError: + df_predict['ds'] = pd.to_datetime(df_predict['ds'], format=r'%Y/%m/%d') + def first_row_to_database(df): # # 取第一行数据存储到数据库中 first_row = df.head(1) first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00') # 将预测结果保存到数据库 if not sqlitedb.check_table_exists('trueandpredict'): - first_row.to_sql('trueandpredict',sqlitedb.connection,index=False) + first_row.to_sql('trueandpredict', + sqlitedb.connection, index=False) else: for col in first_row.columns: - sqlitedb.add_column_if_not_exists('trueandpredict',col,'TEXT') + sqlitedb.add_column_if_not_exists( + 'trueandpredict', col, 'TEXT') for row in first_row.itertuples(index=False): row_dict = row._asdict() - columns=row_dict.keys() - check_query = sqlitedb.select_data('trueandpredict',where_condition = f"ds = '{row.ds}'") + columns = row_dict.keys() + check_query = sqlitedb.select_data( + 'trueandpredict', where_condition=f"ds = '{row.ds}'") if len(check_query) > 0: - set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()]) - sqlitedb.update_data('trueandpredict',set_clause,where_condition = f"ds = '{row.ds}'") + set_clause = ", ".join( + [f"{key} = '{value}'" for key, value in row_dict.items()]) + sqlitedb.update_data( + 'trueandpredict', set_clause, where_condition=f"ds = '{row.ds}'") continue - sqlitedb.insert_data('trueandpredict',tuple(row_dict.values()),columns=columns) + sqlitedb.insert_data('trueandpredict', tuple( + row_dict.values()), columns=columns) first_row_to_database(df_predict) - + df_combined3 = pd.concat([df_combined3, df_predict]).reset_index(drop=True) # 计算每个模型与最佳模型的绝对误差比例,根据设置的阈值rote筛选预测值显示最大最小值 names = [] names_df = df_combined3.copy() for col in allmodelnames: - names_df[f'{col}-{most_model_name}-误差比例'] = abs(names_df[col] - names_df[most_model_name]) / names_df[most_model_name] + names_df[f'{col}-{most_model_name}-误差比例'] = abs( + names_df[col] - names_df[most_model_name]) / names_df[most_model_name] names.append(f'{col}-{most_model_name}-误差比例') names_df = names_df[names] + def add_rote_column(row): columns = [] for r in names_df.columns: @@ -975,21 +1075,22 @@ def model_losss(sqlitedb,end_time): columns.append(r.split('-')[0]) return pd.Series([columns], index=['columns']) names_df['columns'] = names_df.apply(add_rote_column, axis=1) - + def add_upper_lower_bound(row): print(row['columns']) print(type(row['columns'])) # 计算上边界值 - upper_bound = df_combined3.loc[row.name,row['columns']].max() + upper_bound = df_combined3.loc[row.name, row['columns']].max() # 计算下边界值 - lower_bound = df_combined3.loc[row.name,row['columns']].min() + lower_bound = df_combined3.loc[row.name, row['columns']].min() return pd.Series([lower_bound, upper_bound], 
index=['min_within_quantile', 'max_within_quantile']) - df_combined3[['min_within_quantile','max_within_quantile']] = names_df.apply(add_upper_lower_bound, axis=1) - + df_combined3[['min_within_quantile', 'max_within_quantile'] + ] = names_df.apply(add_upper_lower_bound, axis=1) + def find_closest_values(row): x = row.y if x is None or np.isnan(x): - return pd.Series([None, None], index=['min_price','max_price']) + return pd.Series([None, None], index=['min_price', 'max_price']) # row = row.drop('ds') row = row.values.tolist() row.sort() @@ -997,26 +1098,28 @@ def model_losss(sqlitedb,end_time): # x 在row中的索引 index = row.index(x) if index == 0: - return pd.Series([row[index+1], row[index+2]], index=['min_price','max_price']) + return pd.Series([row[index+1], row[index+2]], index=['min_price', 'max_price']) elif index == len(row)-1: - return pd.Series([row[index-2], row[index-1]], index=['min_price','max_price']) + return pd.Series([row[index-2], row[index-1]], index=['min_price', 'max_price']) else: - return pd.Series([row[index-1], row[index+1]], index=['min_price','max_price']) + return pd.Series([row[index-1], row[index+1]], index=['min_price', 'max_price']) def find_most_common_model(): # 最多频率的模型名称 - min_model_max_frequency_model = df_combined3['min_model'].tail(60).value_counts().idxmax() - max_model_max_frequency_model = df_combined3['max_model'].tail(60).value_counts().idxmax() + min_model_max_frequency_model = df_combined3['min_model'].tail( + 60).value_counts().idxmax() + max_model_max_frequency_model = df_combined3['max_model'].tail( + 60).value_counts().idxmax() if min_model_max_frequency_model == max_model_max_frequency_model: # 取60天第二多的模型 - max_model_max_frequency_model = df_combined3['max_model'].tail(60).value_counts().nlargest(2).index[1] + max_model_max_frequency_model = df_combined3['max_model'].tail( + 60).value_counts().nlargest(2).index[1] df_predict['min_model'] = min_model_max_frequency_model df_predict['max_model'] = max_model_max_frequency_model df_predict['min_within_quantile'] = df_predict[min_model_max_frequency_model] df_predict['max_within_quantile'] = df_predict[max_model_max_frequency_model] - # find_most_common_model() df_combined3['ds'] = pd.to_datetime(df_combined3['ds']) @@ -1025,47 +1128,49 @@ def model_losss(sqlitedb,end_time): # 保存到数据库 if not sqlitedb.check_table_exists('accuracy'): - columns = ','.join(df_combined3.columns.to_list()+['id','CREAT_DATE','min_price','max_price','LOW_PRICE','HIGH_PRICE','mean']) - sqlitedb.create_table('accuracy',columns=columns) - existing_data = sqlitedb.select_data(table_name = "accuracy") + columns = ','.join(df_combined3.columns.to_list( + )+['id', 'CREAT_DATE', 'min_price', 'max_price', 'LOW_PRICE', 'HIGH_PRICE', 'mean']) + sqlitedb.create_table('accuracy', columns=columns) + existing_data = sqlitedb.select_data(table_name="accuracy") - if not existing_data.empty: + if not existing_data.empty: max_id = existing_data['id'].astype(int).max() df_predict2['id'] = range(max_id + 1, max_id + 1 + len(df_predict2)) else: df_predict2['id'] = range(1, 1 + len(df_predict2)) df_predict2['CREAT_DATE'] = end_time - save_to_database(sqlitedb,df_predict2,"accuracy",end_time) + save_to_database(sqlitedb, df_predict2, "accuracy", end_time) # 上周准确率计算 - accuracy_df = sqlitedb.select_data(table_name = "accuracy") + accuracy_df = sqlitedb.select_data(table_name="accuracy") predict_y = accuracy_df.copy() # ids = predict_y[predict_y['min_price'].isnull()]['id'].tolist() ids = predict_y['id'].tolist() # 准确率基准与绘图上下界逻辑一致 # 
predict_y[['min_price','max_price']] = predict_y[['min_within_quantile','max_within_quantile']] - # 模型评估前五均值 + # 模型评估前五均值 # predict_y['min_price'] = predict_y[modelnames].mean(axis=1) -1 # predict_y['max_price'] = predict_y[modelnames].mean(axis=1) +1 - # 模型评估前十均值 - predict_y['min_price'] = predict_y[allmodelnames[0:10]].mean(axis=1) -1.5 + # 模型评估前十均值 + predict_y['min_price'] = predict_y[allmodelnames[0:10]].mean(axis=1) - 1.5 predict_y['mean'] = predict_y[allmodelnames[0:10]].mean(axis=1) - predict_y['max_price'] = predict_y[allmodelnames[0:10]].mean(axis=1) +1.5 + predict_y['max_price'] = predict_y[allmodelnames[0:10]].mean(axis=1) + 1.5 # 模型评估前十最大最小 # allmodelnames 和 predict_y 列 重复的 # allmodelnames = [col for col in allmodelnames if col in predict_y.columns] - # predict_y['min_price'] = predict_y[allmodelnames[0:10]].min(axis=1) + # predict_y['min_price'] = predict_y[allmodelnames[0:10]].min(axis=1) # predict_y['max_price'] = predict_y[allmodelnames[0:10]].max(axis=1) for id in ids: row = predict_y[predict_y['id'] == id] try: - sqlitedb.update_data('accuracy',f"min_price = {row['min_price'].values[0]},max_price = {row['max_price'].values[0]},mean={row['mean'].values[0]}",f"id = {id}") + sqlitedb.update_data( + 'accuracy', f"min_price = {row['min_price'].values[0]},max_price = {row['max_price'].values[0]},mean={row['mean'].values[0]}", f"id = {id}") except: logger.error(f'更新accuracy表中的min_price,max_price,mean值失败,row={row}') - + df = accuracy_df.copy() - df['ds'] = pd.to_datetime(df['ds']) + df['ds'] = pd.to_datetime(df['ds']) df = df.reindex() # 判断预测值在不在布伦特最高最低价范围内,准确率为1,否则为0 @@ -1081,53 +1186,62 @@ def model_losss(sqlitedb,end_time): # 比较真实最高最低,和预测最高最低 计算准确率 # 全子集情况: if (row['max_price'] >= row['HIGH_PRICE'] and row['min_price'] <= row['LOW_PRICE']) or \ - (row['max_price'] <= row['HIGH_PRICE'] and row['min_price'] >= row['LOW_PRICE']): - return 1 + (row['max_price'] <= row['HIGH_PRICE'] and row['min_price'] >= row['LOW_PRICE']): + return 1 # 无交集情况: if row['max_price'] < row['LOW_PRICE'] or \ - row['min_price'] > row['HIGH_PRICE']: + row['min_price'] > row['HIGH_PRICE']: return 0 # 有交集情况: else: - sorted_prices = sorted([row['LOW_PRICE'], row['min_price'], row['max_price'], row['HIGH_PRICE']]) + sorted_prices = sorted( + [row['LOW_PRICE'], row['min_price'], row['max_price'], row['HIGH_PRICE']]) middle_diff = sorted_prices[2] - sorted_prices[1] price_range = row['HIGH_PRICE'] - row['LOW_PRICE'] accuracy = middle_diff / price_range return accuracy - columns = ['HIGH_PRICE','LOW_PRICE','min_price','max_price'] + columns = ['HIGH_PRICE', 'LOW_PRICE', 'min_price', 'max_price'] df[columns] = df[columns].astype(float) df['ACCURACY'] = df.apply(calculate_accuracy, axis=1) # df['ACCURACY'] = df.apply(is_within_range, axis=1) # 计算准确率并保存结果 - def _get_accuracy_rate(df,create_dates,ds_dates): + def _get_accuracy_rate(df, create_dates, ds_dates): df3 = df.copy() df3 = df3[df3['CREAT_DATE'].isin(create_dates)] df3 = df3[df3['ds'].isin(ds_dates)] accuracy_rote = 0 - for i,group in df3.groupby('CREAT_DATE'): - accuracy_rote += (group['ACCURACY'].sum()/len(group))*weight_dict[len(group)-1] - accuracy_rote = round(accuracy_rote,2) - df4 = pd.DataFrame(columns=['开始日期','结束日期','准确率']) - df4.loc[len(df4)] = {'开始日期':ds_dates[0],'结束日期':ds_dates[-1],'准确率':accuracy_rote} - df4.to_sql("accuracy_rote", con=sqlitedb.connection, if_exists='append', index=False) - create_dates,ds_dates = get_week_date(end_time) - _get_accuracy_rate(df,create_dates,ds_dates) - + for i, group in df3.groupby('CREAT_DATE'): + accuracy_rote 
+= (group['ACCURACY'].sum() / + len(group))*weight_dict[len(group)-1] + accuracy_rote = round(accuracy_rote, 2) + df4 = pd.DataFrame(columns=['开始日期', '结束日期', '准确率']) + df4.loc[len(df4)] = {'开始日期': ds_dates[0], + '结束日期': ds_dates[-1], '准确率': accuracy_rote} + df4.to_sql("accuracy_rote", con=sqlitedb.connection, + if_exists='append', index=False) + create_dates, ds_dates = get_week_date(end_time) + _get_accuracy_rate(df, create_dates, ds_dates) + def _add_abs_error_rate(): # 计算每个预测值与真实值之间的偏差率 for model in allmodelnames: - df_combined3[f'{model}_abs_error_rate'] = abs(df_combined3['y'] - df_combined3[model]) / df_combined3['y'] + df_combined3[f'{model}_abs_error_rate'] = abs( + df_combined3['y'] - df_combined3[model]) / df_combined3['y'] # 获取每行对应的最小偏差率值 - min_abs_error_rate_values = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1) + min_abs_error_rate_values = df_combined3.apply( + lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1) # 获取每行对应的最小偏差率值对应的列名 - min_abs_error_rate_column_name = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1) + min_abs_error_rate_column_name = df_combined3.apply( + lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1) # 将列名索引转换为列名 - min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0]) + min_abs_error_rate_column_name = min_abs_error_rate_column_name.map( + lambda x: x.split('_')[0]) # 获取最小偏差率对应的模型的预测值 - min_abs_error_rate_predictions = df_combined3.apply(lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1) + min_abs_error_rate_predictions = df_combined3.apply( + lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1) # 将最小偏差率对应的模型的预测值添加到DataFrame中 df_combined3['min_abs_error_rate_prediction'] = min_abs_error_rate_predictions df_combined3['min_abs_error_rate_column_name'] = min_abs_error_rate_column_name @@ -1141,24 +1255,26 @@ def model_losss(sqlitedb,end_time): df_combined3[col] = df_combined3[col].round(2) except ValueError: pass - df_combined3.to_csv(os.path.join(dataset,"testandpredict_groupby.csv"),index=False) - - - # 历史价格+预测价格 + df_combined3.to_csv(os.path.join( + dataset, "testandpredict_groupby.csv"), index=False) + + # 历史价格+预测价格 sqlitedb.drop_table('testandpredict_groupby') - df_combined3.to_sql('testandpredict_groupby',sqlitedb.connection,index=False) + df_combined3.to_sql('testandpredict_groupby', + sqlitedb.connection, index=False) def _plt_predict_ture(df): lens = df.shape[0] if df.shape[0] < 180 else 90 - df = df[-lens:] # 取180个数据点画图 + df = df[-lens:] # 取180个数据点画图 # 历史价格 plt.figure(figsize=(20, 10)) # 时间格式更改 df['ds'] = pd.to_datetime(df['ds']) - + plt.plot(df['ds'], df['y'], label='真实值') # 颜色填充 - plt.fill_between(df['ds'], df['max_within_quantile'], df['min_within_quantile'], alpha=0.2) + plt.fill_between(df['ds'], df['max_within_quantile'], + df['min_within_quantile'], alpha=0.2) # markers = ['o', 's', '^', 'D', 'v', '*', 'p', 'h', 'H', '+', 'x', 'd'] # random_marker = random.choice(markers) # for model in allmodelnames: @@ -1173,7 +1289,7 @@ def model_losss(sqlitedb,end_time): # for model in most_model: # plt.plot(df['ds'], df[model], label=model,marker='o') - plt.plot(df['ds'], df[most_model_name], label=model,marker='o') + plt.plot(df['ds'], df[most_model_name], label=model, marker='o') # 当前日期画竖虚线 plt.axvline(x=df['ds'].iloc[-horizon], color='r', linestyle='--') plt.legend() @@ -1184,22 +1300,22 @@ 
def model_losss(sqlitedb,end_time): plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator()) plt.xticks(rotation=45) # 日期标签旋转45度,防止重叠 plt.ylabel('价格') - - plt.savefig(os.path.join(dataset,'历史价格-预测值.png'), bbox_inches='tight') + + plt.savefig(os.path.join(dataset, '历史价格-预测值.png'), bbox_inches='tight') plt.close() - def _plt_modeltopten_predict_ture(df): df['ds'] = pd.to_datetime(df['ds']) df['max_cutoff'] = df.groupby('ds')['CREAT_DATE'].transform('max') df = df[df['CREAT_DATE'] == df['max_cutoff']] df['mean'] = df['mean'].astype(float) lens = df.shape[0] if df.shape[0] < 180 else 180 - df = df[-lens:] # 取180个数据点画图 + df = df[-lens:] # 取180个数据点画图 # 历史价格 plt.figure(figsize=(20, 10)) plt.plot(df['ds'], df['y'], label='真实值') - plt.plot(df['ds'], df['mean'], label='模型前十均值', linestyle='--', color='orange') + plt.plot(df['ds'], df['mean'], label='模型前十均值', + linestyle='--', color='orange') # 颜色填充 plt.fill_between(df['ds'], df['max_price'], df['min_price'], alpha=0.2) # markers = ['o', 's', '^', 'D', 'v', '*', 'p', 'h', 'H', '+', 'x', 'd'] @@ -1221,44 +1337,46 @@ def model_losss(sqlitedb,end_time): # 自动设置横轴日期显示 plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator()) plt.xticks(rotation=45) # 日期标签旋转45度,防止重叠 - + plt.ylabel('价格') - - plt.savefig(os.path.join(dataset,'历史价格-预测值1.png'), bbox_inches='tight') + + plt.savefig(os.path.join(dataset, '历史价格-预测值1.png'), + bbox_inches='tight') plt.close() - - def _plt_predict_table(df): + def _plt_predict_table(df): # 预测值表格 fig, ax = plt.subplots(figsize=(20, 6)) ax.axis('off') # 关闭坐标轴 # 数值保留2位小数 df = df.round(2) df = df[-horizon:] - df['Day'] = [f'Day_{i}' for i in range(1,horizon+1)] + df['Day'] = [f'Day_{i}' for i in range(1, horizon+1)] # Day列放到最前面 df = df[['Day'] + list(df.columns[:-1])] - table = ax.table(cellText=df.values, colLabels=df.columns, loc='center') - #加宽表格 - table.auto_set_font_size(False) - table.set_fontsize(10) - - # 设置表格样式,列数据最小的用绿色标识 - plt.savefig(os.path.join(dataset,'预测值表格.png'), bbox_inches='tight') - plt.close() - - def _plt_model_results3(): - # 可视化评估结果 - plt.rcParams['font.sans-serif'] = ['SimHei'] - fig, ax = plt.subplots(figsize=(20, 10)) - ax.axis('off') # 关闭坐标轴 - table = ax.table(cellText=model_results3.values, colLabels=model_results3.columns, loc='center') + table = ax.table(cellText=df.values, + colLabels=df.columns, loc='center') # 加宽表格 table.auto_set_font_size(False) table.set_fontsize(10) # 设置表格样式,列数据最小的用绿色标识 - plt.savefig(os.path.join(dataset,'模型评估.png'), bbox_inches='tight') + plt.savefig(os.path.join(dataset, '预测值表格.png'), bbox_inches='tight') + plt.close() + + def _plt_model_results3(): + # 可视化评估结果 + plt.rcParams['font.sans-serif'] = ['SimHei'] + fig, ax = plt.subplots(figsize=(20, 10)) + ax.axis('off') # 关闭坐标轴 + table = ax.table(cellText=model_results3.values, + colLabels=model_results3.columns, loc='center') + # 加宽表格 + table.auto_set_font_size(False) + table.set_fontsize(10) + + # 设置表格样式,列数据最小的用绿色标识 + plt.savefig(os.path.join(dataset, '模型评估.png'), bbox_inches='tight') plt.close() # _plt_predict_ture(df_combined3) @@ -1271,15 +1389,16 @@ def model_losss(sqlitedb,end_time): # 聚烯烃计算预测评估指数 @exception_logger -def model_losss_juxitingbak(sqlitedb,end_time): +def model_losss_juxitingbak(sqlitedb, end_time): global dataset global rote - most_model = [sqlitedb.select_data('most_model',columns=['most_common_model'],order_by='ds desc',limit=1).values[0][0]] + most_model = [sqlitedb.select_data('most_model', columns=[ + 'most_common_model'], order_by='ds desc', limit=1).values[0][0]] most_model_name = most_model[0] # 预测数据处理 
predict - df_combined = loadcsv(os.path.join(dataset,"cross_validation.csv")) - df_combined.drop(columns=['cutoff'],inplace=True) + df_combined = loadcsv(os.path.join(dataset, "cross_validation.csv")) + df_combined.drop(columns=['cutoff'], inplace=True) df_combined['CREAT_DATE'] = end_time df_combined = dateConvert(df_combined) # df_combined = sqlitedb.select_data('accuracy',where_condition=f"created_dt <= '{end_time}'") @@ -1288,29 +1407,32 @@ def model_losss_juxitingbak(sqlitedb,end_time): logger.info(df_combined.shape) df_combined = df_combined.loc[:, df_combined.isnull().mean() < 0.8] logger.info(df_combined.shape) - # 删除缺失值 - df_combined.dropna(inplace=True) + # 删除缺失值 + df_combined.dropna(inplace=True) logger.info(df_combined.shape) # 其他列转为数值类型 - df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in ['CREAT_DATE','ds','created_dt'] }) + df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in [ + 'CREAT_DATE', 'ds', 'created_dt']}) # 使用 groupby 和 transform 结合 lambda 函数来获取每个分组中 cutoff 的最小值,并创建一个新的列来存储这个最大值 - df_combined['max_cutoff'] = df_combined.groupby('ds')['CREAT_DATE'].transform('max') + df_combined['max_cutoff'] = df_combined.groupby( + 'ds')['CREAT_DATE'].transform('max') # 然后筛选出那些 cutoff 等于 max_cutoff 的行,这样就得到了每个分组中 cutoff 最大的行,并保留了其他列 - df_combined = df_combined[df_combined['CREAT_DATE'] == df_combined['max_cutoff']] + df_combined = df_combined[df_combined['CREAT_DATE'] + == df_combined['max_cutoff']] # 删除模型生成的cutoff列 - df_combined.drop(columns=['CREAT_DATE', 'max_cutoff','created_dt','min_within_quantile','max_within_quantile','id','min_price','max_price','LOW_PRICE','HIGH_PRICE','mean'], inplace=True) + df_combined.drop(columns=['CREAT_DATE', 'max_cutoff', 'created_dt', 'min_within_quantile', + 'max_within_quantile', 'id', 'min_price', 'max_price', 'LOW_PRICE', 'HIGH_PRICE', 'mean'], inplace=True) # 获取模型名称 - modelnames = df_combined.columns.to_list()[1:] + modelnames = df_combined.columns.to_list()[1:] if 'y' in modelnames: modelnames.remove('y') df_combined3 = df_combined.copy() # 备份df_combined,后面画图需要 - # 空的列表存储每个模型的MSE、RMSE、MAE、MAPE、SMAPE cellText = [] - # 遍历模型名称,计算模型评估指标 + # 遍历模型名称,计算模型评估指标 for model in modelnames: modelmse = mse(df_combined['y'], df_combined[model]) modelrmse = rmse(df_combined['y'], df_combined[model]) @@ -1318,12 +1440,16 @@ def model_losss_juxitingbak(sqlitedb,end_time): # modelmape = mape(df_combined['y'], df_combined[model]) # modelsmape = smape(df_combined['y'], df_combined[model]) # modelr2 = r2_score(df_combined['y'], df_combined[model]) - cellText.append([model,round(modelmse, 3), round(modelrmse, 3), round(modelmae, 3)]) - - model_results3 = pd.DataFrame(cellText,columns=['模型(Model)','平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)']) + cellText.append([model, round(modelmse, 3), round( + modelrmse, 3), round(modelmae, 3)]) + + model_results3 = pd.DataFrame( + cellText, columns=['模型(Model)', '平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)']) # 按MSE降序排列 - model_results3 = model_results3.sort_values(by='平均平方误差(MSE)', ascending=True) - model_results3.to_csv(os.path.join(dataset,"model_evaluation.csv"),index=False) + model_results3 = model_results3.sort_values( + by='平均平方误差(MSE)', ascending=True) + model_results3.to_csv(os.path.join( + dataset, "model_evaluation.csv"), index=False) modelnames = model_results3['模型(Model)'].tolist() allmodelnames = modelnames.copy() # 保存5个最佳模型的名称 @@ -1332,13 +1458,13 @@ def model_losss_juxitingbak(sqlitedb,end_time): if is_fivemodels: pass else: - 
with open(os.path.join(dataset,"best_modelnames.txt"), 'w') as f: + with open(os.path.join(dataset, "best_modelnames.txt"), 'w') as f: f.write(','.join(modelnames) + '\n') # 预测值与真实值对比图 plt.rcParams['font.sans-serif'] = ['SimHei'] plt.figure(figsize=(15, 10)) - for n,model in enumerate(modelnames[:5]): + for n, model in enumerate(modelnames[:5]): plt.subplot(3, 2, n+1) plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值') plt.plot(df_combined3['ds'], df_combined3[model], label=model) @@ -1347,53 +1473,60 @@ def model_losss_juxitingbak(sqlitedb,end_time): plt.ylabel('价格') plt.title(model+'拟合') plt.subplots_adjust(hspace=0.5) - plt.savefig(os.path.join(dataset,'预测值与真实值对比图.png'), bbox_inches='tight') + plt.savefig(os.path.join(dataset, '预测值与真实值对比图.png'), bbox_inches='tight') plt.close() - - + # # 历史数据+预测数据 # # 拼接未来时间预测 - df_predict = pd.read_csv(os.path.join(dataset,'predict.csv')) - df_predict.drop('unique_id',inplace=True,axis=1) - df_predict.dropna(axis=1,inplace=True) + df_predict = pd.read_csv(os.path.join(dataset, 'predict.csv')) + df_predict.drop('unique_id', inplace=True, axis=1) + df_predict.dropna(axis=1, inplace=True) try: - df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y-%m-%d') - except ValueError : - df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y/%m/%d') - + df_predict['ds'] = pd.to_datetime(df_predict['ds'], format=r'%Y-%m-%d') + except ValueError: + df_predict['ds'] = pd.to_datetime(df_predict['ds'], format=r'%Y/%m/%d') + def first_row_to_database(df): # # 取第一行数据存储到数据库中 first_row = df.head(1) first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00') # 将预测结果保存到数据库 if not sqlitedb.check_table_exists('trueandpredict'): - first_row.to_sql('trueandpredict',sqlitedb.connection,index=False) + first_row.to_sql('trueandpredict', + sqlitedb.connection, index=False) else: for col in first_row.columns: - sqlitedb.add_column_if_not_exists('trueandpredict',col,'TEXT') + sqlitedb.add_column_if_not_exists( + 'trueandpredict', col, 'TEXT') for row in first_row.itertuples(index=False): row_dict = row._asdict() - columns=row_dict.keys() - check_query = sqlitedb.select_data('trueandpredict',where_condition = f"ds = '{row.ds}'") + columns = row_dict.keys() + check_query = sqlitedb.select_data( + 'trueandpredict', where_condition=f"ds = '{row.ds}'") if len(check_query) > 0: - set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()]) - sqlitedb.update_data('trueandpredict',set_clause,where_condition = f"ds = '{row.ds}'") + set_clause = ", ".join( + [f"{key} = '{value}'" for key, value in row_dict.items()]) + sqlitedb.update_data( + 'trueandpredict', set_clause, where_condition=f"ds = '{row.ds}'") continue - sqlitedb.insert_data('trueandpredict',tuple(row_dict.values()),columns=columns) + sqlitedb.insert_data('trueandpredict', tuple( + row_dict.values()), columns=columns) first_row_to_database(df_predict) - + df_combined3 = pd.concat([df_combined3, df_predict]).reset_index(drop=True) # 计算每个模型与最佳模型的绝对误差比例,根据设置的阈值rote筛选预测值显示最大最小值 names = [] names_df = df_combined3.copy() for col in allmodelnames: - names_df[f'{col}-{most_model_name}-误差比例'] = abs(names_df[col] - names_df[most_model_name]) / names_df[most_model_name] + names_df[f'{col}-{most_model_name}-误差比例'] = abs( + names_df[col] - names_df[most_model_name]) / names_df[most_model_name] names.append(f'{col}-{most_model_name}-误差比例') names_df = names_df[names] + def add_rote_column(row): columns = [] for r in names_df.columns: @@ -1401,21 +1534,22 @@ def 
model_losss_juxitingbak(sqlitedb,end_time): columns.append(r.split('-')[0]) return pd.Series([columns], index=['columns']) names_df['columns'] = names_df.apply(add_rote_column, axis=1) - + def add_upper_lower_bound(row): print(row['columns']) print(type(row['columns'])) # 计算上边界值 - upper_bound = df_combined3.loc[row.name,row['columns']].max() + upper_bound = df_combined3.loc[row.name, row['columns']].max() # 计算下边界值 - lower_bound = df_combined3.loc[row.name,row['columns']].min() + lower_bound = df_combined3.loc[row.name, row['columns']].min() return pd.Series([lower_bound, upper_bound], index=['min_within_quantile', 'max_within_quantile']) - df_combined3[['min_within_quantile','max_within_quantile']] = names_df.apply(add_upper_lower_bound, axis=1) - + df_combined3[['min_within_quantile', 'max_within_quantile'] + ] = names_df.apply(add_upper_lower_bound, axis=1) + def find_closest_values(row): x = row.y if x is None or np.isnan(x): - return pd.Series([None, None], index=['min_price','max_price']) + return pd.Series([None, None], index=['min_price', 'max_price']) # row = row.drop('ds') row = row.values.tolist() row.sort() @@ -1423,26 +1557,28 @@ def model_losss_juxitingbak(sqlitedb,end_time): # x 在row中的索引 index = row.index(x) if index == 0: - return pd.Series([row[index+1], row[index+2]], index=['min_price','max_price']) + return pd.Series([row[index+1], row[index+2]], index=['min_price', 'max_price']) elif index == len(row)-1: - return pd.Series([row[index-2], row[index-1]], index=['min_price','max_price']) + return pd.Series([row[index-2], row[index-1]], index=['min_price', 'max_price']) else: - return pd.Series([row[index-1], row[index+1]], index=['min_price','max_price']) + return pd.Series([row[index-1], row[index+1]], index=['min_price', 'max_price']) def find_most_common_model(): # 最多频率的模型名称 - min_model_max_frequency_model = df_combined3['min_model'].tail(60).value_counts().idxmax() - max_model_max_frequency_model = df_combined3['max_model'].tail(60).value_counts().idxmax() + min_model_max_frequency_model = df_combined3['min_model'].tail( + 60).value_counts().idxmax() + max_model_max_frequency_model = df_combined3['max_model'].tail( + 60).value_counts().idxmax() if min_model_max_frequency_model == max_model_max_frequency_model: # 取60天第二多的模型 - max_model_max_frequency_model = df_combined3['max_model'].tail(60).value_counts().nlargest(2).index[1] + max_model_max_frequency_model = df_combined3['max_model'].tail( + 60).value_counts().nlargest(2).index[1] df_predict['min_model'] = min_model_max_frequency_model df_predict['max_model'] = max_model_max_frequency_model df_predict['min_within_quantile'] = df_predict[min_model_max_frequency_model] df_predict['max_within_quantile'] = df_predict[max_model_max_frequency_model] - # find_most_common_model() df_combined3['ds'] = pd.to_datetime(df_combined3['ds']) @@ -1451,47 +1587,49 @@ def model_losss_juxitingbak(sqlitedb,end_time): # 保存到数据库 if not sqlitedb.check_table_exists('accuracy'): - columns = ','.join(df_combined3.columns.to_list()+['id','CREAT_DATE','min_price','max_price','LOW_PRICE','HIGH_PRICE','mean']) - sqlitedb.create_table('accuracy',columns=columns) - existing_data = sqlitedb.select_data(table_name = "accuracy") + columns = ','.join(df_combined3.columns.to_list( + )+['id', 'CREAT_DATE', 'min_price', 'max_price', 'LOW_PRICE', 'HIGH_PRICE', 'mean']) + sqlitedb.create_table('accuracy', columns=columns) + existing_data = sqlitedb.select_data(table_name="accuracy") - if not existing_data.empty: + if not existing_data.empty: max_id = 
existing_data['id'].astype(int).max() df_predict2['id'] = range(max_id + 1, max_id + 1 + len(df_predict2)) else: df_predict2['id'] = range(1, 1 + len(df_predict2)) df_predict2['CREAT_DATE'] = end_time - save_to_database(sqlitedb,df_predict2,"accuracy",end_time) + save_to_database(sqlitedb, df_predict2, "accuracy", end_time) # 上周准确率计算 - accuracy_df = sqlitedb.select_data(table_name = "accuracy") + accuracy_df = sqlitedb.select_data(table_name="accuracy") predict_y = accuracy_df.copy() # ids = predict_y[predict_y['min_price'].isnull()]['id'].tolist() ids = predict_y['id'].tolist() # 准确率基准与绘图上下界逻辑一致 # predict_y[['min_price','max_price']] = predict_y[['min_within_quantile','max_within_quantile']] - # 模型评估前五均值 + # 模型评估前五均值 # predict_y['min_price'] = predict_y[modelnames].mean(axis=1) -1 # predict_y['max_price'] = predict_y[modelnames].mean(axis=1) +1 - # 模型评估前十均值 - predict_y['min_price'] = predict_y[allmodelnames[0:10]].mean(axis=1) -1.5 + # 模型评估前十均值 + predict_y['min_price'] = predict_y[allmodelnames[0:10]].mean(axis=1) - 1.5 predict_y['mean'] = predict_y[allmodelnames[0:10]].mean(axis=1) - predict_y['max_price'] = predict_y[allmodelnames[0:10]].mean(axis=1) +1.5 + predict_y['max_price'] = predict_y[allmodelnames[0:10]].mean(axis=1) + 1.5 # 模型评估前十最大最小 # allmodelnames 和 predict_y 列 重复的 # allmodelnames = [col for col in allmodelnames if col in predict_y.columns] - # predict_y['min_price'] = predict_y[allmodelnames[0:10]].min(axis=1) + # predict_y['min_price'] = predict_y[allmodelnames[0:10]].min(axis=1) # predict_y['max_price'] = predict_y[allmodelnames[0:10]].max(axis=1) for id in ids: row = predict_y[predict_y['id'] == id] try: - sqlitedb.update_data('accuracy',f"min_price = {row['min_price'].values[0]},max_price = {row['max_price'].values[0]},mean={row['mean'].values[0]}",f"id = {id}") + sqlitedb.update_data( + 'accuracy', f"min_price = {row['min_price'].values[0]},max_price = {row['max_price'].values[0]},mean={row['mean'].values[0]}", f"id = {id}") except: logger.error(f'更新accuracy表中的min_price,max_price,mean值失败,row={row}') - + df = accuracy_df.copy() - df['ds'] = pd.to_datetime(df['ds']) + df['ds'] = pd.to_datetime(df['ds']) df = df.reindex() # 判断预测值在不在布伦特最高最低价范围内,准确率为1,否则为0 @@ -1507,53 +1645,62 @@ def model_losss_juxitingbak(sqlitedb,end_time): # 比较真实最高最低,和预测最高最低 计算准确率 # 全子集情况: if (row['max_price'] >= row['HIGH_PRICE'] and row['min_price'] <= row['LOW_PRICE']) or \ - (row['max_price'] <= row['HIGH_PRICE'] and row['min_price'] >= row['LOW_PRICE']): - return 1 + (row['max_price'] <= row['HIGH_PRICE'] and row['min_price'] >= row['LOW_PRICE']): + return 1 # 无交集情况: if row['max_price'] < row['LOW_PRICE'] or \ - row['min_price'] > row['HIGH_PRICE']: + row['min_price'] > row['HIGH_PRICE']: return 0 # 有交集情况: else: - sorted_prices = sorted([row['LOW_PRICE'], row['min_price'], row['max_price'], row['HIGH_PRICE']]) + sorted_prices = sorted( + [row['LOW_PRICE'], row['min_price'], row['max_price'], row['HIGH_PRICE']]) middle_diff = sorted_prices[2] - sorted_prices[1] price_range = row['HIGH_PRICE'] - row['LOW_PRICE'] accuracy = middle_diff / price_range return accuracy - columns = ['HIGH_PRICE','LOW_PRICE','min_price','max_price'] + columns = ['HIGH_PRICE', 'LOW_PRICE', 'min_price', 'max_price'] df[columns] = df[columns].astype(float) df['ACCURACY'] = df.apply(calculate_accuracy, axis=1) # df['ACCURACY'] = df.apply(is_within_range, axis=1) # 计算准确率并保存结果 - def _get_accuracy_rate(df,create_dates,ds_dates): + def _get_accuracy_rate(df, create_dates, ds_dates): df3 = df.copy() df3 = 
df3[df3['CREAT_DATE'].isin(create_dates)] df3 = df3[df3['ds'].isin(ds_dates)] accuracy_rote = 0 - for i,group in df3.groupby('CREAT_DATE'): - accuracy_rote += (group['ACCURACY'].sum()/len(group))*weight_dict[len(group)-1] - accuracy_rote = round(accuracy_rote,2) - df4 = pd.DataFrame(columns=['开始日期','结束日期','准确率']) - df4.loc[len(df4)] = {'开始日期':ds_dates[0],'结束日期':ds_dates[-1],'准确率':accuracy_rote} - df4.to_sql("accuracy_rote", con=sqlitedb.connection, if_exists='append', index=False) - create_dates,ds_dates = get_week_date(end_time) - _get_accuracy_rate(df,create_dates,ds_dates) - + for i, group in df3.groupby('CREAT_DATE'): + accuracy_rote += (group['ACCURACY'].sum() / + len(group))*weight_dict[len(group)-1] + accuracy_rote = round(accuracy_rote, 2) + df4 = pd.DataFrame(columns=['开始日期', '结束日期', '准确率']) + df4.loc[len(df4)] = {'开始日期': ds_dates[0], + '结束日期': ds_dates[-1], '准确率': accuracy_rote} + df4.to_sql("accuracy_rote", con=sqlitedb.connection, + if_exists='append', index=False) + create_dates, ds_dates = get_week_date(end_time) + _get_accuracy_rate(df, create_dates, ds_dates) + def _add_abs_error_rate(): # 计算每个预测值与真实值之间的偏差率 for model in allmodelnames: - df_combined3[f'{model}_abs_error_rate'] = abs(df_combined3['y'] - df_combined3[model]) / df_combined3['y'] + df_combined3[f'{model}_abs_error_rate'] = abs( + df_combined3['y'] - df_combined3[model]) / df_combined3['y'] # 获取每行对应的最小偏差率值 - min_abs_error_rate_values = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1) + min_abs_error_rate_values = df_combined3.apply( + lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1) # 获取每行对应的最小偏差率值对应的列名 - min_abs_error_rate_column_name = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1) + min_abs_error_rate_column_name = df_combined3.apply( + lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1) # 将列名索引转换为列名 - min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0]) + min_abs_error_rate_column_name = min_abs_error_rate_column_name.map( + lambda x: x.split('_')[0]) # 获取最小偏差率对应的模型的预测值 - min_abs_error_rate_predictions = df_combined3.apply(lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1) + min_abs_error_rate_predictions = df_combined3.apply( + lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1) # 将最小偏差率对应的模型的预测值添加到DataFrame中 df_combined3['min_abs_error_rate_prediction'] = min_abs_error_rate_predictions df_combined3['min_abs_error_rate_column_name'] = min_abs_error_rate_column_name @@ -1567,24 +1714,26 @@ def model_losss_juxitingbak(sqlitedb,end_time): df_combined3[col] = df_combined3[col].round(2) except ValueError: pass - df_combined3.to_csv(os.path.join(dataset,"testandpredict_groupby.csv"),index=False) - - - # 历史价格+预测价格 + df_combined3.to_csv(os.path.join( + dataset, "testandpredict_groupby.csv"), index=False) + + # 历史价格+预测价格 sqlitedb.drop_table('testandpredict_groupby') - df_combined3.to_sql('testandpredict_groupby',sqlitedb.connection,index=False) + df_combined3.to_sql('testandpredict_groupby', + sqlitedb.connection, index=False) def _plt_predict_ture(df): lens = df.shape[0] if df.shape[0] < 180 else 90 - df = df[-lens:] # 取180个数据点画图 + df = df[-lens:] # 取180个数据点画图 # 历史价格 plt.figure(figsize=(20, 10)) # 时间格式更改 df['ds'] = pd.to_datetime(df['ds']) - + plt.plot(df['ds'], df['y'], label='真实值') # 颜色填充 - plt.fill_between(df['ds'], df['max_within_quantile'], 
df['min_within_quantile'], alpha=0.2) + plt.fill_between(df['ds'], df['max_within_quantile'], + df['min_within_quantile'], alpha=0.2) # markers = ['o', 's', '^', 'D', 'v', '*', 'p', 'h', 'H', '+', 'x', 'd'] # random_marker = random.choice(markers) # for model in allmodelnames: @@ -1598,7 +1747,7 @@ def model_losss_juxitingbak(sqlitedb,end_time): plt.text(i, j, str(j), ha='center', va='bottom') for model in most_model: - plt.plot(df['ds'], df[model], label=model,marker='o') + plt.plot(df['ds'], df[model], label=model, marker='o') # 当前日期画竖虚线 plt.axvline(x=df['ds'].iloc[-horizon], color='r', linestyle='--') plt.legend() @@ -1609,22 +1758,22 @@ def model_losss_juxitingbak(sqlitedb,end_time): plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator()) plt.xticks(rotation=45) # 日期标签旋转45度,防止重叠 plt.ylabel('价格') - - plt.savefig(os.path.join(dataset,'历史价格-预测值.png'), bbox_inches='tight') + + plt.savefig(os.path.join(dataset, '历史价格-预测值.png'), bbox_inches='tight') plt.close() - def _plt_modeltopten_predict_ture(df): df['ds'] = pd.to_datetime(df['ds']) df['max_cutoff'] = df.groupby('ds')['CREAT_DATE'].transform('max') df = df[df['CREAT_DATE'] == df['max_cutoff']] df['mean'] = df['mean'].astype(float) lens = df.shape[0] if df.shape[0] < 180 else 180 - df = df[-lens:] # 取180个数据点画图 + df = df[-lens:] # 取180个数据点画图 # 历史价格 plt.figure(figsize=(20, 10)) plt.plot(df['ds'], df['y'], label='真实值') - plt.plot(df['ds'], df['mean'], label='模型前十均值', linestyle='--', color='orange') + plt.plot(df['ds'], df['mean'], label='模型前十均值', + linestyle='--', color='orange') # 颜色填充 plt.fill_between(df['ds'], df['max_price'], df['min_price'], alpha=0.2) # markers = ['o', 's', '^', 'D', 'v', '*', 'p', 'h', 'H', '+', 'x', 'd'] @@ -1646,44 +1795,46 @@ def model_losss_juxitingbak(sqlitedb,end_time): # 自动设置横轴日期显示 plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator()) plt.xticks(rotation=45) # 日期标签旋转45度,防止重叠 - + plt.ylabel('价格') - - plt.savefig(os.path.join(dataset,'历史价格-预测值1.png'), bbox_inches='tight') + + plt.savefig(os.path.join(dataset, '历史价格-预测值1.png'), + bbox_inches='tight') plt.close() - - def _plt_predict_table(df): + def _plt_predict_table(df): # 预测值表格 fig, ax = plt.subplots(figsize=(20, 6)) ax.axis('off') # 关闭坐标轴 # 数值保留2位小数 df = df.round(2) df = df[-horizon:] - df['Day'] = [f'Day_{i}' for i in range(1,horizon+1)] + df['Day'] = [f'Day_{i}' for i in range(1, horizon+1)] # Day列放到最前面 df = df[['Day'] + list(df.columns[:-1])] - table = ax.table(cellText=df.values, colLabels=df.columns, loc='center') - #加宽表格 - table.auto_set_font_size(False) - table.set_fontsize(10) - - # 设置表格样式,列数据最小的用绿色标识 - plt.savefig(os.path.join(dataset,'预测值表格.png'), bbox_inches='tight') - plt.close() - - def _plt_model_results3(): - # 可视化评估结果 - plt.rcParams['font.sans-serif'] = ['SimHei'] - fig, ax = plt.subplots(figsize=(20, 10)) - ax.axis('off') # 关闭坐标轴 - table = ax.table(cellText=model_results3.values, colLabels=model_results3.columns, loc='center') + table = ax.table(cellText=df.values, + colLabels=df.columns, loc='center') # 加宽表格 table.auto_set_font_size(False) table.set_fontsize(10) # 设置表格样式,列数据最小的用绿色标识 - plt.savefig(os.path.join(dataset,'模型评估.png'), bbox_inches='tight') + plt.savefig(os.path.join(dataset, '预测值表格.png'), bbox_inches='tight') + plt.close() + + def _plt_model_results3(): + # 可视化评估结果 + plt.rcParams['font.sans-serif'] = ['SimHei'] + fig, ax = plt.subplots(figsize=(20, 10)) + ax.axis('off') # 关闭坐标轴 + table = ax.table(cellText=model_results3.values, + colLabels=model_results3.columns, loc='center') + # 加宽表格 + table.auto_set_font_size(False) + 
table.set_fontsize(10) + + # 设置表格样式,列数据最小的用绿色标识 + plt.savefig(os.path.join(dataset, '模型评估.png'), bbox_inches='tight') plt.close() _plt_predict_ture(df_combined3) @@ -1692,45 +1843,48 @@ def model_losss_juxitingbak(sqlitedb,end_time): _plt_model_results3() return model_results3 - + # 聚烯烃计算预测评估指数 @exception_logger def model_losss_juxiting(sqlitedb): global dataset global rote - most_model = [sqlitedb.select_data('most_model',columns=['most_common_model'],order_by='ds desc',limit=1).values[0][0]] + most_model = [sqlitedb.select_data('most_model', columns=[ + 'most_common_model'], order_by='ds desc', limit=1).values[0][0]] most_model_name = most_model[0] # 预测数据处理 predict - df_combined = loadcsv(os.path.join(dataset,"cross_validation.csv")) + df_combined = loadcsv(os.path.join(dataset, "cross_validation.csv")) df_combined = dateConvert(df_combined) # 删除空列 - df_combined.dropna(axis=1,inplace=True) - # 删除缺失值,预测过程不能有缺失值 - df_combined.dropna(inplace=True) + df_combined.dropna(axis=1, inplace=True) + # 删除缺失值,预测过程不能有缺失值 + df_combined.dropna(inplace=True) # 其他列转为数值类型 - df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in ['cutoff','ds'] }) + df_combined = df_combined.astype( + {col: 'float32' for col in df_combined.columns if col not in ['cutoff', 'ds']}) # 使用 groupby 和 transform 结合 lambda 函数来获取每个分组中 cutoff 的最小值,并创建一个新的列来存储这个最大值 - df_combined['max_cutoff'] = df_combined.groupby('ds')['cutoff'].transform('max') + df_combined['max_cutoff'] = df_combined.groupby( + 'ds')['cutoff'].transform('max') # 然后筛选出那些 cutoff 等于 max_cutoff 的行,这样就得到了每个分组中 cutoff 最大的行,并保留了其他列 - df_combined = df_combined[df_combined['cutoff'] == df_combined['max_cutoff']] + df_combined = df_combined[df_combined['cutoff'] + == df_combined['max_cutoff']] # 删除模型生成的cutoff列 df_combined.drop(columns=['cutoff', 'max_cutoff'], inplace=True) # 获取模型名称 - modelnames = df_combined.columns.to_list()[1:] + modelnames = df_combined.columns.to_list()[1:] if 'y' in modelnames: modelnames.remove('y') if 'ds' in modelnames: modelnames.remove('ds') df_combined3 = df_combined.copy() # 备份df_combined,后面画图需要 - # 空的列表存储每个模型的MSE、RMSE、MAE、MAPE、SMAPE cellText = [] - # 遍历模型名称,计算模型评估指标 + # 遍历模型名称,计算模型评估指标 for model in modelnames: modelmse = mse(df_combined['y'], df_combined[model]) modelrmse = rmse(df_combined['y'], df_combined[model]) @@ -1738,27 +1892,31 @@ def model_losss_juxiting(sqlitedb): # modelmape = mape(df_combined['y'], df_combined[model]) # modelsmape = smape(df_combined['y'], df_combined[model]) # modelr2 = r2_score(df_combined['y'], df_combined[model]) - cellText.append([model,round(modelmse, 3), round(modelrmse, 3), round(modelmae, 3)]) - - model_results3 = pd.DataFrame(cellText,columns=['模型(Model)','平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)']) + cellText.append([model, round(modelmse, 3), round( + modelrmse, 3), round(modelmae, 3)]) + + model_results3 = pd.DataFrame( + cellText, columns=['模型(Model)', '平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)']) # 按MSE降序排列 - model_results3 = model_results3.sort_values(by='平均平方误差(MSE)', ascending=True) - model_results3.to_csv(os.path.join(dataset,"model_evaluation.csv"),index=False) + model_results3 = model_results3.sort_values( + by='平均平方误差(MSE)', ascending=True) + model_results3.to_csv(os.path.join( + dataset, "model_evaluation.csv"), index=False) modelnames = model_results3['模型(Model)'].tolist() allmodelnames = modelnames.copy() # 保存5个最佳模型的名称 if len(modelnames) > 5: modelnames = modelnames[0:5] - if is_fivemodels: + if is_fivemodels: pass else: - with 
open(os.path.join(dataset,"best_modelnames.txt"), 'w') as f: + with open(os.path.join(dataset, "best_modelnames.txt"), 'w') as f: f.write(','.join(modelnames) + '\n') # 预测值与真实值对比图 plt.rcParams['font.sans-serif'] = ['SimHei'] plt.figure(figsize=(15, 10)) - for n,model in enumerate(modelnames[:5]): + for n, model in enumerate(modelnames[:5]): plt.subplot(3, 2, n+1) plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值') plt.plot(df_combined3['ds'], df_combined3[model], label=model) @@ -1767,20 +1925,19 @@ def model_losss_juxiting(sqlitedb): plt.ylabel('价格') plt.title(model+'拟合') plt.subplots_adjust(hspace=0.5) - plt.savefig(os.path.join(dataset,'预测值与真实值对比图.png'), bbox_inches='tight') + plt.savefig(os.path.join(dataset, '预测值与真实值对比图.png'), bbox_inches='tight') plt.close() - - + # # 历史数据+预测数据 # # 拼接未来时间预测 - df_predict = pd.read_csv(os.path.join(dataset,'predict.csv')) - df_predict.drop('unique_id',inplace=True,axis=1) - df_predict.dropna(axis=1,inplace=True) + df_predict = pd.read_csv(os.path.join(dataset, 'predict.csv')) + df_predict.drop('unique_id', inplace=True, axis=1) + df_predict.dropna(axis=1, inplace=True) try: - df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y-%m-%d') - except ValueError : - df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y/%m/%d') + df_predict['ds'] = pd.to_datetime(df_predict['ds'], format=r'%Y-%m-%d') + except ValueError: + df_predict['ds'] = pd.to_datetime(df_predict['ds'], format=r'%Y/%m/%d') def first_row_to_database(df): # # 取第一行数据存储到数据库中 @@ -1788,19 +1945,25 @@ def model_losss_juxiting(sqlitedb): first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00') # 将预测结果保存到数据库 if not sqlitedb.check_table_exists('trueandpredict'): - first_row.to_sql('trueandpredict',sqlitedb.connection,index=False) + first_row.to_sql('trueandpredict', + sqlitedb.connection, index=False) else: for col in first_row.columns: - sqlitedb.add_column_if_not_exists('trueandpredict',col,'TEXT') + sqlitedb.add_column_if_not_exists( + 'trueandpredict', col, 'TEXT') for row in first_row.itertuples(index=False): row_dict = row._asdict() - columns=row_dict.keys() - check_query = sqlitedb.select_data('trueandpredict',where_condition = f"ds = '{row.ds}'") + columns = row_dict.keys() + check_query = sqlitedb.select_data( + 'trueandpredict', where_condition=f"ds = '{row.ds}'") if len(check_query) > 0: - set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()]) - sqlitedb.update_data('trueandpredict',set_clause,where_condition = f"ds = '{row.ds}'") + set_clause = ", ".join( + [f"{key} = '{value}'" for key, value in row_dict.items()]) + sqlitedb.update_data( + 'trueandpredict', set_clause, where_condition=f"ds = '{row.ds}'") continue - sqlitedb.insert_data('trueandpredict',tuple(row_dict.values()),columns=columns) + sqlitedb.insert_data('trueandpredict', tuple( + row_dict.values()), columns=columns) first_row_to_database(df_predict) @@ -1810,10 +1973,12 @@ def model_losss_juxiting(sqlitedb): names = [] names_df = df_combined3.copy() for col in allmodelnames: - names_df[f'{col}-{most_model_name}-误差比例'] = abs(names_df[col] - names_df[most_model_name]) / names_df[most_model_name] + names_df[f'{col}-{most_model_name}-误差比例'] = abs( + names_df[col] - names_df[most_model_name]) / names_df[most_model_name] names.append(f'{col}-{most_model_name}-误差比例') names_df = names_df[names] + def add_rote_column(row): columns = [] for r in names_df.columns: @@ -1821,52 +1986,58 @@ def model_losss_juxiting(sqlitedb): columns.append(r.split('-')[0]) return 
pd.Series([columns], index=['columns']) names_df['columns'] = names_df.apply(add_rote_column, axis=1) - + def add_upper_lower_bound(row): print(row['columns']) print(type(row['columns'])) # 计算上边界值 - upper_bound = df_combined3.loc[row.name,row['columns']].max() + upper_bound = df_combined3.loc[row.name, row['columns']].max() # 计算下边界值 - lower_bound = df_combined3.loc[row.name,row['columns']].min() + lower_bound = df_combined3.loc[row.name, row['columns']].min() return pd.Series([lower_bound, upper_bound], index=['min_within_quantile', 'max_within_quantile']) - df_combined3[['min_within_quantile','max_within_quantile']] = names_df.apply(add_upper_lower_bound, axis=1) - - + df_combined3[['min_within_quantile', 'max_within_quantile'] + ] = names_df.apply(add_upper_lower_bound, axis=1) + def find_most_common_model(): # 最多频率的模型名称 - min_model_max_frequency_model = df_combined3['min_model'].tail(20).value_counts().idxmax() - max_model_max_frequency_model = df_combined3['max_model'].tail(20).value_counts().idxmax() + min_model_max_frequency_model = df_combined3['min_model'].tail( + 20).value_counts().idxmax() + max_model_max_frequency_model = df_combined3['max_model'].tail( + 20).value_counts().idxmax() if min_model_max_frequency_model == max_model_max_frequency_model: # 取20天第二多的模型 - max_model_max_frequency_model = df_combined3['max_model'].tail(20).value_counts().nlargest(2).index[1] + max_model_max_frequency_model = df_combined3['max_model'].tail( + 20).value_counts().nlargest(2).index[1] df_predict['min_model'] = min_model_max_frequency_model df_predict['max_model'] = max_model_max_frequency_model df_predict['min_within_quantile'] = df_predict[min_model_max_frequency_model] df_predict['max_within_quantile'] = df_predict[max_model_max_frequency_model] - # find_most_common_model() df_predict2 = df_predict.copy() df_predict2['ds'] = pd.to_datetime(df_predict2['ds']) df_predict2['ds'] = df_predict2['ds'].dt.strftime('%Y-%m-%d') - def _add_abs_error_rate(): # 计算每个预测值与真实值之间的偏差率 for model in allmodelnames: - df_combined3[f'{model}_abs_error_rate'] = abs(df_combined3['y'] - df_combined3[model]) / df_combined3['y'] + df_combined3[f'{model}_abs_error_rate'] = abs( + df_combined3['y'] - df_combined3[model]) / df_combined3['y'] # 获取每行对应的最小偏差率值 - min_abs_error_rate_values = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1) + min_abs_error_rate_values = df_combined3.apply( + lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1) # 获取每行对应的最小偏差率值对应的列名 - min_abs_error_rate_column_name = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1) + min_abs_error_rate_column_name = df_combined3.apply( + lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1) # 将列名索引转换为列名 - min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0]) + min_abs_error_rate_column_name = min_abs_error_rate_column_name.map( + lambda x: x.split('_')[0]) # 获取最小偏差率对应的模型的预测值 - min_abs_error_rate_predictions = df_combined3.apply(lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1) + min_abs_error_rate_predictions = df_combined3.apply( + lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1) # 将最小偏差率对应的模型的预测值添加到DataFrame中 df_combined3['min_abs_error_rate_prediction'] = min_abs_error_rate_predictions df_combined3['min_abs_error_rate_column_name'] = min_abs_error_rate_column_name @@ -1881,21 +2052,23 @@ def 
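# --- Illustrative sketch (editor's addition, not part of the diff): the confidence band
# built above keeps, for each day, only the models whose prediction deviates from the
# reference model by at most the global threshold `rote`, then takes the min/max of the
# surviving predictions. Function and argument names are placeholders; the 4% default
# for `rote` is an assumption.
import pandas as pd

def band_from_error_ratio(df: pd.DataFrame, models: list, ref_model: str, rote: float = 0.04):
    out = df.copy()
    lower, upper = [], []
    for _, row in df.iterrows():
        # error ratio of every model against the reference model's prediction
        ratios = {m: abs(row[m] - row[ref_model]) / row[ref_model] for m in models}
        kept = [m for m, r in ratios.items() if r <= rote] or [ref_model]
        values = [row[m] for m in kept]
        lower.append(min(values))
        upper.append(max(values))
    out['min_within_quantile'] = lower
    out['max_within_quantile'] = upper
    return out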
model_losss_juxiting(sqlitedb): df_combined3[col] = df_combined3[col].round(2) except ValueError: pass - df_combined3.to_csv(os.path.join(dataset,"testandpredict_groupby.csv"),index=False) - - - # 历史价格+预测价格 + df_combined3.to_csv(os.path.join( + dataset, "testandpredict_groupby.csv"), index=False) + + # 历史价格+预测价格 sqlitedb.drop_table('testandpredict_groupby') - df_combined3.to_sql('testandpredict_groupby',sqlitedb.connection,index=False) - + df_combined3.to_sql('testandpredict_groupby', + sqlitedb.connection, index=False) + def _plt_predict_ture(df): lens = df.shape[0] if df.shape[0] < 180 else 90 - df = df[-lens:] # 取180个数据点画图 + df = df[-lens:] # 取180个数据点画图 # 历史价格 plt.figure(figsize=(20, 10)) plt.plot(df['ds'], df['y'], label='真实值') # 颜色填充 - plt.fill_between(df['ds'], df['max_within_quantile'], df['min_within_quantile'], alpha=0.2) + plt.fill_between(df['ds'], df['max_within_quantile'], + df['min_within_quantile'], alpha=0.2) # markers = ['o', 's', '^', 'D', 'v', '*', 'p', 'h', 'H', '+', 'x', 'd'] # random_marker = random.choice(markers) # for model in allmodelnames: @@ -1909,47 +2082,49 @@ def model_losss_juxiting(sqlitedb): plt.text(i, j, str(j), ha='center', va='bottom') for model in most_model: - plt.plot(df['ds'], df[model], label=model,marker='o') + plt.plot(df['ds'], df[model], label=model, marker='o') # 当前日期画竖虚线 plt.axvline(x=df['ds'].iloc[-horizon], color='r', linestyle='--') plt.legend() plt.xlabel('日期') plt.ylabel('价格') - - plt.savefig(os.path.join(dataset,'历史价格-预测值.png'), bbox_inches='tight') + + plt.savefig(os.path.join(dataset, '历史价格-预测值.png'), bbox_inches='tight') plt.close() - def _plt_predict_table(df): + def _plt_predict_table(df): # 预测值表格 fig, ax = plt.subplots(figsize=(20, 6)) ax.axis('off') # 关闭坐标轴 # 数值保留2位小数 df = df.round(2) df = df[-horizon:] - df['Day'] = [f'Day_{i}' for i in range(1,horizon+1)] + df['Day'] = [f'Day_{i}' for i in range(1, horizon+1)] # Day列放到最前面 df = df[['Day'] + list(df.columns[:-1])] - table = ax.table(cellText=df.values, colLabels=df.columns, loc='center') - #加宽表格 - table.auto_set_font_size(False) - table.set_fontsize(10) - - # 设置表格样式,列数据最小的用绿色标识 - plt.savefig(os.path.join(dataset,'预测值表格.png'), bbox_inches='tight') - plt.close() - - def _plt_model_results3(): - # 可视化评估结果 - plt.rcParams['font.sans-serif'] = ['SimHei'] - fig, ax = plt.subplots(figsize=(20, 10)) - ax.axis('off') # 关闭坐标轴 - table = ax.table(cellText=model_results3.values, colLabels=model_results3.columns, loc='center') + table = ax.table(cellText=df.values, + colLabels=df.columns, loc='center') # 加宽表格 table.auto_set_font_size(False) table.set_fontsize(10) # 设置表格样式,列数据最小的用绿色标识 - plt.savefig(os.path.join(dataset,'模型评估.png'), bbox_inches='tight') + plt.savefig(os.path.join(dataset, '预测值表格.png'), bbox_inches='tight') + plt.close() + + def _plt_model_results3(): + # 可视化评估结果 + plt.rcParams['font.sans-serif'] = ['SimHei'] + fig, ax = plt.subplots(figsize=(20, 10)) + ax.axis('off') # 关闭坐标轴 + table = ax.table(cellText=model_results3.values, + colLabels=model_results3.columns, loc='center') + # 加宽表格 + table.auto_set_font_size(False) + table.set_fontsize(10) + + # 设置表格样式,列数据最小的用绿色标识 + plt.savefig(os.path.join(dataset, '模型评估.png'), bbox_inches='tight') plt.close() _plt_predict_ture(df_combined3) @@ -1959,100 +2134,106 @@ def model_losss_juxiting(sqlitedb): return model_results3 -import matplotlib.dates as mdates @exception_logger -def brent_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inputsize=5,dataset='dataset',time = 
'2024-07-30',reportname='report.pdf',sqlitedb='jbsh_yuanyou.db'): +def brent_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, inputsize=5, dataset='dataset', time='2024-07-30', reportname='report.pdf', sqlitedb='jbsh_yuanyou.db'): global y # 创建内容对应的空列表 content = list() # 获取特征的近一月值 import pandas as pd - feature_data_df = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'), parse_dates=['ds']).tail(60) - def draw_feature_trend(feature_data_df, features): - # 画特征近60天的趋势图 - feature_df = feature_data_df[['ds','y']+features] - # 遍历X每一列,和yy画散点图 , - - for i, col in enumerate(features): - # try: - print(f'正在绘制第{i+1}个特征{col}与价格散点图...') - if col not in ['ds', 'y']: - fig, ax1 = plt.subplots(figsize=(10, 6)) - # 在第一个坐标轴上绘制数据 - sns.lineplot(data=feature_df, x='ds', y='y', ax=ax1, color='b') - ax1.set_xlabel('日期') - ax1.set_ylabel('y', color='b') - ax1.tick_params('y', colors='b') - # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠 - for j in range(1, len(feature_df), 2): - value = feature_df['y'].iloc[j] - date = feature_df['ds'].iloc[j] - offset = 1.001 - ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10) - # 创建第二个坐标轴 - ax2 = ax1.twinx() - # 在第二个坐标轴上绘制数据 - sns.lineplot(data=feature_df, x='ds', y=col, ax=ax2, color='r') - ax2.set_ylabel(col, color='r') - ax2.tick_params('y', colors='r') - # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠 - for j in range(0, len(feature_df), 2): - value = feature_df[col].iloc[j] - date = feature_df['ds'].iloc[j] - offset = 1.0003 - ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10) - # 添加标题 - plt.title(col) - # 设置横坐标为日期格式并自动调整 - locator = mdates.AutoDateLocator() - formatter = mdates.AutoDateFormatter(locator) - ax1.xaxis.set_major_locator(locator) - ax1.xaxis.set_major_formatter(formatter) - # 文件名特殊字符处理 - col = col.replace('*', '-') - col = col.replace(':', '-') - col = col.replace(r'/', '-') - plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png')) - content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png'))) - plt.close() - # except Exception as e: - # print(f'绘制第{i+1}个特征{col}与价格散点图时出错:{e}') - + feature_data_df = pd.read_csv(os.path.join( + dataset, '指标数据添加时间特征.csv'), parse_dates=['ds']).tail(60) - ### 添加标题 + def draw_feature_trend(feature_data_df, features): + # 画特征近60天的趋势图 + feature_df = feature_data_df[['ds', 'y']+features] + # 遍历X每一列,和yy画散点图 , + + for i, col in enumerate(features): + # try: + print(f'正在绘制第{i+1}个特征{col}与价格散点图...') + if col not in ['ds', 'y']: + fig, ax1 = plt.subplots(figsize=(10, 6)) + # 在第一个坐标轴上绘制数据 + sns.lineplot(data=feature_df, x='ds', y='y', ax=ax1, color='b') + ax1.set_xlabel('日期') + ax1.set_ylabel('y', color='b') + ax1.tick_params('y', colors='b') + # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(1, len(feature_df), 2): + value = feature_df['y'].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax1.text(date, value * offset, str(round(value, 2)), + ha='center', va='bottom', color='b', fontsize=10) + # 创建第二个坐标轴 + ax2 = ax1.twinx() + # 在第二个坐标轴上绘制数据 + sns.lineplot(data=feature_df, x='ds', y=col, ax=ax2, color='r') + ax2.set_ylabel(col, color='r') + ax2.tick_params('y', colors='r') + # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(0, len(feature_df), 2): + value = feature_df[col].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.0003 + ax2.text(date, value * offset, str(round(value, 2)), + ha='center', va='bottom', color='r', fontsize=10) + # 添加标题 + plt.title(col) + # 设置横坐标为日期格式并自动调整 + locator = mdates.AutoDateLocator() + 
formatter = mdates.AutoDateFormatter(locator) + ax1.xaxis.set_major_locator(locator) + ax1.xaxis.set_major_formatter(formatter) + # 文件名特殊字符处理 + col = col.replace('*', '-') + col = col.replace(':', '-') + col = col.replace(r'/', '-') + plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png')) + content.append(Graphs.draw_img( + os.path.join(dataset, f'{col}与价格散点图.png'))) + plt.close() + # except Exception as e: + # print(f'绘制第{i+1}个特征{col}与价格散点图时出错:{e}') + + # 添加标题 content.append(Graphs.draw_title(f'{y}{time}预测报告')) - ### 预测结果 + # 预测结果 content.append(Graphs.draw_little_title('一、预测结果:')) # 添加历史走势及预测价格的走势图片 - content.append(Graphs.draw_img(os.path.join(dataset,'历史价格-预测值.png'))) + content.append(Graphs.draw_img(os.path.join(dataset, '历史价格-预测值.png'))) # 波动率画图逻辑 content.append(Graphs.draw_text('图示说明:')) - content.append(Graphs.draw_text(' 确定置信区间:设置残差置信阈值,以每周最佳模型为基准,选取在置信区间的预测值作为置信区间;')) + content.append(Graphs.draw_text( + ' 确定置信区间:设置残差置信阈值,以每周最佳模型为基准,选取在置信区间的预测值作为置信区间;')) - # 添加历史走势及预测价格的走势图片 - content.append(Graphs.draw_img(os.path.join(dataset,'历史价格-预测值1.png'))) + content.append(Graphs.draw_img(os.path.join(dataset, '历史价格-预测值1.png'))) content.append(Graphs.draw_text('图示说明:')) - content.append(Graphs.draw_text(' 确定置信区间:使用模型评估指标MAE得到前十个模型,取平均值上下1.5作为价格波动置信区间;')) - + content.append(Graphs.draw_text( + ' 确定置信区间:使用模型评估指标MAE得到前十个模型,取平均值上下1.5作为价格波动置信区间;')) # 取df中y列为空的行 import pandas as pd - df = pd.read_csv(os.path.join(dataset,'predict.csv'),encoding='gbk') - df_true = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') # 获取预测日期对应的真实值 - df_true = df_true[['ds','y']] - eval_df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8') + df = pd.read_csv(os.path.join(dataset, 'predict.csv'), encoding='gbk') + df_true = pd.read_csv(os.path.join( + dataset, '指标数据添加时间特征.csv'), encoding='utf-8') # 获取预测日期对应的真实值 + df_true = df_true[['ds', 'y']] + eval_df = pd.read_csv(os.path.join( + dataset, 'model_evaluation.csv'), encoding='utf-8') # 按评估指标排序,取前五 fivemodels_list = eval_df['模型(Model)'].values # 列表形式,后面当作列名索引使用 # 取 fivemodels_list 和 ds 列 - df = df[['ds'] + fivemodels_list.tolist() ] + df = df[['ds'] + fivemodels_list.tolist()] # 拼接预测日期对应的真实值 df = pd.merge(df, df_true, on='ds', how='left') # 删除全部为nan的列 df = df.dropna(how='all', axis=1) # 选择除 'ds' 列外的数值列,并进行类型转换和四舍五入 - num_cols = [col for col in df.columns if col!= 'ds' and pd.api.types.is_numeric_dtype(df[col])] + num_cols = [col for col in df.columns if col != + 'ds' and pd.api.types.is_numeric_dtype(df[col])] for col in num_cols: df[col] = df[col].astype(float).round(2) # 添加最大值、最小值、平均值三列 @@ -2066,72 +2247,78 @@ def brent_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inpu # 添加预测值表格 data = df.values.tolist() col_width = 500/len(df.columns) - content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_table(col_width, *data)) content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:')) - df = pd.read_csv(os.path.join(dataset,'testandpredict_groupby.csv'),encoding='utf-8') - df4 = df.copy() # 计算偏差率使用 + df = pd.read_csv(os.path.join( + dataset, 'testandpredict_groupby.csv'), encoding='utf-8') + df4 = df.copy() # 计算偏差率使用 # 去掉created_dt 列 df4 = df4.drop(columns=['created_dt']) # 计算模型偏差率 - #计算各列对于y列的差值百分比 + # 计算各列对于y列的差值百分比 df3 = pd.DataFrame() # 存储偏差率 - + # 删除有null的行 df4 = df4.dropna() df3['ds'] = df4['ds'] for col in fivemodels_list: - df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100,2) + df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100, 2) # 找出决定系数前五的偏差率 df3 = 
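# --- Illustrative sketch (editor's addition, not part of the diff): the 偏差率 table in
# section 二 is the absolute deviation of each top model from the realised value, in
# percent, over the last `inputsize` days. Names below are placeholders.
import pandas as pd

def deviation_rate_table(df: pd.DataFrame, models: list, inputsize: int = 5) -> pd.DataFrame:
    df = df.dropna(subset=['y'])
    out = pd.DataFrame({'ds': df['ds']})
    for m in models:
        out[m] = (abs(df[m] - df['y']) / df['y'] * 100).round(2)
    # keep only the most recent forecast window, matching the report table
    return out[['ds'] + list(models)][-inputsize:]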
df3[['ds']+fivemodels_list.tolist()][-inputsize:] # 找出上一预测区间的时间 stime = df3['ds'].iloc[0] etime = df3['ds'].iloc[-1] # 添加偏差率表格 - fivemodels = '、'.join(eval_df['模型(Model)'].values[:5]) # 字符串形式,后面写入字符串使用 - content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime}的偏差率(%)分别是:')) + fivemodels = '、'.join(eval_df['模型(Model)'].values[:5]) # 字符串形式,后面写入字符串使用 + content.append(Graphs.draw_text( + f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime}的偏差率(%)分别是:')) # # 添加偏差率表格 df3 = df3.T df3 = df3.reset_index() data = df3.values.tolist() col_width = 500/len(df3.columns) - content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_table(col_width, *data)) - content.append(Graphs.draw_little_title('上一周预测准确率:')) - df4 = sqlitedb.select_data('accuracy_rote',order_by='结束日期 desc',limit=1) + df4 = sqlitedb.select_data('accuracy_rote', order_by='结束日期 desc', limit=1) df4 = df4.T df4 = df4.reset_index() df4 = df4.T data = df4.values.tolist() col_width = 500/len(df4.columns) - content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_table(col_width, *data)) content.append(Graphs.draw_little_title('三、预测过程解析:')) - ### 特征、模型、参数配置 + # 特征、模型、参数配置 content.append(Graphs.draw_little_title('模型选择:')) - content.append(Graphs.draw_text(f'本次预测使用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型:')) + content.append(Graphs.draw_text( + f'本次预测使用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型:')) content.append(Graphs.draw_text(f'使用40天的数据预测未来{inputsize}天的数据。')) content.append(Graphs.draw_little_title('指标情况:')) - with open(os.path.join(dataset,'特征频度统计.txt'),encoding='utf-8') as f: + with open(os.path.join(dataset, '特征频度统计.txt'), encoding='utf-8') as f: for line in f.readlines(): content.append(Graphs.draw_text(line)) - data = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') # 计算相关系数用 - df_zhibiaofenlei = loadcsv(os.path.join(dataset,'特征处理后的指标名称及分类.csv')) # 气泡图用 + data = pd.read_csv(os.path.join(dataset, '指标数据添加时间特征.csv'), + encoding='utf-8') # 计算相关系数用 + df_zhibiaofenlei = loadcsv(os.path.join( + dataset, '特征处理后的指标名称及分类.csv')) # 气泡图用 df_zhibiaoshuju = data.copy() # 气泡图用 # 绘制特征相关气泡图 - + grouped = df_zhibiaofenlei.groupby('指标分类') grouped_corr = pd.DataFrame(columns=['指标分类', '指标数量', '相关性总和']) - + content.append(Graphs.draw_little_title('按指标分类分别与预测目标进行皮尔逊相关系数分析:')) - content.append(Graphs.draw_text('''皮尔逊相关系数说明:''')) - content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。''')) + content.append(Graphs.draw_text('''皮尔逊相关系数说明:''')) + content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。''')) content.append(Graphs.draw_text(''' - 相关系数为1:表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。''')) - content.append(Graphs.draw_text('''相关系数为-1:表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的''')) - content.append(Graphs.draw_text('''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。''')) + 相关系数为1:表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。''')) + content.append(Graphs.draw_text( + '''相关系数为-1:表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的''')) + content.append(Graphs.draw_text( + '''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。''')) for name, group in grouped: cols = group['指标名称'].tolist() logger.info(f'开始绘制{name}类指标的相关性直方图') @@ -2140,72 +2327,89 @@ def brent_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inpu correlation_matrix = df_zhibiaoshuju[feature_names].corr()['y'] # 绘制特征相关性直方分布图 - plt.figure(figsize=(10,8)) - sns.histplot(correlation_matrix.values.flatten(), bins=20, 
kde=True, color='skyblue') + plt.figure(figsize=(10, 8)) + sns.histplot(correlation_matrix.values.flatten(), + bins=20, kde=True, color='skyblue') plt.title(f'{name}类指标(共{len(cols_subset)}个)相关性直方分布图') plt.xlabel('相关系数') plt.ylabel('频数') - plt.savefig(os.path.join(dataset, f'{name}类指标相关性直方分布图.png'), bbox_inches='tight') + plt.savefig(os.path.join( + dataset, f'{name}类指标相关性直方分布图.png'), bbox_inches='tight') plt.close() - content.append(Graphs.draw_img(os.path.join(dataset,f'{name}类指标相关性直方分布图.png'))) - content.append(Graphs.draw_text(f'{name}类指标(共{len(cols_subset)}个)的相关性直方分布图如上所示。')) + content.append(Graphs.draw_img( + os.path.join(dataset, f'{name}类指标相关性直方分布图.png'))) + content.append(Graphs.draw_text( + f'{name}类指标(共{len(cols_subset)}个)的相关性直方分布图如上所示。')) # 相关性大于0的特征 - positive_corr_features = correlation_matrix[correlation_matrix > 0].sort_values(ascending=False).index.tolist()[1:] - - print(f'{name}下正相关的特征值有:',positive_corr_features) + positive_corr_features = correlation_matrix[correlation_matrix > 0].sort_values( + ascending=False).index.tolist()[1:] + + print(f'{name}下正相关的特征值有:', positive_corr_features) if len(positive_corr_features) > 5: positive_corr_features = positive_corr_features[0:5] - content.append(Graphs.draw_text(f'{name}类指标中,与预测目标y正相关前五的特征有:{positive_corr_features}')) + content.append(Graphs.draw_text( + f'{name}类指标中,与预测目标y正相关前五的特征有:{positive_corr_features}')) draw_feature_trend(feature_data_df, positive_corr_features) elif len(positive_corr_features) == 0: pass else: positive_corr_features = positive_corr_features - content.append(Graphs.draw_text(f'其中,与预测目标y正相关的特征有:{positive_corr_features}')) + content.append(Graphs.draw_text( + f'其中,与预测目标y正相关的特征有:{positive_corr_features}')) draw_feature_trend(feature_data_df, positive_corr_features) - + # 相关性小于0的特征 - negative_corr_features = correlation_matrix[correlation_matrix < 0].sort_values(ascending=True).index.tolist() - - print(f'{name}下负相关的特征值有:',negative_corr_features) + negative_corr_features = correlation_matrix[correlation_matrix < 0].sort_values( + ascending=True).index.tolist() + + print(f'{name}下负相关的特征值有:', negative_corr_features) if len(negative_corr_features) > 5: negative_corr_features = negative_corr_features[:5] - content.append(Graphs.draw_text(f'与预测目标y负相关前五的特征有:{negative_corr_features}')) + content.append(Graphs.draw_text( + f'与预测目标y负相关前五的特征有:{negative_corr_features}')) draw_feature_trend(feature_data_df, negative_corr_features) elif len(negative_corr_features) == 0: pass else: - content.append(Graphs.draw_text(f'{name}类指标中,与预测目标y负相关的特征有:{negative_corr_features}')) + content.append(Graphs.draw_text( + f'{name}类指标中,与预测目标y负相关的特征有:{negative_corr_features}')) draw_feature_trend(feature_data_df, negative_corr_features) # 计算correlation_sum 第一行的相关性的绝对值的总和 correlation_sum = correlation_matrix.abs().sum() logger.info(f'{name}类指标的相关性总和为:{correlation_sum}') # 分组的相关性总和拼接到grouped_corr - goup_corr = pd.DataFrame({'指标分类': [name], '指标数量': [len(cols_subset)], '相关性总和': [correlation_sum]}) - grouped_corr = pd.concat([grouped_corr, goup_corr], axis=0, ignore_index=True) + goup_corr = pd.DataFrame( + {'指标分类': [name], '指标数量': [len(cols_subset)], '相关性总和': [correlation_sum]}) + grouped_corr = pd.concat( + [grouped_corr, goup_corr], axis=0, ignore_index=True) # 绘制相关性总和的气泡图 logger.info(f'开始绘制相关性总和的气泡图') plt.figure(figsize=(10, 10)) - sns.scatterplot(data=grouped_corr, x='相关性总和', y='指标数量', size='相关性总和', sizes=(grouped_corr['相关性总和'].min()*5, grouped_corr['相关性总和'].max()*5), hue='指标分类', palette='viridis') + sns.scatterplot(data=grouped_corr, 
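# --- Illustrative sketch (editor's addition, not part of the diff): the per-category
# correlation summary that feeds the histogram and bubble chart above. `values_df` holds
# the indicator data with a 'y' target column and `mapping` maps 指标名称 to 指标分类;
# both argument names are placeholders.
import pandas as pd

def category_correlation_summary(values_df: pd.DataFrame, mapping: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for name, group in mapping.groupby('指标分类'):
        cols = [c for c in group['指标名称'] if c in values_df.columns]
        if not cols:
            continue
        # Pearson correlation of every indicator in the category with the target
        corr = values_df[cols + ['y']].corr()['y'].drop('y')
        rows.append({'指标分类': name, '指标数量': len(cols), '相关性总和': float(corr.abs().sum())})
    return pd.DataFrame(rows)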
x='相关性总和', y='指标数量', size='相关性总和', sizes=( + grouped_corr['相关性总和'].min()*5, grouped_corr['相关性总和'].max()*5), hue='指标分类', palette='viridis') plt.title('指标分类相关性总和的气泡图') plt.ylabel('数量') - plt.savefig(os.path.join(dataset, '指标分类相关性总和的气泡图.png'), bbox_inches='tight') + plt.savefig(os.path.join(dataset, '指标分类相关性总和的气泡图.png'), + bbox_inches='tight') plt.close() - content.append(Graphs.draw_img(os.path.join(dataset,'指标分类相关性总和的气泡图.png'))) - content.append(Graphs.draw_text('气泡图中,横轴为指标分类,纵轴为指标分类下的特征数量,气泡的面积越大表示该分类中特征的相关系数和越大。')) + content.append(Graphs.draw_img(os.path.join(dataset, '指标分类相关性总和的气泡图.png'))) + content.append(Graphs.draw_text( + '气泡图中,横轴为指标分类,纵轴为指标分类下的特征数量,气泡的面积越大表示该分类中特征的相关系数和越大。')) logger.info(f'绘制相关性总和的气泡图结束') content.append(Graphs.draw_little_title('模型选择:')) - content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练拟合,通过评估指标MAE从小到大排列,前5个模型的简介如下:')) - ### 读取模型简介 - with open(os.path.join(dataset,'model_introduction.txt'), 'r', encoding='utf-8') as f: + content.append(Graphs.draw_text( + f'预测使用了{num_models}个模型进行训练拟合,通过评估指标MAE从小到大排列,前5个模型的简介如下:')) + # 读取模型简介 + with open(os.path.join(dataset, 'model_introduction.txt'), 'r', encoding='utf-8') as f: for line in f: line_split = line.strip().split('--') if line_split[0] in fivemodels_list: for introduction in line_split: content.append(Graphs.draw_text(introduction)) content.append(Graphs.draw_little_title('模型评估:')) - df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8') + df = pd.read_csv(os.path.join( + dataset, 'model_evaluation.csv'), encoding='utf-8') # 判断 df 的数值列转为float for col in eval_df.columns: if col not in ['模型(Model)']: @@ -2221,21 +2425,24 @@ def brent_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inpu # # 添加表格 data = eval_df.values.tolist() col_width = 500/len(eval_df.columns) - content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_table(col_width, *data)) content.append(Graphs.draw_text('评估指标释义:')) - content.append(Graphs.draw_text('1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) - content.append(Graphs.draw_text('2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) - content.append(Graphs.draw_text('3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text( + '1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text( + '2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text( + '3. 
平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) content.append(Graphs.draw_text('模型拟合:')) # 添加图片 - content.append(Graphs.draw_img(os.path.join(dataset,'预测值与真实值对比图.png'))) - ### 生成pdf文件 - doc = SimpleDocTemplate(os.path.join(dataset,reportname), pagesize=letter) + content.append(Graphs.draw_img(os.path.join(dataset, '预测值与真实值对比图.png'))) + # 生成pdf文件 + doc = SimpleDocTemplate(os.path.join(dataset, reportname), pagesize=letter) doc.build(content) # pdf 上传到数字化信息平台 try: if is_update_report: - with open(os.path.join(dataset,reportname), 'rb') as f: + with open(os.path.join(dataset, reportname), 'rb') as f: base64_data = base64.b64encode(f.read()).decode('utf-8') upload_data["data"]["fileBase64"] = base64_data upload_data["data"]["fileName"] = reportname @@ -2244,88 +2451,96 @@ def brent_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inpu except TimeoutError as e: print(f"请求超时: {e}") + @exception_logger -def pp_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inputsize=5,dataset='dataset',time = '2024-07-30',reportname='report.pdf',sqlitedb='jbsh_yuanyou.db'): +def pp_export_pdf(num_indicators=475, num_models=21, num_dayindicator=202, inputsize=5, dataset='dataset', time='2024-07-30', reportname='report.pdf', sqlitedb='jbsh_yuanyou.db'): global y # 创建内容对应的空列表 content = list() # 获取特征的近一月值 import pandas as pd - feature_data_df = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'), parse_dates=['ds']).tail(20) - def draw_feature_trend(feature_data_df, features): - # 画特征近一周的趋势图 - feature_df = feature_data_df[['ds','y']+features] - # 遍历X每一列,和yy画散点图 , - - for i, col in enumerate(features): - # try: - print(f'正在绘制第{i+1}个特征{col}与价格散点图...') - if col not in ['ds', 'y']: - fig, ax1 = plt.subplots(figsize=(10, 6)) - # 在第一个坐标轴上绘制数据 - sns.lineplot(data=feature_df, x='ds', y='y', ax=ax1, color='b') - ax1.set_xlabel('日期') - ax1.set_ylabel('y', color='b') - ax1.tick_params('y', colors='b') - # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠 - for j in range(1, len(feature_df), 2): - value = feature_df['y'].iloc[j] - date = feature_df['ds'].iloc[j] - offset = 1.001 - ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10) - # 创建第二个坐标轴 - ax2 = ax1.twinx() - # 在第二个坐标轴上绘制数据 - sns.lineplot(data=feature_df, x='ds', y=col, ax=ax2, color='r') - ax2.set_ylabel(col, color='r') - ax2.tick_params('y', colors='r') - # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠 - for j in range(0, len(feature_df), 2): - value = feature_df[col].iloc[j] - date = feature_df['ds'].iloc[j] - offset = 1.0003 - ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10) - # 添加标题 - plt.title(col) - # 设置横坐标为日期格式并自动调整 - locator = mdates.AutoDateLocator() - formatter = mdates.AutoDateFormatter(locator) - ax1.xaxis.set_major_locator(locator) - ax1.xaxis.set_major_formatter(formatter) - # 文件名特殊字符处理 - col = col.replace('*', '-') - col = col.replace(':', '-') - col = col.replace(r'/', '-') - plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png')) - content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png'))) - plt.close() - # except Exception as e: - # print(f'绘制第{i+1}个特征{col}与价格散点图时出错:{e}') - + feature_data_df = pd.read_csv(os.path.join( + dataset, '指标数据添加时间特征.csv'), parse_dates=['ds']).tail(20) - ### 添加标题 + def draw_feature_trend(feature_data_df, features): + # 画特征近一周的趋势图 + feature_df = feature_data_df[['ds', 'y']+features] + # 遍历X每一列,和yy画散点图 , + + for i, col in enumerate(features): + # try: + print(f'正在绘制第{i+1}个特征{col}与价格散点图...') + if col 
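# --- Illustrative sketch (editor's addition, not part of the diff): how the finished PDF is
# packed into the upload payload used above. The endpoint, header name and token handling
# are assumptions; only the base64 encoding of the report file mirrors the real code.
import base64
import requests

def upload_report(pdf_path: str, report_name: str, upload_url: str, payload: dict, token: str):
    with open(pdf_path, 'rb') as f:
        payload['data']['fileBase64'] = base64.b64encode(f.read()).decode('utf-8')
    payload['data']['fileName'] = report_name
    headers = {'Authorization': token}  # header name is an assumption
    resp = requests.post(upload_url, headers=headers, json=payload, timeout=300)
    resp.raise_for_status()
    return resp.json()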
not in ['ds', 'y']: + fig, ax1 = plt.subplots(figsize=(10, 6)) + # 在第一个坐标轴上绘制数据 + sns.lineplot(data=feature_df, x='ds', y='y', ax=ax1, color='b') + ax1.set_xlabel('日期') + ax1.set_ylabel('y', color='b') + ax1.tick_params('y', colors='b') + # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(1, len(feature_df), 2): + value = feature_df['y'].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax1.text(date, value * offset, str(round(value, 2)), + ha='center', va='bottom', color='b', fontsize=10) + # 创建第二个坐标轴 + ax2 = ax1.twinx() + # 在第二个坐标轴上绘制数据 + sns.lineplot(data=feature_df, x='ds', y=col, ax=ax2, color='r') + ax2.set_ylabel(col, color='r') + ax2.tick_params('y', colors='r') + # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(0, len(feature_df), 2): + value = feature_df[col].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.0003 + ax2.text(date, value * offset, str(round(value, 2)), + ha='center', va='bottom', color='r', fontsize=10) + # 添加标题 + plt.title(col) + # 设置横坐标为日期格式并自动调整 + locator = mdates.AutoDateLocator() + formatter = mdates.AutoDateFormatter(locator) + ax1.xaxis.set_major_locator(locator) + ax1.xaxis.set_major_formatter(formatter) + # 文件名特殊字符处理 + col = col.replace('*', '-') + col = col.replace(':', '-') + col = col.replace(r'/', '-') + plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png')) + content.append(Graphs.draw_img( + os.path.join(dataset, f'{col}与价格散点图.png'))) + plt.close() + # except Exception as e: + # print(f'绘制第{i+1}个特征{col}与价格散点图时出错:{e}') + + # 添加标题 content.append(Graphs.draw_title(f'{y}{time}预测报告')) - ### 预测结果 + # 预测结果 content.append(Graphs.draw_little_title('一、预测结果:')) # 添加历史走势及预测价格的走势图片 - content.append(Graphs.draw_img(os.path.join(dataset,'历史价格-预测值.png'))) + content.append(Graphs.draw_img(os.path.join(dataset, '历史价格-预测值.png'))) # 取df中y列为空的行 import pandas as pd - df = pd.read_csv(os.path.join(dataset,'predict.csv'),encoding='gbk') - df_true = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') # 获取预测日期对应的真实值 - df_true = df_true[['ds','y']] - eval_df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8') + df = pd.read_csv(os.path.join(dataset, 'predict.csv'), encoding='gbk') + df_true = pd.read_csv(os.path.join( + dataset, '指标数据添加时间特征.csv'), encoding='utf-8') # 获取预测日期对应的真实值 + df_true = df_true[['ds', 'y']] + eval_df = pd.read_csv(os.path.join( + dataset, 'model_evaluation.csv'), encoding='utf-8') # 按评估指标排序,取前五 fivemodels_list = eval_df['模型(Model)'].values # 列表形式,后面当作列名索引使用 # 取 fivemodels_list 和 ds 列 - df = df[['ds'] + fivemodels_list.tolist() ] + df = df[['ds'] + fivemodels_list.tolist()] # 拼接预测日期对应的真实值 df = pd.merge(df, df_true, on='ds', how='left') # 删除全部为nan的列 df = df.dropna(how='all', axis=1) # 选择除 'ds' 列外的数值列,并进行类型转换和四舍五入 - num_cols = [col for col in df.columns if col!= 'ds' and pd.api.types.is_numeric_dtype(df[col])] + num_cols = [col for col in df.columns if col != + 'ds' and pd.api.types.is_numeric_dtype(df[col])] for col in num_cols: df[col] = df[col].astype(float).round(2) # 添加最大值、最小值、平均值三列 @@ -2339,63 +2554,69 @@ def pp_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inputsi # 添加预测值表格 data = df.values.tolist() col_width = 500/len(df.columns) - content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_table(col_width, *data)) content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:')) - df = pd.read_csv(os.path.join(dataset,'testandpredict_groupby.csv'),encoding='utf-8') - df4 = df.copy() # 计算偏差率使用 + df = pd.read_csv(os.path.join( + dataset, 
'testandpredict_groupby.csv'), encoding='utf-8') + df4 = df.copy() # 计算偏差率使用 # 计算模型偏差率 - #计算各列对于y列的差值百分比 + # 计算各列对于y列的差值百分比 df3 = pd.DataFrame() # 存储偏差率 - + # 删除y列有空值的行 df4 = df4.dropna(subset=['y']) # # 删除有null的行 # df4 = df4.dropna() df3['ds'] = df4['ds'] for col in fivemodels_list: - df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100,2) + df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100, 2) # 找出决定系数前五的偏差率 df3 = df3[['ds']+fivemodels_list.tolist()][-inputsize:] # 找出上一预测区间的时间 stime = df3['ds'].iloc[0] etime = df3['ds'].iloc[-1] # 添加偏差率表格 - fivemodels = '、'.join(eval_df['模型(Model)'].values[:5]) # 字符串形式,后面写入字符串使用 - content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime}的偏差率(%)分别是:')) + fivemodels = '、'.join(eval_df['模型(Model)'].values[:5]) # 字符串形式,后面写入字符串使用 + content.append(Graphs.draw_text( + f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime}的偏差率(%)分别是:')) # # 添加偏差率表格 df3 = df3.T df3 = df3.reset_index() data = df3.values.tolist() col_width = 500/len(df3.columns) - content.append(Graphs.draw_table(col_width,*data)) - + content.append(Graphs.draw_table(col_width, *data)) content.append(Graphs.draw_little_title('三、预测过程解析:')) - ### 特征、模型、参数配置 + # 特征、模型、参数配置 content.append(Graphs.draw_little_title('模型选择:')) - content.append(Graphs.draw_text(f'本次预测使用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型:')) + content.append(Graphs.draw_text( + f'本次预测使用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型:')) content.append(Graphs.draw_text(f'使用40天的数据预测未来{inputsize}天的数据。')) content.append(Graphs.draw_little_title('指标情况:')) - with open(os.path.join(dataset,'特征频度统计.txt'),encoding='utf-8') as f: + with open(os.path.join(dataset, '特征频度统计.txt'), encoding='utf-8') as f: for line in f.readlines(): content.append(Graphs.draw_text(line)) - data = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') # 计算相关系数用 - df_zhibiaofenlei = loadcsv(os.path.join(dataset,'特征处理后的指标名称及分类.csv')) # 气泡图用 + data = pd.read_csv(os.path.join(dataset, '指标数据添加时间特征.csv'), + encoding='utf-8') # 计算相关系数用 + df_zhibiaofenlei = loadcsv(os.path.join( + dataset, '特征处理后的指标名称及分类.csv')) # 气泡图用 df_zhibiaoshuju = data.copy() # 气泡图用 # 绘制特征相关气泡图 - + grouped = df_zhibiaofenlei.groupby('指标分类') grouped_corr = pd.DataFrame(columns=['指标分类', '指标数量', '相关性总和']) - + content.append(Graphs.draw_little_title('按指标分类分别与预测目标进行皮尔逊相关系数分析:')) - content.append(Graphs.draw_text('''皮尔逊相关系数说明:''')) - content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。''')) + content.append(Graphs.draw_text('''皮尔逊相关系数说明:''')) + content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。''')) content.append(Graphs.draw_text(''' - 相关系数为1:表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。''')) - content.append(Graphs.draw_text('''相关系数为-1:表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的''')) - content.append(Graphs.draw_text('''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。''')) + 相关系数为1:表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。''')) + content.append(Graphs.draw_text( + '''相关系数为-1:表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的''')) + content.append(Graphs.draw_text( + '''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。''')) for name, group in grouped: cols = group['指标名称'].tolist() logger.info(f'开始绘制{name}类指标的相关性直方图') @@ -2404,69 +2625,84 @@ def pp_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inputsi correlation_matrix = df_zhibiaoshuju[feature_names].corr()['y'] # 绘制特征相关性直方分布图 - plt.figure(figsize=(10,8)) - sns.histplot(correlation_matrix.values.flatten(), 
bins=20, kde=True, color='skyblue') + plt.figure(figsize=(10, 8)) + sns.histplot(correlation_matrix.values.flatten(), + bins=20, kde=True, color='skyblue') plt.title(f'{name}类指标(共{len(cols_subset)}个)相关性直方分布图') plt.xlabel('相关系数') plt.ylabel('频数') - plt.savefig(os.path.join(dataset, f'{name}类指标相关性直方分布图.png'), bbox_inches='tight') + plt.savefig(os.path.join( + dataset, f'{name}类指标相关性直方分布图.png'), bbox_inches='tight') plt.close() - content.append(Graphs.draw_img(os.path.join(dataset,f'{name}类指标相关性直方分布图.png'))) - content.append(Graphs.draw_text(f'{name}类指标(共{len(cols_subset)}个)的相关性直方分布图如上所示。')) + content.append(Graphs.draw_img( + os.path.join(dataset, f'{name}类指标相关性直方分布图.png'))) + content.append(Graphs.draw_text( + f'{name}类指标(共{len(cols_subset)}个)的相关性直方分布图如上所示。')) # 相关性大于0的特征 - positive_corr_features = correlation_matrix[correlation_matrix > 0].sort_values(ascending=False).index.tolist()[1:] - - print(f'{name}下正相关的特征值有:',positive_corr_features) + positive_corr_features = correlation_matrix[correlation_matrix > 0].sort_values( + ascending=False).index.tolist()[1:] + + print(f'{name}下正相关的特征值有:', positive_corr_features) if len(positive_corr_features) > 5: positive_corr_features = positive_corr_features[0:5] - content.append(Graphs.draw_text(f'{name}类指标中,与预测目标y正相关前五的特征有:{positive_corr_features}')) + content.append(Graphs.draw_text( + f'{name}类指标中,与预测目标y正相关前五的特征有:{positive_corr_features}')) draw_feature_trend(feature_data_df, positive_corr_features) elif len(positive_corr_features) == 0: pass else: positive_corr_features = positive_corr_features - content.append(Graphs.draw_text(f'其中,与预测目标y正相关的特征有:{positive_corr_features}')) + content.append(Graphs.draw_text( + f'其中,与预测目标y正相关的特征有:{positive_corr_features}')) draw_feature_trend(feature_data_df, positive_corr_features) - + # 相关性小于0的特征 - negative_corr_features = correlation_matrix[correlation_matrix < 0].sort_values(ascending=True).index.tolist() - - print(f'{name}下负相关的特征值有:',negative_corr_features) + negative_corr_features = correlation_matrix[correlation_matrix < 0].sort_values( + ascending=True).index.tolist() + + print(f'{name}下负相关的特征值有:', negative_corr_features) if len(negative_corr_features) > 5: negative_corr_features = negative_corr_features[:5] - content.append(Graphs.draw_text(f'与预测目标y负相关前五的特征有:{negative_corr_features}')) + content.append(Graphs.draw_text( + f'与预测目标y负相关前五的特征有:{negative_corr_features}')) draw_feature_trend(feature_data_df, negative_corr_features) elif len(negative_corr_features) == 0: pass else: - content.append(Graphs.draw_text(f'{name}类指标中,与预测目标y负相关的特征有:{negative_corr_features}')) + content.append(Graphs.draw_text( + f'{name}类指标中,与预测目标y负相关的特征有:{negative_corr_features}')) draw_feature_trend(feature_data_df, negative_corr_features) - - + # 计算correlation_sum 第一行的相关性的绝对值的总和 correlation_sum = correlation_matrix.abs().sum() logger.info(f'{name}类指标的相关性总和为:{correlation_sum}') # 分组的相关性总和拼接到grouped_corr - goup_corr = pd.DataFrame({'指标分类': [name], '指标数量': [len(cols_subset)], '相关性总和': [correlation_sum]}) - grouped_corr = pd.concat([grouped_corr, goup_corr], axis=0, ignore_index=True) + goup_corr = pd.DataFrame( + {'指标分类': [name], '指标数量': [len(cols_subset)], '相关性总和': [correlation_sum]}) + grouped_corr = pd.concat( + [grouped_corr, goup_corr], axis=0, ignore_index=True) # 绘制相关性总和的气泡图 logger.info(f'开始绘制相关性总和的气泡图') plt.figure(figsize=(10, 10)) - sns.scatterplot(data=grouped_corr, x='相关性总和', y='指标数量', size='相关性总和', sizes=(grouped_corr['相关性总和'].min()*5, grouped_corr['相关性总和'].max()*5), hue='指标分类', palette='viridis') + 
sns.scatterplot(data=grouped_corr, x='相关性总和', y='指标数量', size='相关性总和', sizes=( + grouped_corr['相关性总和'].min()*5, grouped_corr['相关性总和'].max()*5), hue='指标分类', palette='viridis') plt.title('指标分类相关性总和的气泡图') plt.ylabel('数量') - plt.savefig(os.path.join(dataset, '指标分类相关性总和的气泡图.png'), bbox_inches='tight') + plt.savefig(os.path.join(dataset, '指标分类相关性总和的气泡图.png'), + bbox_inches='tight') plt.close() - content.append(Graphs.draw_img(os.path.join(dataset,'指标分类相关性总和的气泡图.png'))) - content.append(Graphs.draw_text('气泡图中,横轴为指标分类,纵轴为指标分类下的特征数量,气泡的面积越大表示该分类中特征的相关系数和越大。')) + content.append(Graphs.draw_img(os.path.join(dataset, '指标分类相关性总和的气泡图.png'))) + content.append(Graphs.draw_text( + '气泡图中,横轴为指标分类,纵轴为指标分类下的特征数量,气泡的面积越大表示该分类中特征的相关系数和越大。')) logger.info(f'绘制相关性总和的气泡图结束') - - content.append(Graphs.draw_little_title('模型选择:')) - content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练拟合,通过评估指标MAE从小到大排列,前5个模型的简介如下:')) - ### 读取模型简介 - with open(os.path.join(dataset,'model_introduction.txt'), 'r', encoding='utf-8') as f: + content.append(Graphs.draw_little_title('模型选择:')) + content.append(Graphs.draw_text( + f'预测使用了{num_models}个模型进行训练拟合,通过评估指标MAE从小到大排列,前5个模型的简介如下:')) + + # 读取模型简介 + with open(os.path.join(dataset, 'model_introduction.txt'), 'r', encoding='utf-8') as f: for line in f: line_split = line.strip().split('--') if line_split[0] in fivemodels_list: @@ -2474,8 +2710,9 @@ def pp_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inputsi content.append(Graphs.draw_text(introduction)) content.append(Graphs.draw_little_title('模型评估:')) - - df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8') + + df = pd.read_csv(os.path.join( + dataset, 'model_evaluation.csv'), encoding='utf-8') # 判断 df 的数值列转为float for col in eval_df.columns: if col not in ['模型(Model)']: @@ -2488,32 +2725,36 @@ def pp_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inputsi eval_df = eval_df.T data = eval_df.values.tolist() col_width = 500/len(eval_df.columns) - content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_table(col_width, *data)) content.append(Graphs.draw_text('评估指标释义:')) - content.append(Graphs.draw_text('1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) - content.append(Graphs.draw_text('2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) - content.append(Graphs.draw_text('3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text( + '1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text( + '2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text( + '3. 
平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) content.append(Graphs.draw_text('模型拟合:')) - content.append(Graphs.draw_img(os.path.join(dataset,'预测值与真实值对比图.png'))) + content.append(Graphs.draw_img(os.path.join(dataset, '预测值与真实值对比图.png'))) # 附1,特征列表 content.append(Graphs.draw_little_title('附1、特征列表:')) - df_fuyi = pd.read_csv(os.path.join(dataset,'特征频度统计.csv'),encoding='utf-8') + df_fuyi = pd.read_csv(os.path.join( + dataset, '特征频度统计.csv'), encoding='utf-8') for col in df_fuyi.columns: fuyi = df_fuyi[col] fuyi = fuyi.dropna() content.append(Graphs.draw_text(f'{col}:')) for i in range(len(fuyi)): content.append(Graphs.draw_text(f'{i+1}、{fuyi[i]}')) - - ### 生成pdf文件 - doc = SimpleDocTemplate(os.path.join(dataset,reportname), pagesize=letter) + + # 生成pdf文件 + doc = SimpleDocTemplate(os.path.join(dataset, reportname), pagesize=letter) # doc = SimpleDocTemplate(os.path.join(dataset,'reportname.pdf'), pagesize=letter) doc.build(content) # pdf 上传到数字化信息平台 try: if is_update_report: - with open(os.path.join(dataset,reportname), 'rb') as f: + with open(os.path.join(dataset, reportname), 'rb') as f: base64_data = base64.b64encode(f.read()).decode('utf-8') upload_data["data"]["fileBase64"] = base64_data upload_data["data"]["fileName"] = reportname @@ -2522,15 +2763,16 @@ def pp_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inputsi except TimeoutError as e: print(f"请求超时: {e}") -def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inputsize=5,dataset='dataset',time = '2024-07-30',reportname='report.pdf'): + +def pp_export_pdf_v1(num_indicators=475, num_models=21, num_dayindicator=202, inputsize=5, dataset='dataset', time='2024-07-30', reportname='report.pdf'): global y # 创建内容对应的空列表 content = list() - ### 添加标题 + # 添加标题 content.append(Graphs.draw_title(f'{y}{time}预测报告')) - - ### 预测结果 + + # 预测结果 content.append(Graphs.draw_little_title('一、预测结果:')) # 添加图片 # 找出后缀是历史价格-预测值.png的图片 @@ -2538,24 +2780,27 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu # imgs = glob.glob(os.path.join(dataset,'*历史价格-预测值.png')) # for img in imgs: # content.append(Graphs.draw_img(img)) - content.append(Graphs.draw_img(os.path.join(dataset,'历史价格-预测值.png'))) + content.append(Graphs.draw_img(os.path.join(dataset, '历史价格-预测值.png'))) # 取df中y列为空的行 import pandas as pd - df = pd.read_csv(os.path.join(dataset,'predict.csv'),encoding='gbk') - df_true = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') # 获取预测日期对应的真实值 - df_true = df_true[['ds','y']] - eval_df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8') + df = pd.read_csv(os.path.join(dataset, 'predict.csv'), encoding='gbk') + df_true = pd.read_csv(os.path.join( + dataset, '指标数据添加时间特征.csv'), encoding='utf-8') # 获取预测日期对应的真实值 + df_true = df_true[['ds', 'y']] + eval_df = pd.read_csv(os.path.join( + dataset, 'model_evaluation.csv'), encoding='utf-8') # 按评估指标排序,取前五 fivemodels_list = eval_df['模型(Model)'].values # 列表形式,后面当作列名索引使用 # 取 fivemodels_list 和 ds 列 - df = df[['ds'] + fivemodels_list.tolist() ] + df = df[['ds'] + fivemodels_list.tolist()] # 拼接预测日期对应的真实值 df = pd.merge(df, df_true, on='ds', how='left') # 删除全部为nan的列 df = df.dropna(how='all', axis=1) # 选择除 'ds' 列外的数值列,并进行类型转换和四舍五入 - num_cols = [col for col in df.columns if col!= 'ds' and pd.api.types.is_numeric_dtype(df[col])] + num_cols = [col for col in df.columns if col != + 'ds' and pd.api.types.is_numeric_dtype(df[col])] for col in num_cols: df[col] = df[col].astype(float).round(2) # 添加最大值、最小值、平均值三列 @@ -2569,53 
+2814,54 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu # 添加预测值表格 data = df.values.tolist() col_width = 500/len(df.columns) - content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_table(col_width, *data)) content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:')) - df = pd.read_csv(os.path.join(dataset,'testandpredict_groupby.csv'),encoding='utf-8') - df4 = df.copy() # 计算偏差率使用 + df = pd.read_csv(os.path.join( + dataset, 'testandpredict_groupby.csv'), encoding='utf-8') + df4 = df.copy() # 计算偏差率使用 # 计算模型偏差率 - #计算各列对于y列的差值百分比 + # 计算各列对于y列的差值百分比 df3 = pd.DataFrame() # 存储偏差率 - + # 删除有null的行 df4 = df4.dropna() df3['ds'] = df4['ds'] for col in df.columns: - if col not in ['y','ds','index']: - df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100,2) + if col not in ['y', 'ds', 'index']: + df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100, 2) # 找出决定系数前五的偏差率 df3 = df3[['ds']+fivemodels_list.tolist()][-inputsize:] # 找出上一预测区间的时间 stime = df3['ds'].iloc[0] etime = df3['ds'].iloc[-1] # 添加偏差率表格 - fivemodels = '、'.join(eval_df['模型(Model)'].values[:5]) # 字符串形式,后面写入字符串使用 - content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime}的偏差率(%)分别是:')) + fivemodels = '、'.join(eval_df['模型(Model)'].values[:5]) # 字符串形式,后面写入字符串使用 + content.append(Graphs.draw_text( + f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime}的偏差率(%)分别是:')) # # 添加偏差率表格 df3 = df3.T df3 = df3.reset_index() data = df3.values.tolist() col_width = 500/len(df3.columns) - content.append(Graphs.draw_table(col_width,*data)) - + content.append(Graphs.draw_table(col_width, *data)) content.append(Graphs.draw_little_title('三、预测过程解析:')) - ### 特征、模型、参数配置 + # 特征、模型、参数配置 content.append(Graphs.draw_little_title('模型选择:')) - content.append(Graphs.draw_text(f'本次预测使用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型:')) + content.append(Graphs.draw_text( + f'本次预测使用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型:')) content.append(Graphs.draw_text(f'使用40天的数据预测未来{inputsize}天的数据。')) content.append(Graphs.draw_little_title('指标情况:')) - with open(os.path.join(dataset,'特征频度统计.txt'),encoding='utf-8') as f: + with open(os.path.join(dataset, '特征频度统计.txt'), encoding='utf-8') as f: for line in f.readlines(): content.append(Graphs.draw_text(line)) - - - - ### 特征工程 + + # 特征工程 # 计算特征相关性 # 读取数据 from scipy.stats import spearmanr - data = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') + data = pd.read_csv(os.path.join( + dataset, '指标数据添加时间特征.csv'), encoding='utf-8') # 重命名预测列 data.rename(columns={y: 'y'}, inplace=True) # 修改 data['ds'] = pd.to_datetime(data['ds']) # 修改 @@ -2625,24 +2871,26 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu correlation_df = pd.DataFrame(columns=['Feature', 'Correlation']) # 计算各特征与目标列的皮尔逊相关系数,并保存到新的 DataFrame 中 for col in data.columns: - if col!= 'y': + if col != 'y': pearson_correlation = np.corrcoef(data[col], data['y'])[0, 1] spearman_correlation, _ = spearmanr(data[col], data['y']) - new_row = {'Feature': col, 'Pearson_Correlation': round(pearson_correlation,3), 'Spearman_Correlation': round(spearman_correlation,2)} + new_row = {'Feature': col, 'Pearson_Correlation': round( + pearson_correlation, 3), 'Spearman_Correlation': round(spearman_correlation, 2)} correlation_df = correlation_df._append(new_row, ignore_index=True) # 删除空列 correlation_df.drop('Correlation', axis=1, inplace=True) correlation_df.dropna(inplace=True) - 
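# --- Illustrative sketch (editor's addition, not part of the diff): the per-feature Pearson
# and Spearman coefficients against the target, as computed in pp_export_pdf_v1. Only
# numeric columns are considered; the function name is a placeholder.
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

def feature_correlations(data: pd.DataFrame) -> pd.DataFrame:
    rows = []
    numeric = data.select_dtypes(include=[np.number])
    for col in numeric.columns:
        if col == 'y':
            continue
        pearson = np.corrcoef(numeric[col], numeric['y'])[0, 1]
        spearman, _ = spearmanr(numeric[col], numeric['y'])
        rows.append({'Feature': col,
                     'Pearson_Correlation': round(float(pearson), 3),
                     'Spearman_Correlation': round(float(spearman), 2)})
    return pd.DataFrame(rows).dropna()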
correlation_df.to_csv(os.path.join(dataset,'指标相关性分析.csv'), index=False) + correlation_df.to_csv(os.path.join(dataset, '指标相关性分析.csv'), index=False) data = correlation_df['Pearson_Correlation'].values.tolist() # 生成 -1 到 1 的 20 个区间 bins = np.linspace(-1, 1, 21) # 计算每个区间的统计数(这里是区间内数据的数量) - hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)] + hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) + for i in range(len(bins) - 1)] - #设置画布大小 + # 设置画布大小 plt.figure(figsize=(10, 6)) # 绘制直方图 plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0])) @@ -2654,12 +2902,12 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu plt.savefig(os.path.join(dataset, '皮尔逊相关性系数.png')) plt.close() - - #设置画布大小 + # 设置画布大小 plt.figure(figsize=(10, 6)) data = correlation_df['Spearman_Correlation'].values.tolist() # 计算每个区间的统计数(这里是区间内数据的数量) - hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)] + hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) + for i in range(len(bins) - 1)] # 绘制直方图 plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0])) @@ -2672,18 +2920,20 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu plt.close() content.append(Graphs.draw_text(f'指标相关性分析--皮尔逊相关系数:')) # 皮尔逊正相关 不相关 负相关 的表格 - content.append(Graphs.draw_img(os.path.join(dataset,'皮尔逊相关性系数.png'))) - content.append(Graphs.draw_text('''皮尔逊相关系数说明:''')) - content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。''')) + content.append(Graphs.draw_img(os.path.join(dataset, '皮尔逊相关性系数.png'))) + content.append(Graphs.draw_text('''皮尔逊相关系数说明:''')) + content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。''')) content.append(Graphs.draw_text(''' - 相关系数为1:表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。''')) - content.append(Graphs.draw_text('''当前特征中正相关前十的有:''')) - top10_columns = correlation_df.sort_values(by='Pearson_Correlation',ascending=False).head(10)['Feature'].to_list() + 相关系数为1:表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。''')) + content.append(Graphs.draw_text('''当前特征中正相关前十的有:''')) + top10_columns = correlation_df.sort_values( + by='Pearson_Correlation', ascending=False).head(10)['Feature'].to_list() top10 = ','.join(top10_columns) - content.append(Graphs.draw_text(f'''{top10}''')) + content.append(Graphs.draw_text(f'''{top10}''')) # 获取特征的近一月值 - feature_data_df = pd.read_csv(os.path.join(dataset,'填充后的特征数据.csv'), parse_dates=['ds']).tail(20) - feature_df = feature_data_df[['ds','y']+top10_columns] + feature_data_df = pd.read_csv(os.path.join( + dataset, '填充后的特征数据.csv'), parse_dates=['ds']).tail(20) + feature_df = feature_data_df[['ds', 'y']+top10_columns] # feature_df['ds'] = pd.to_datetime(df['ds'], format = '%Y-%m-%d' ) # 遍历X每一列,和yy画散点图 , for i, col in enumerate(feature_df.columns): @@ -2696,11 +2946,12 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu ax1.set_ylabel('y', color='b') ax1.tick_params('y', colors='b') # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠 - for j in range(1,len(feature_df),2): + for j in range(1, len(feature_df), 2): value = feature_df['y'].iloc[j] date = feature_df['ds'].iloc[j] offset = 1.001 - ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10) + ax1.text(date, value * offset, str(round(value, 2)), + ha='center', va='bottom', color='b', fontsize=10) # 创建第二个坐标轴 ax2 = ax1.twinx() # 在第二个坐标轴上绘制数据 @@ -2708,11 +2959,12 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, 
num_dayindicator=202,inpu ax2.set_ylabel(col, color='r') ax2.tick_params('y', colors='r') # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠 - for j in range(0,len(feature_df),2): + for j in range(0, len(feature_df), 2): value = feature_df[col].iloc[j] date = feature_df['ds'].iloc[j] offset = 1.001 - ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10) + ax2.text(date, value * offset, str(round(value, 2)), + ha='center', va='bottom', color='r', fontsize=10) # 添加标题 plt.title(col) # 设置横坐标为日期格式并自动调整 @@ -2724,16 +2976,19 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu col = col.replace('*', '-') col = col.replace(':', '-') plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png')) - content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png'))) + content.append(Graphs.draw_img( + os.path.join(dataset, f'{col}与价格散点图.png'))) plt.close() - content.append(Graphs.draw_text('''相关系数为-1:表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的''')) - content.append(Graphs.draw_text('''当前特征中负相关前十的有:''')) - tail10_columns = correlation_df.sort_values(by='Pearson_Correlation',ascending=True).head(10)['Feature'].to_list() + content.append(Graphs.draw_text( + '''相关系数为-1:表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的''')) + content.append(Graphs.draw_text('''当前特征中负相关前十的有:''')) + tail10_columns = correlation_df.sort_values( + by='Pearson_Correlation', ascending=True).head(10)['Feature'].to_list() top10 = ','.join(tail10_columns) - content.append(Graphs.draw_text(f'''{top10}''')) + content.append(Graphs.draw_text(f'''{top10}''')) # 获取特征的近一周值 - feature_df = feature_data_df[['ds','y']+tail10_columns] + feature_df = feature_data_df[['ds', 'y']+tail10_columns] # 遍历X每一列,和yy画散点图 , for i, col in enumerate(feature_df.columns): print(f'正在绘制第{i+1}个特征{col}与价格散点图...') @@ -2746,11 +3001,12 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu ax1.tick_params('y', colors='b') # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠 for j in range(len(feature_df)): - if j%2 == 1: + if j % 2 == 1: value = feature_df['y'].iloc[j] date = feature_df['ds'].iloc[j] offset = 1.001 - ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10) + ax1.text(date, value * offset, str(round(value, 2)), + ha='center', va='bottom', color='b', fontsize=10) # 创建第二个坐标轴 ax2 = ax1.twinx() # 在第二个坐标轴上绘制数据 @@ -2758,11 +3014,12 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu ax2.set_ylabel(col, color='r') ax2.tick_params('y', colors='r') # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠 - for j in range(1,len(feature_df),2): + for j in range(1, len(feature_df), 2): value = feature_df[col].iloc[j] date = feature_df['ds'].iloc[j] offset = 1.001 - ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10) + ax2.text(date, value * offset, str(round(value, 2)), + ha='center', va='bottom', color='r', fontsize=10) # 添加标题 plt.title(col) # 设置横坐标为日期格式并自动调整 @@ -2774,30 +3031,37 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu col = col.replace('*', '-') col = col.replace(':', '-') plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png')) - content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png'))) + content.append(Graphs.draw_img( + os.path.join(dataset, f'{col}与价格散点图.png'))) plt.close() - content.append(Graphs.draw_text('''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。''')) + content.append(Graphs.draw_text( + 
+        '''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。'''))
     content.append(Graphs.draw_text(f'指标相关性分析--斯皮尔曼相关系数:'))
     # 皮尔逊正相关 不相关 负相关 的表格
-    content.append(Graphs.draw_img(os.path.join(dataset,'斯皮尔曼相关性系数.png')))
-    content.append(Graphs.draw_text('斯皮尔曼相关系数(Spearmans rank correlation coefficient)是一种用于衡量两个变量之间的单调关系(不一定是线性关系)的统计指标。'))
+    content.append(Graphs.draw_img(os.path.join(dataset, '斯皮尔曼相关性系数.png')))
+    content.append(Graphs.draw_text(
+        '斯皮尔曼相关系数(Spearmans rank correlation coefficient)是一种用于衡量两个变量之间的单调关系(不一定是线性关系)的统计指标。'))
     content.append(Graphs.draw_text('它的计算基于变量的秩次(即变量值的排序位置)而非变量的原始值。'))
     content.append(Graphs.draw_text('斯皮尔曼相关系数的取值范围在 -1 到 1 之间。'))
     content.append(Graphs.draw_text('当系数为 1 时,表示两个变量之间存在完全正的单调关系;'))
-    content.append(Graphs.draw_text('''当前特征中正单调关系前十的有:'''))
-    top10 = ','.join(correlation_df.sort_values(by='Spearman_Correlation',ascending=False).head(10)['Feature'])
-    content.append(Graphs.draw_text(f'''{top10}'''))
+    content.append(Graphs.draw_text('''当前特征中正单调关系前十的有:'''))
+    top10 = ','.join(correlation_df.sort_values(
+        by='Spearman_Correlation', ascending=False).head(10)['Feature'])
+    content.append(Graphs.draw_text(f'''{top10}'''))
     content.append(Graphs.draw_text('当系数为 -1 时,表示存在完全负的单调关系;'))
-    content.append(Graphs.draw_text('''当前特征中负单调关系前十的有:'''))
-    top10 = ','.join(correlation_df.sort_values(by='Spearman_Correlation',ascending=True).head(10)['Feature'])
-    content.append(Graphs.draw_text(f'''{top10}'''))
+    content.append(Graphs.draw_text('''当前特征中负单调关系前十的有:'''))
+    top10 = ','.join(correlation_df.sort_values(
+        by='Spearman_Correlation', ascending=True).head(10)['Feature'])
+    content.append(Graphs.draw_text(f'''{top10}'''))
     content.append(Graphs.draw_text('当系数为 0 时,表示两个变量之间不存在单调关系。'))
-    content.append(Graphs.draw_text('与皮尔逊相关系数相比,斯皮尔曼相关系数对于数据中的异常值不敏感,更适用于处理非线性关系或存在极端值的数据。'))
+    content.append(Graphs.draw_text(
+        '与皮尔逊相关系数相比,斯皮尔曼相关系数对于数据中的异常值不敏感,更适用于处理非线性关系或存在极端值的数据。'))
     content.append(Graphs.draw_little_title('模型选择:'))
-    content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练拟合,通过评估指标MAE从小到大排列,前5个模型的简介如下:'))
+    content.append(Graphs.draw_text(
+        f'预测使用了{num_models}个模型进行训练拟合,通过评估指标MAE从小到大排列,前5个模型的简介如下:'))
-    ### 读取模型简介
-    with open(os.path.join(dataset,'model_introduction.txt'), 'r', encoding='utf-8') as f:
+    # 读取模型简介
+    with open(os.path.join(dataset, 'model_introduction.txt'), 'r', encoding='utf-8') as f:
         for line in f:
             line_split = line.strip().split('--')
             if line_split[0] in fivemodels_list:
@@ -2805,8 +3069,9 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu
                 content.append(Graphs.draw_text(introduction))
     content.append(Graphs.draw_little_title('模型评估:'))
-
-    df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8')
+
+    df = pd.read_csv(os.path.join(
+        dataset, 'model_evaluation.csv'), encoding='utf-8')
     # 判断 df 的数值列转为float
     for col in eval_df.columns:
         if col not in ['模型(Model)']:
@@ -2822,34 +3087,38 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu
     # # 添加表格
     data = eval_df.values.tolist()
     col_width = 500/len(eval_df.columns)
-    content.append(Graphs.draw_table(col_width,*data))
+    content.append(Graphs.draw_table(col_width, *data))
     content.append(Graphs.draw_text('评估指标释义:'))
-    content.append(Graphs.draw_text('1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
-    content.append(Graphs.draw_text('2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
-    content.append(Graphs.draw_text('3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
+    content.append(Graphs.draw_text(
+        '1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
+    content.append(Graphs.draw_text(
+        '2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
+    content.append(Graphs.draw_text(
+        '3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。'))
     content.append(Graphs.draw_text('模型拟合:'))
     # 添加图片
-    content.append(Graphs.draw_img(os.path.join(dataset,'预测值与真实值对比图.png')))
-
+    content.append(Graphs.draw_img(os.path.join(dataset, '预测值与真实值对比图.png')))
+
     # 附1,特征列表
     content.append(Graphs.draw_little_title('附1、特征列表:'))
-    df_fuyi = pd.read_csv(os.path.join(dataset,'特征频度统计.csv'),encoding='utf-8')
+    df_fuyi = pd.read_csv(os.path.join(
+        dataset, '特征频度统计.csv'), encoding='utf-8')
     for col in df_fuyi.columns:
         fuyi = df_fuyi[col]
         fuyi = fuyi.dropna()
         content.append(Graphs.draw_text(f'{col}:'))
         for i in range(len(fuyi)):
             content.append(Graphs.draw_text(f'{i+1}、{fuyi[i]}'))
-
-    ### 生成pdf文件
-    doc = SimpleDocTemplate(os.path.join(dataset,reportname), pagesize=letter)
+
+    # 生成pdf文件
+    doc = SimpleDocTemplate(os.path.join(dataset, reportname), pagesize=letter)
     # doc = SimpleDocTemplate(os.path.join(dataset,'reportname.pdf'), pagesize=letter)
     doc.build(content)
     # pdf 上传到数字化信息平台
     # 读取pdf并转为base64
     try:
         if is_update_report:
-            with open(os.path.join(dataset,reportname), 'rb') as f:
+            with open(os.path.join(dataset, reportname), 'rb') as f:
                 base64_data = base64.b64encode(f.read()).decode('utf-8')
                 upload_data["data"]["fileBase64"] = base64_data
                 upload_data["data"]["fileName"] = reportname
@@ -2858,31 +3127,33 @@ def pp_export_pdf_v1(num_indicators=475,num_models=21, num_dayindicator=202,inpu
     except TimeoutError as e:
         print(f"请求超时: {e}")
+
 @exception_logger
-def tansuanli_export_pdf(num_indicators=475,num_models=22, num_dayindicator=202,inputsize=5,dataset='dataset',y='电碳价格',end_time='2024-07-30',reportname='tansuanli.pdf'):
+def tansuanli_export_pdf(num_indicators=475, num_models=22, num_dayindicator=202, inputsize=5, dataset='dataset', y='电碳价格', end_time='2024-07-30', reportname='tansuanli.pdf'):
     # 创建内容对应的空列表
     content = list()
-    ### 添加标题
+    # 添加标题
     content.append(Graphs.draw_title(f'{y}{end_time}预测报告'))
-    ### 预测结果
+    # 预测结果
     content.append(Graphs.draw_little_title('一、预测结果:'))
-    content.append(Graphs.draw_img(os.path.join(dataset,'历史价格-预测值.png')))
+    content.append(Graphs.draw_img(os.path.join(dataset, '历史价格-预测值.png')))
     # 取df中y列为空的行
     from lib.dataread import loadcsv
-    df = loadcsv(os.path.join(dataset,'predict.csv'))
-    df_true = loadcsv(os.path.join(dataset,'指标数据添加时间特征.csv'))  # 获取预测日期对应的真实值
-    df_true = df_true[['ds','y']]
-    eval_df = loadcsv(os.path.join(dataset,'model_evaluation.csv'))
+    df = loadcsv(os.path.join(dataset, 'predict.csv'))
+    df_true = loadcsv(os.path.join(dataset, '指标数据添加时间特征.csv'))  # 获取预测日期对应的真实值
+    df_true = df_true[['ds', 'y']]
+    eval_df = loadcsv(os.path.join(dataset, 'model_evaluation.csv'))
     # 按评估指标排序,取前五
     fivemodels_list = eval_df['模型(Model)'].values[:5]  # 列表形式,后面当作列名索引使用
     # 取 fivemodels_list 和 ds 列
-    df = df[['ds'] + fivemodels_list.tolist() ]
+    df = df[['ds'] + fivemodels_list.tolist()]
     # 拼接预测日期对应的真实值
     df = pd.merge(df, df_true, on='ds', how='left')
     # 删除全部为nan的列
     df = df.dropna(how='all', axis=1)
     # 选择除 'ds' 列外的数值列,并进行类型转换和四舍五入
-    num_cols = [col for col in df.columns if col!= 'ds' and pd.api.types.is_numeric_dtype(df[col])]
+    num_cols = [col for col in df.columns if col !=
+                'ds' and pd.api.types.is_numeric_dtype(df[col])]
     for col in num_cols:
         df[col] = df[col].astype(float).round(2)
     # 添加预测每日的最大值、最小值、平均值三列
@@ -2895,7 +3166,8 @@ def tansuanli_export_pdf(num_indicators=475,num_models=22, num_dayindicator=202,
     max_values = df[num_cols].max(axis=0)
     min_values = df[num_cols].min(axis=0)
     # 创建一个新的 DataFrame 来存储统计行
-    stats_row = pd.DataFrame([mean_values, max_values, min_values], index=[0,1,2])
+    stats_row = pd.DataFrame(
+        [mean_values, max_values, min_values], index=[0, 1, 2])
     stats_row['ds'] = ['平均值', '最大值', '最小值']
     # 将统计行添加到原始 DataFrame
     df = pd.concat([df, stats_row], axis=0)
@@ -2908,82 +3180,92 @@ def tansuanli_export_pdf(num_indicators=475,num_models=22, num_dayindicator=202,
     # 添加预测值表格
     data = df.values.tolist()
     col_width = 500/len(df.columns)
-    content.append(Graphs.draw_table(col_width,*data))
+    content.append(Graphs.draw_table(col_width, *data))
     content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:'))
-    df = loadcsv(os.path.join(dataset,'testandpredict_groupby.csv'))
-    df4 = df.copy() # 计算偏差率使用
+    df = loadcsv(os.path.join(dataset, 'testandpredict_groupby.csv'))
+    df4 = df.copy()  # 计算偏差率使用
     # 计算模型偏差率
-    #计算各列对于y列的差值百分比
+    # 计算各列对于y列的差值百分比
     df3 = pd.DataFrame()  # 存储偏差率
-
+
     # 删除有null的行
     df4 = df4.dropna()
     df3['ds'] = df4['ds']
     for col in df.columns:
-        if col not in ['y','ds','index']:
-            df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100,2)
+        if col not in ['y', 'ds', 'index']:
+            df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100, 2)
     # 找出决定系数前五的偏差率
     df3 = df3[['ds']+fivemodels_list.tolist()][-inputsize:]
     # 找出上一预测区间的时间
     stime = df3['ds'].iloc[0]
     etime = df3['ds'].iloc[-1]
     # 添加偏差率表格
-    fivemodels = '、'.join(eval_df['模型(Model)'].values[:5]) # 字符串形式,后面写入字符串使用
-    content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime}的偏差率(%)分别是:'))
+    fivemodels = '、'.join(eval_df['模型(Model)'].values[:5])  # 字符串形式,后面写入字符串使用
+    content.append(Graphs.draw_text(
+        f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime}的偏差率(%)分别是:'))
     # # 添加偏差率表格
     df3 = df3.T
     df3 = df3.reset_index()
     df3 = df3.T
     data = df3.values.tolist()
     col_width = 500/len(df3.columns)
-    content.append(Graphs.draw_table(col_width,*data))
+    content.append(Graphs.draw_table(col_width, *data))
     content.append(Graphs.draw_little_title('三、预测过程解析:'))
-    ### 特征、模型、参数配置
-    content.append(Graphs.draw_text(f'本次预测使用了给定的28个指标(列名重复的排除后)作为特征,应用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型。'))
+    # 特征、模型、参数配置
+    content.append(Graphs.draw_text(
+        f'本次预测使用了给定的28个指标(列名重复的排除后)作为特征,应用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型。'))
     content.append(Graphs.draw_text(f'使用10天的数据预测未来{inputsize}天的数据。'))
     content.append(Graphs.draw_little_title('指标情况:'))
     content.append(Graphs.draw_text(' 指标频度包括'))
     # 添加频度统计表格
-    pindu_df = loadcsv(os.path.join(dataset,'特征频度统计.csv'))
+    pindu_df = loadcsv(os.path.join(dataset, '特征频度统计.csv'))
     pindu_df.fillna('-', inplace=True)
     pindu_df = pindu_df.T
     pindu_df = pindu_df.reset_index()
     pindu_df = pindu_df.T
     data = pindu_df.values.tolist()
     col_width = 500/len(pindu_df.columns)
-    content.append(Graphs.draw_table(col_width,*data))
-    content.append(Graphs.draw_text(f'从指标特征的频度信息来看,月度指标占比最高,而我们需要进行预测的指标为日度的,所以本数据集中月度和周度指标需要进行插值处理。'))
+    content.append(Graphs.draw_table(col_width, *data))
+    content.append(Graphs.draw_text(
+        f'从指标特征的频度信息来看,月度指标占比最高,而我们需要进行预测的指标为日度的,所以本数据集中月度和周度指标需要进行插值处理。'))
     content.append(Graphs.draw_text(' 数据特征工程:'))
     content.append(Graphs.draw_text('1. 数据日期排序,新日期在最后'))
     content.append(Graphs.draw_text('2. 删除空列,特征数据列没有值,就删除'))
     content.append(Graphs.draw_text('3. 周度、月度特征填充为日度数据,填充规则:'))
-    content.append(Graphs.draw_text(' -- 向后填充,举例:假设周五出现一个周度指标数据,那么在这之前的数据用上周五的数据'))
-    content.append(Graphs.draw_text(' -- 向前填充,举例:采集数据开始日期为2018年1月1日,那么周度数据可能是2018年1月3日,那么3日的数据向前填充,使1日2日都有数值'))
+    content.append(Graphs.draw_text(
+        ' -- 向后填充,举例:假设周五出现一个周度指标数据,那么在这之前的数据用上周五的数据'))
+    content.append(Graphs.draw_text(
+        ' -- 向前填充,举例:采集数据开始日期为2018年1月1日,那么周度数据可能是2018年1月3日,那么3日的数据向前填充,使1日2日都有数值'))
     content.append(Graphs.draw_text(f'以上处理其实并不合理,但结合我们想要的结果,我们选择了这种处理方式。'))
-    content.append(Graphs.draw_text(f'一般来讲,指标数据的频度和预测列是一致的,我们可以考虑预测月度的目标列,不过这样的话,月度数据太少了,不足以用来训练模型。'))
-
-    ### 特征工程
+    content.append(Graphs.draw_text(
+        f'一般来讲,指标数据的频度和预测列是一致的,我们可以考虑预测月度的目标列,不过这样的话,月度数据太少了,不足以用来训练模型。'))
+
+    # 特征工程
     # 预测列分析
     content.append(Graphs.draw_text(' 电碳价格自相关ACF和偏自相关PACF分析:'))
-    content.append(Graphs.draw_img(os.path.join(dataset,'指标数据自相关图.png')))
-    content.append(Graphs.draw_img(os.path.join(dataset,'指标数据偏自相关图.png')))
+    content.append(Graphs.draw_img(os.path.join(dataset, '指标数据自相关图.png')))
+    content.append(Graphs.draw_img(os.path.join(dataset, '指标数据偏自相关图.png')))
     content.append(Graphs.draw_text(' 解读:'))
-    content.append(Graphs.draw_text(' 自相关函数的取值范围为 [-1, 1]。正值表示信号在不同时间点之间具有正相关性,负值表示信号具有负相关性,而 0 表示信号在不同时间点之间不相关。 '))
-    content.append(Graphs.draw_text(' 偏自相关函数(PACF)则是在控制了中间的滞后项影响后,特定滞后项与当前项的相关性。 '))
-    content.append(Graphs.draw_text(' 当前目标列表现出的 ACF 呈现出拖尾的特征,而 PACF 在1个滞后阶数后截尾,这说明目标值适合使用自回归(AR)模型 '))
+    content.append(Graphs.draw_text(
+        ' 自相关函数的取值范围为 [-1, 1]。正值表示信号在不同时间点之间具有正相关性,负值表示信号具有负相关性,而 0 表示信号在不同时间点之间不相关。 '))
+    content.append(Graphs.draw_text(
+        ' 偏自相关函数(PACF)则是在控制了中间的滞后项影响后,特定滞后项与当前项的相关性。 '))
+    content.append(Graphs.draw_text(
+        ' 当前目标列表现出的 ACF 呈现出拖尾的特征,而 PACF 在1个滞后阶数后截尾,这说明目标值适合使用自回归(AR)模型 '))
     content.append(Graphs.draw_text(' 数据特征可视化分析:'))
     # 找出所有后缀为散点图.png的文件
     import glob
-    scatter_files = glob.glob(os.path.join(dataset,'*散点图.png'))
+    scatter_files = glob.glob(os.path.join(dataset, '*散点图.png'))
     for file in scatter_files:
         content.append(Graphs.draw_img(file))
     content.append(Graphs.draw_text(' 解读:'))
-    content.append(Graphs.draw_text(' 观察特征与目标列的散点图,我们可以直观的感受到特征与我们要预测的列没有明显的趋势相关,需要考虑选取的特征合理。 '))
+    content.append(Graphs.draw_text(
+        ' 观察特征与目标列的散点图,我们可以直观的感受到特征与我们要预测的列没有明显的趋势相关,需要考虑选取的特征合理。 '))
     content.append(Graphs.draw_text(' 数据特征相关性分析:'))
     # 计算特征相关性
     # 读取数据
     from scipy.stats import spearmanr
-    data = loadcsv(os.path.join(dataset,'指标数据添加时间特征.csv'))
+    data = loadcsv(os.path.join(dataset, '指标数据添加时间特征.csv'))
     # 重命名预测列
     data.rename(columns={y: 'y'}, inplace=True)  # 修改
     from lib.tools import dateConvert
@@ -2994,22 +3276,24 @@ def tansuanli_export_pdf(num_indicators=475,num_models=22, num_dayindicator=202,
     correlation_df = pd.DataFrame(columns=['Feature', 'Correlation'])
     # 计算各特征与目标列的皮尔逊相关系数,并保存到新的 DataFrame 中
     for col in data.columns:
-        if col!= 'y':
+        if col != 'y':
            pearson_correlation = np.corrcoef(data[col], data['y'])[0, 1]
            spearman_correlation, _ = spearmanr(data[col], data['y'])
-            new_row = {'Feature': col, 'Pearson_Correlation': round(pearson_correlation,3), 'Spearman_Correlation': round(spearman_correlation,2)}
+            new_row = {'Feature': col, 'Pearson_Correlation': round(
+                pearson_correlation, 3), 'Spearman_Correlation': round(spearman_correlation, 2)}
            correlation_df = correlation_df._append(new_row, ignore_index=True)
     # 删除空列
     correlation_df.drop('Correlation', axis=1, inplace=True)
     correlation_df.dropna(inplace=True)
-    correlation_df.to_csv(os.path.join(dataset,'指标相关性分析.csv'), index=False)
+    correlation_df.to_csv(os.path.join(dataset, '指标相关性分析.csv'), index=False)
     data = correlation_df['Pearson_Correlation'].values.tolist()
     # 生成 -1 到 1 的 20 个区间
     bins = np.linspace(-1, 1, 21)
     # 计算每个区间的统计数(这里是区间内数据的数量)
-    hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)]
-    #设置画布大小
+    hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1]))
+                   for i in range(len(bins) - 1)]
+    # 设置画布大小
     plt.figure(figsize=(10, 6))
     # 绘制直方图
     plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0]))
@@ -3019,11 +3303,12 @@ def tansuanli_export_pdf(num_indicators=475,num_models=22, num_dayindicator=202,
     plt.ylabel('统计数')
     plt.savefig(os.path.join(dataset, '皮尔逊相关性系数.png'))
     plt.close()
-    #设置画布大小
+    # 设置画布大小
     plt.figure(figsize=(10, 6))
     data = correlation_df['Spearman_Correlation'].values.tolist()
     # 计算每个区间的统计数(这里是区间内数据的数量)
-    hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)]
+    hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1]))
+                   for i in range(len(bins) - 1)]
     # 绘制直方图
     plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0]))
     # 添加标题和坐标轴标签
@@ -3034,40 +3319,48 @@ def tansuanli_export_pdf(num_indicators=475,num_models=22, num_dayindicator=202,
     plt.close()
     content.append(Graphs.draw_text(f'指标相关性分析--皮尔逊相关系数:'))
     # 皮尔逊正相关 不相关 负相关 的表格
-    content.append(Graphs.draw_img(os.path.join(dataset,'皮尔逊相关性系数.png')))
-    content.append(Graphs.draw_text('''皮尔逊相关系数说明:'''))
-    content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。'''))
+    content.append(Graphs.draw_img(os.path.join(dataset, '皮尔逊相关性系数.png')))
+    content.append(Graphs.draw_text('''皮尔逊相关系数说明:'''))
+    content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。'''))
     content.append(Graphs.draw_text('''
-    相关系数为1:表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。'''))
-    content.append(Graphs.draw_text('''当前特征中正相关前十的有:'''))
-    top10 = ','.join(correlation_df.sort_values(by='Pearson_Correlation',ascending=False).head(10)['Feature'])
-    content.append(Graphs.draw_text(f'''{top10}'''))
-    content.append(Graphs.draw_text('''相关系数为-1:表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的'''))
-    content.append(Graphs.draw_text('''当前特征中负相关前十的有:'''))
-    top10 = ','.join(correlation_df.sort_values(by='Pearson_Correlation',ascending=True).head(10)['Feature'])
-    content.append(Graphs.draw_text(f'''{top10}'''))
-    content.append(Graphs.draw_text('''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。'''))
+    相关系数为1:表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。'''))
+    content.append(Graphs.draw_text('''当前特征中正相关前十的有:'''))
+    top10 = ','.join(correlation_df.sort_values(
+        by='Pearson_Correlation', ascending=False).head(10)['Feature'])
+    content.append(Graphs.draw_text(f'''{top10}'''))
+    content.append(Graphs.draw_text(
+        '''相关系数为-1:表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的'''))
+    content.append(Graphs.draw_text('''当前特征中负相关前十的有:'''))
+    top10 = ','.join(correlation_df.sort_values(
+        by='Pearson_Correlation', ascending=True).head(10)['Feature'])
+    content.append(Graphs.draw_text(f'''{top10}'''))
+    content.append(Graphs.draw_text(
+        '''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。'''))
     content.append(Graphs.draw_text(f'指标相关性分析--斯皮尔曼相关系数:'))
     # 皮尔逊正相关 不相关 负相关 的表格
-    content.append(Graphs.draw_img(os.path.join(dataset,'斯皮尔曼相关性系数.png')))
-    content.append(Graphs.draw_text('斯皮尔曼相关系数(Spearmans rank correlation coefficient)是一种用于衡量两个变量之间的单调关系(不一定是线性关系)的统计指标。'))
+    content.append(Graphs.draw_img(os.path.join(dataset, '斯皮尔曼相关性系数.png')))
+    content.append(Graphs.draw_text(
+        '斯皮尔曼相关系数(Spearmans rank correlation coefficient)是一种用于衡量两个变量之间的单调关系(不一定是线性关系)的统计指标。'))
     content.append(Graphs.draw_text('它的计算基于变量的秩次(即变量值的排序位置)而非变量的原始值。'))
     content.append(Graphs.draw_text('斯皮尔曼相关系数的取值范围在 -1 到 1 之间。'))
     content.append(Graphs.draw_text('当系数为 1 时,表示两个变量之间存在完全正的单调关系;'))
-    content.append(Graphs.draw_text('''当前特征中正单调关系前十的有:'''))
-    top10 = ','.join(correlation_df.sort_values(by='Spearman_Correlation',ascending=False).head(10)['Feature'])
-    content.append(Graphs.draw_text(f'''{top10}'''))
+    content.append(Graphs.draw_text('''当前特征中正单调关系前十的有:'''))
+    top10 = ','.join(correlation_df.sort_values(
+        by='Spearman_Correlation', ascending=False).head(10)['Feature'])
+    content.append(Graphs.draw_text(f'''{top10}'''))
     content.append(Graphs.draw_text('当系数为 -1 时,表示存在完全负的单调关系;'))
-    content.append(Graphs.draw_text('''当前特征中负单调关系前十的有:'''))
-    top10 = ','.join(correlation_df.sort_values(by='Spearman_Correlation',ascending=True).head(10)['Feature'])
-    content.append(Graphs.draw_text(f'''{top10}'''))
+    content.append(Graphs.draw_text('''当前特征中负单调关系前十的有:'''))
+    top10 = ','.join(correlation_df.sort_values(
+        by='Spearman_Correlation', ascending=True).head(10)['Feature'])
+    content.append(Graphs.draw_text(f'''{top10}'''))
     content.append(Graphs.draw_text('当系数为 0 时,表示两个变量之间不存在单调关系。'))
-    content.append(Graphs.draw_text('与皮尔逊相关系数相比,斯皮尔曼相关系数对于数据中的异常值不敏感,更适用于处理非线性关系或存在极端值的数据。'))
+    content.append(Graphs.draw_text(
+        '与皮尔逊相关系数相比,斯皮尔曼相关系数对于数据中的异常值不敏感,更适用于处理非线性关系或存在极端值的数据。'))
     content.append(Graphs.draw_little_title('模型选择:'))
     content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练拟合,模型的简介如下:'))
-    ### 读取模型简介
-    with open(os.path.join(dataset,'model_introduction.txt'), 'r', encoding='utf-8') as f:
+    # 读取模型简介
+    with open(os.path.join(dataset, 'model_introduction.txt'), 'r', encoding='utf-8') as f:
         for line in f:
             line_split = line.strip().split('--')
             # if line_split[0] in fivemodels_list:
@@ -3076,7 +3369,7 @@ def tansuanli_export_pdf(num_indicators=475,num_models=22, num_dayindicator=202,
     content.append(Graphs.draw_little_title('模型评估:'))
     content.append(Graphs.draw_text(f'通过评估指标MAE从小到大排列,前5个模型的评估详情如下:'))
-    df = loadcsv(os.path.join(dataset,'model_evaluation.csv'))
+    df = loadcsv(os.path.join(dataset, 'model_evaluation.csv'))
     # 判断 df 的数值列转为float
     for col in eval_df.columns:
         if col not in ['模型(Model)']:
@@ -3092,14 +3385,17 @@ def tansuanli_export_pdf(num_indicators=475,num_models=22, num_dayindicator=202,
     # # 添加表格
     data = eval_df.values.tolist()
     col_width = 500/len(eval_df.columns)
-    content.append(Graphs.draw_table(col_width,*data))
+    content.append(Graphs.draw_table(col_width, *data))
     content.append(Graphs.draw_text('评估指标释义:'))
-    content.append(Graphs.draw_text('1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,先计算预测值与真实值的差值的平方,然后对这些平方差求平均值,最后取平均值的平方根。取值越小,误差越小,预测效果越好。'))
-    content.append(Graphs.draw_text('2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,对预测值与真实值之间差值的绝对值进行求和,然后除以样本数量。取值越小,误差越小,预测效果越好。'))
-    content.append(Graphs.draw_text('3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,先计算预测值与真实值之差的平方,然后对这些平方差求平均值。取值越小,误差越小,预测效果越好。'))
+    content.append(Graphs.draw_text(
+        '1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,先计算预测值与真实值的差值的平方,然后对这些平方差求平均值,最后取平均值的平方根。取值越小,误差越小,预测效果越好。'))
+    content.append(Graphs.draw_text(
+        '2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,对预测值与真实值之间差值的绝对值进行求和,然后除以样本数量。取值越小,误差越小,预测效果越好。'))
+    content.append(Graphs.draw_text(
+        '3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,先计算预测值与真实值之差的平方,然后对这些平方差求平均值。取值越小,误差越小,预测效果越好。'))
     content.append(Graphs.draw_text('模型拟合:'))
     # 添加图片
-    content.append(Graphs.draw_img(os.path.join(dataset,'预测值与真实值对比图.png')))
-    ### 生成pdf文件
-    doc = SimpleDocTemplate(os.path.join(dataset,reportname), pagesize=letter)
+    content.append(Graphs.draw_img(os.path.join(dataset, '预测值与真实值对比图.png')))
+    # 生成pdf文件
+    doc = SimpleDocTemplate(os.path.join(dataset, reportname), pagesize=letter)
     doc.build(content)