From c76cebdec6e948fa912c32a448057e9db97d0213 Mon Sep 17 00:00:00 2001 From: workpc Date: Fri, 1 Nov 2024 16:38:21 +0800 Subject: [PATCH] =?UTF-8?q?py=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aa copy.py | 55 ++ aa.py | 10 + config_jingbo.py | 254 ++++++ config_juxiting.py | 297 +++++++ config_tansuanli.py | 106 +++ lib/__init__.py | 0 lib/dataread.py | 1391 ++++++++++++++++++++++++++++++ lib/duojinchengpredict.py | 191 +++++ lib/tools.py | 448 ++++++++++ main.py | 176 ++++ maincanshu.py | 123 +++ models/grumodels.py | 164 ++++ models/lstmmodels.py | 255 ++++++ models/nerulforcastmodels.py | 1519 +++++++++++++++++++++++++++++++++ pushdata.py | 104 +++ 八个维度demo copy.py | 62 ++ 八个维度demo.py | 200 +++++ 原油预测定时任务,请勿关闭.py | 14 + 18 files changed, 5369 insertions(+) create mode 100644 aa copy.py create mode 100644 aa.py create mode 100644 config_jingbo.py create mode 100644 config_juxiting.py create mode 100644 config_tansuanli.py create mode 100644 lib/__init__.py create mode 100644 lib/dataread.py create mode 100644 lib/duojinchengpredict.py create mode 100644 lib/tools.py create mode 100644 main.py create mode 100644 maincanshu.py create mode 100644 models/grumodels.py create mode 100644 models/lstmmodels.py create mode 100644 models/nerulforcastmodels.py create mode 100644 pushdata.py create mode 100644 八个维度demo copy.py create mode 100644 八个维度demo.py create mode 100644 原油预测定时任务,请勿关闭.py diff --git a/aa copy.py b/aa copy.py new file mode 100644 index 0000000..0469e92 --- /dev/null +++ b/aa copy.py @@ -0,0 +1,55 @@ +# 统计特征频度 + +# 读取文件 +import pandas as pd +df = pd.read_csv("D:\code\huarongqiming\碳酸锂合并数据.csv",encoding='gbk') +df['ds'] = pd.to_datetime(df['ds']) +# 按ds正序排序,重置索引 +df = df.sort_values(by='ds', ascending=True).reset_index(drop=True) + +# 统计特征频度 +# 每列随机抽取6个值,计算出5个时间间隔,统计每个时间间隔的频度 +columns = df.columns.to_list() +columns.remove('ds') +count_dict = {} +for column in columns: + # 获取每列时间间隔 + values = df[[column,'ds']] + values.dropna(inplace=True,axis=0) + values=values.reset_index(drop=True) + + # 抽取10个值 + value = values.sample(10) + index = value.index + next_index = index + 1 + count = [] + for i,j in zip(index, next_index): + #通过索引计算日期差 + try: + count.append((values.loc[j,'ds'] - values.loc[i,'ds']).days) + except: + pass + # 把31 换成 30 + count = [30 if i == 31 else i for i in count] + # 保留count中出现次数最多的数 + count = max(set(count), key=count.count) + # 存储到字典中 + count_dict[column] = count + +df = pd.DataFrame(count_dict,index=['count']).T +pindu_dfs = pd.DataFrame() +# 根据count分组 +# 输出特征频度统计 +pindudict = {'1':'日度','7':'周度','30':'月度','90':'季度','180':'半年度','365':'年度'} +for i in df.groupby('count'): + # 获取 i[1] 的索引值 + index = i[1].index + pindu_df = pd.DataFrame() + pindu_df[pindudict[str(i[0])]+f'({len(i[1])})'] = index + # 合并到pindu_dfs + pindu_dfs = pd.concat([pindu_dfs,pindu_df],axis=1) +# nan替换为 ' ' +pindu_dfs = pindu_dfs.fillna('') +pindu_dfs.to_csv('D:\code\huarongqiming\pindu.csv',index=False) +print(pindu_dfs) +print('*'*200) \ No newline at end of file diff --git a/aa.py b/aa.py new file mode 100644 index 0000000..e71435f --- /dev/null +++ b/aa.py @@ -0,0 +1,10 @@ +# 定时执行cmd命令 +import os +import time + +while True: + print(time.strftime('%H:%M')) + # 判断是不是工作日且 是17:00 7:00 才执行 + if time.strftime('%A') not in ['Saturday', 'Sunday'] and time.strftime('%H:%M') in [ '07:00']: + os.system(' D:/ProgramData/anaconda3/python.exe main.py') + time.sleep(60) \ No newline at end of file diff --git a/config_jingbo.py 
b/config_jingbo.py new file mode 100644 index 0000000..adde041 --- /dev/null +++ b/config_jingbo.py @@ -0,0 +1,254 @@ +import logging +import os +import logging.handlers +import datetime + + +# eta 接口token +APPID = "XNLDvxZHHugj7wJ7" +SECRET = "iSeU4s6cKKBVbt94htVY1p0sqUMqb2xa" + +# eta 接口url +sourcelisturl = 'http://10.189.2.78:8108/v1/edb/source/list' +classifylisturl = 'http://10.189.2.78:8108/v1/edb/classify/list?ClassifyType=' +uniquecodedataurl = 'http://10.189.2.78:8108/v1/edb/data?UniqueCode=4991c37becba464609b409909fe4d992&StartDate=2024-02-01' +classifyidlisturl = 'http://10.189.2.78:8108/v1/edb/list?ClassifyId=' +edbcodedataurl = 'http://10.189.2.78:8108/v1/edb/data?EdbCode=' +edbdatapushurl = 'http://10.189.2.78:8108/v1/edb/push' +edbdeleteurl = 'http://10.189.2.78:8108/v1/edb/business/edb/del' +edbbusinessurl = 'http://10.189.2.78:8108/v1/edb/business/data/del' +edbcodelist = ['CO1 Comdty', 'ovx index', 'C2404194834', 'C2404199738', 'dxy curncy', 'C2403128043', 'C2403150124', + 'DOESCRUD Index', 'WTRBM1 EEGC Index', 'FVHCM1 INDEX', 'doedtprd index', 'CFFDQMMN INDEX', + 'C2403083739', 'C2404167878', 'C2403250571', 'lmcads03 lme comdty', 'GC1 COMB Comdty', + 'C2404171822','C2404167855'] + +# 临时写死用指定的列,与上面的edbcode对应,后面更改 +edbnamelist = [ + 'ds','y', + 'Brent c1-c6','Brent c1-c3','Brent-WTI','美国商业原油库存', + 'DFL','美国汽油裂解价差','ovx index','dxy curncy','lmcads03 lme comdty', + 'C2403128043','C2403150124','FVHCM1 INDEX','doedtprd index','CFFDQMMN INDEX', + 'C2403083739','C2404167878', + 'GC1 COMB Comdty','C2404167855' + ] + + + +# eta自有数据指标编码 +modelsindex = { + 'NHITS': 'SELF0000001', + 'Informer':'SELF0000057', + 'LSTM':'SELF0000058', + 'iTransformer':'SELF0000059', + 'TSMixer':'SELF0000060', + 'TSMixerx':'SELF0000061', + 'PatchTST':'SELF0000062', + 'RNN':'SELF0000063', + 'GRU':'SELF0000064', + 'TCN':'SELF0000065', + 'BiTCN':'SELF0000066', + 'DilatedRNN':'SELF0000067', + 'MLP':'SELF0000068', + 'DLinear':'SELF0000069', + 'NLinear':'SELF0000070', + 'TFT':'SELF0000071', + 'FEDformer':'SELF0000072', + 'StemGNN':'SELF0000073', + 'MLPMultivariate':'SELF0000074', + 'TiDE':'SELF0000075', + 'DeepNPTS':'SELF0000076' + } + +# eta 上传预测结果的请求体,后面发起请求的时候更改 model datalist 数据 +data = { + "IndexCode": "", + "IndexName": "价格预测模型", + "Unit": "无", + "Frequency": "日度", + "SourceName": f"价格预测", + "Remark": 'ddd', + "DataList": [ + { + "Date": "2024-05-02", + "Value": 333444 + } + ] + } + +# eta 分类 +# level:3才可以获取到数据,所以需要人工把能源化工下所有的level3级都找到 + # url = 'http://10.189.2.78:8108/v1/edb/list?ClassifyId=1214' + #ParentId ":1160, 能源化工 + # ClassifyId ":1214,原油 + #ParentId ":1214,",就是原油下所有的数据。 +ClassifyId = 1214 + + +### 报告上传配置 +# 变量定义--线上环境 +# login_pushreport_url = "http://10.200.32.39/jingbo-api/api/server/login" +# upload_url = "http://10.200.32.39/jingbo-api/api/analysis/reportInfo/researchUploadReportSave" + +# login_data = { +# "data": { +# "account": "api_dev", +# "password": "ZTEwYWRjMzk0OWJhNTlhYmJlNTZlMDU3ZjIwZjg4M2U=", +# "tenantHashCode": "8a4577dbd919675758d57999a1e891fe", +# "terminal": "API" +# }, +# "funcModule": "API", +# "funcOperation": "获取token" +# } + + + +# upload_data = { +# "funcModule":'研究报告信息', +# "funcOperation":'上传原油价格预测报告', +# "data":{ +# "ownerAccount":'27663', #报告所属用户账号 27663 - 刘小朋 +# "reportType":'OIL_PRICE_FORECAST', # 报告类型,固定为OIL_PRICE_FORECAST +# "fileName": '', #文件名称 +# "fileBase64": '' ,#文件内容base64 +# "categoryNo":'yyjgycbg', # 研究报告分类编码 +# "smartBusinessClassCode":'YCJGYCBG', #分析报告分类编码 +# "reportEmployeeCode":"E40482" ,# 报告人 E40482 - 管理员 0000027663 - 刘小朋 +# "reportDeptCode" 
:"002000621000", # 报告部门 - 002000621000 SH期货研究部 +# "productGroupCode":"RAW_MATERIAL" # 商品分类 +# } +# } + + + +# # 变量定义--测试环境 +login_pushreport_url = "http://192.168.100.53:8080/jingbo-dev/api/server/login" +upload_url = "http://192.168.100.53:8080/jingbo-dev/api/analysis/reportInfo/researchUploadReportSave" +# upload_url = "http://192.168.100.109:8080/jingbo/api/analysis/reportInfo/researchUploadReportSave" # zhaoqiwei + + +login_data = { + "data": { + "account": "api_test", + "password": "MmVmNzNlOWI0MmY0ZDdjZGUwNzE3ZjFiMDJiZDZjZWU=", + "tenantHashCode": "8a4577dbd919675758d57999a1e891fe", + "terminal": "API" + }, + "funcModule": "API", + "funcOperation": "获取token" +} + +upload_data = { + "funcModule":'研究报告信息', + "funcOperation":'上传原油价格预测报告', + "data":{ + "ownerAccount":'arui', #报告所属用户账号 + "reportType":'OIL_PRICE_FORECAST', # 报告类型,固定为OIL_PRICE_FORECAST + "fileName": '2000-40-5-50--100-原油指标数据.xlsx-Brent活跃合约--2024-09-06-15-01-29-预测报告.pdf', #文件名称 + "fileBase64": '' ,#文件内容base64 + "categoryNo":'yyjgycbg', # 研究报告分类编码 + "smartBusinessClassCode":'YCJGYCBG', #分析报告分类编码 + "reportEmployeeCode":"E40116", # 报告人 + "reportDeptCode" :"D0044" ,# 报告部门 + "productGroupCode":"RAW_MATERIAL" # 商品分类 + } +} + + +### 线上开关 +# is_train = True # 是否训练 +# is_debug = False # 是否调试 +# is_eta = True # 是否使用eta接口 +# is_timefurture = True # 是否使用时间特征 +# is_fivemodels = False # 是否使用之前保存的最佳的5个模型 +# is_edbcode = False # 特征使用edbcoding列表中的 +# is_edbnamelist = False # 自定义特征,对应上面的edbnamelist +# is_update_eta = True # 预测结果上传到eta +# is_update_report = True # 是否上传报告 + +### 开关 +is_train = True # 是否训练 +is_debug = False # 是否调试 +is_eta = True # 是否使用eta接口 +is_timefurture = True # 是否使用时间特征 +is_fivemodels = False # 是否使用之前保存的最佳的5个模型 +is_edbcode = False # 特征使用edbcoding列表中的 +is_edbnamelist = False # 自定义特征,对应上面的edbnamelist +is_update_eta = False # 预测结果上传到eta +is_update_report = False # 是否上传报告 + + +# 数据截取日期 +end_time = '' # 数据截取日期 +delweekenday = True +is_corr = False # 特征是否参与滞后领先提升相关系数 +add_kdj = False # 是否添加kdj指标 +if add_kdj and is_edbnamelist: + edbnamelist = edbnamelist+['K','D','J'] +### 模型参数 +y = 'Brent活跃合约' # 原油指标数据的目标变量 +# y = '期货结算价(连续):布伦特原油:前一个观测值' # ineoil的目标变量 +horizon =5 # 预测的步长 +input_size = 40 # 输入序列长度 +train_steps = 50 if is_debug else 1000 # 训练步数,用来限定epoch次数 +val_check_steps = 30 # 评估频率 +early_stop_patience_steps = 5 # 早停的耐心步数 +# --- 交叉验证用的参数 +test_size = 200 # 测试集大小,定义100,后面使用的时候重新赋值 +val_size = test_size # 验证集大小,同测试集大小 + +### 特征筛选用到的参数 +k = 100 # 特征筛选数量,如果是0或者值比特征数量大,代表全部特征 + + + +### 文件 +data_set = '原油指标数据.xlsx' # 数据集文件 +# data_set = 'INE_OIL(1).csv' +### 文件夹 +dataset = 'dataset' # 数据集文件夹 + +# 数据库名称 +db_name = os.path.join(dataset,'jbsh_yuanyou.db') + +settings = f'{input_size}-{horizon}-{train_steps}--{k}-{data_set}-{y}' +# 获取日期时间 +now = datetime.datetime.now().strftime('%Y%m%d%H%M%S') # 获取当前日期时间 +reportname = f'Brent原油大模型预测--{now}.pdf' # 报告文件名 +reportname = reportname.replace(':', '-') # 替换冒号 + +### 邮件配置 +username='1321340118@qq.com' +passwd='wgczgyhtyyyyjghi' +# recv=['liurui_test@163.com','52585119@qq.com'] +recv=['liurui_test@163.com'] +title='reportname' +content='brent价格预测报告请看附件' +file=os.path.join(dataset,'reportname') +# file=os.path.join(dataset,'14-7-50--100-原油指标数据.xlsx-Brent连1合约价格--20240731175936-预测报告.pdf') +ssl=True + + +### 日志配置 + +# 创建日志目录(如果不存在) +log_dir = 'logs' +if not os.path.exists(log_dir): + os.makedirs(log_dir) + +# 配置日志记录器 +logger = logging.getLogger('my_logger') +logger.setLevel(logging.INFO) + +# 配置文件处理器,将日志记录到文件 +file_handler = logging.handlers.RotatingFileHandler(os.path.join(log_dir, 
'pricepredict.log'), maxBytes=1024 * 1024, backupCount=5) +file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + +# 配置控制台处理器,将日志打印到控制台 +console_handler = logging.StreamHandler() +console_handler.setFormatter(logging.Formatter('%(message)s')) + +# 将处理器添加到日志记录器 +logger.addHandler(file_handler) +logger.addHandler(console_handler) + +# logger.info('当前配置:'+settings) \ No newline at end of file diff --git a/config_juxiting.py b/config_juxiting.py new file mode 100644 index 0000000..ad56a94 --- /dev/null +++ b/config_juxiting.py @@ -0,0 +1,297 @@ +import logging +import os +import logging.handlers +import datetime + + +# eta 接口token +APPID = "XNLDvxZHHugj7wJ7" +SECRET = "iSeU4s6cKKBVbt94htVY1p0sqUMqb2xa" + +# eta 接口url +sourcelisturl = 'http://10.189.2.78:8108/v1/edb/source/list' +classifylisturl = 'http://10.189.2.78:8108/v1/edb/classify/list?ClassifyType=' +uniquecodedataurl = 'http://10.189.2.78:8108/v1/edb/data?UniqueCode=4991c37becba464609b409909fe4d992&StartDate=2024-02-01' +classifyidlisturl = 'http://10.189.2.78:8108/v1/edb/list?ClassifyId=' +edbcodedataurl = 'http://10.189.2.78:8108/v1/edb/data?EdbCode=' +edbdatapushurl = 'http://10.189.2.78:8108/v1/edb/push' +edbdeleteurl = 'http://10.189.2.78:8108/v1/edb/business/edb/del' +edbbusinessurl = 'http://10.189.2.78:8108/v1/edb/business/data/del' +edbcodelist = ['ID01385938','lmcads03 lme comdty', +'GC1 COMB Comdty', +'C2404171822', +'dxy curncy', +'S5443199 ', +'S5479800', +'S5443108', +'H7358586', +'LC3FM1 INDEX', +'CNY REGN Curncy', +'s0105897', +'M0067419', +'M0066351', +'S0266372', +'S0266438', +'S0266506'] + +# 临时写死用指定的列,与上面的edbcode对应,后面更改 +edbnamelist = [ + 'ds','y', + 'LME铜价', + '黄金连1合约', + 'Brent-WTI', + '美元指数', + '甲醇鲁南价格', + '甲醇太仓港口价格', + '山东丙烯主流价', + '丙烷(山东)', + 'FEI丙烷 M1', + '在岸人民币汇率', + '南华工业品指数', + 'PVC期货主力', + 'PE期货收盘价', +'PP连续-1月', +'PP连续-5月', +'PP连续-9月', + ] + +edbcodenamedict = { +'ID01385938':'PP:拉丝:1102K:市场价:青州:国家能源宁煤(日)', +'lmcads03 lme comdty':'LME铜价', +'GC1 COMB Comdty':'黄金连1合约', +'C2404171822':'Brent-WTI', +'dxy curncy':'美元指数', +'S5443199 ':'甲醇鲁南价格', +'S5479800':'甲醇太仓港口价格', +'S5443108':'山东丙烯主流价', +'H7358586':'丙烷(山东)', +'LC3FM1 INDEX':'FEI丙烷 M1', +'CNY REGN Curncy':'在岸人民币汇率', +'s0105897':'南华工业品指数', +'M0067419':'PVC期货主力', +'M0066351':'PE期货收盘价', +'S0266372':'PP连续-1月', +'S0266438':'PP连续-5月', +'S0266506':'PP连续-9月', + +} + +# eta自有数据指标编码 +modelsindex = { + 'NHITS': 'SELF0000001', + 'Informer':'SELF0000057', + 'LSTM':'SELF0000058', + 'iTransformer':'SELF0000059', + 'TSMixer':'SELF0000060', + 'TSMixerx':'SELF0000061', + 'PatchTST':'SELF0000062', + 'RNN':'SELF0000063', + 'GRU':'SELF0000064', + 'TCN':'SELF0000065', + 'BiTCN':'SELF0000066', + 'DilatedRNN':'SELF0000067', + 'MLP':'SELF0000068', + 'DLinear':'SELF0000069', + 'NLinear':'SELF0000070', + 'TFT':'SELF0000071', + 'FEDformer':'SELF0000072', + 'StemGNN':'SELF0000073', + 'MLPMultivariate':'SELF0000074', + 'TiDE':'SELF0000075', + 'DeepNPTS':'SELF0000076' + } + +# eta 上传预测结果的请求体,后面发起请求的时候更改 model datalist 数据 +data = { + "IndexCode": "", + "IndexName": "价格预测模型", + "Unit": "无", + "Frequency": "日度", + "SourceName": f"价格预测", + "Remark": 'ddd', + "DataList": [ + { + "Date": "2024-05-02", + "Value": 333444 + } + ] + } + +# eta 分类 +# level:3才可以获取到数据,所以需要人工把能源化工下所有的level3级都找到 + # url = 'http://10.189.2.78:8108/v1/edb/list?ClassifyId=1214' + #ParentId ":1160, 能源化工 + # ClassifyId ":1214,原油 ,1161 PP + #ParentId ":1214,",就是原油下所有的数据。 +ClassifyId = 1161 + + +### 报告上传配置 +# 变量定义--线上环境 +login_pushreport_url = 
"http://10.200.32.39/jingbo-api/api/server/login" +upload_url = "http://10.200.32.39/jingbo-api/api/dw/dataValue/pushDataValueList" + + +login_data = { + "data": { + "account": "api_dev", + "password": "ZTEwYWRjMzk0OWJhNTlhYmJlNTZlMDU3ZjIwZjg4M2U=", + "tenantHashCode": "8a4577dbd919675758d57999a1e891fe", + "terminal": "API" + }, + "funcModule": "API", + "funcOperation": "获取token" +} + + + +upload_data = { + "funcModule":'研究报告信息', + "funcOperation":'上传原油价格预测报告', + "data":{ + "ownerAccount":'27663', #报告所属用户账号 27663 - 刘小朋 + "reportType":'OIL_PRICE_FORECAST', # 报告类型,固定为OIL_PRICE_FORECAST + "fileName": '', #文件名称 + "fileBase64": '' ,#文件内容base64 + "categoryNo":'yyjgycbg', # 研究报告分类编码 + "smartBusinessClassCode":'YCJGYCBG', #分析报告分类编码 + "reportEmployeeCode":"E40482" ,# 报告人 E40482 - 管理员 0000027663 - 刘小朋 + "reportDeptCode" :"002000621000", # 报告部门 - 002000621000 SH期货研究部 + "productGroupCode":"RAW_MATERIAL" # 商品分类 + } +} + + + +# # 变量定义--测试环境 +# login_pushreport_url = "http://192.168.100.53:8080/jingbo-dev/api/server/login" +# upload_url = "http://192.168.100.53:8080/jingbo-dev/api/analysis/reportInfo/researchUploadReportSave" +# # upload_url = "http://192.168.100.109:8080/jingbo/api/analysis/reportInfo/researchUploadReportSave" # zhaoqiwei + + +# login_data = { +# "data": { +# "account": "api_test", +# "password": "MmVmNzNlOWI0MmY0ZDdjZGUwNzE3ZjFiMDJiZDZjZWU=", +# "tenantHashCode": "8a4577dbd919675758d57999a1e891fe", +# "terminal": "API" +# }, +# "funcModule": "API", +# "funcOperation": "获取token" +# } + +# upload_data = { +# "funcModule":'研究报告信息', +# "funcOperation":'上传原油价格预测报告', +# "data":{ +# "ownerAccount":'arui', #报告所属用户账号 +# "reportType":'OIL_PRICE_FORECAST', # 报告类型,固定为OIL_PRICE_FORECAST +# "fileName": '2000-40-5-50--100-原油指标数据.xlsx-Brent活跃合约--2024-09-06-15-01-29-预测报告.pdf', #文件名称 +# "fileBase64": '' ,#文件内容base64 +# "categoryNo":'yyjgycbg', # 研究报告分类编码 +# "smartBusinessClassCode":'YCJGYCBG', #分析报告分类编码 +# "reportEmployeeCode":"E40116", # 报告人 +# "reportDeptCode" :"D0044" ,# 报告部门 +# "productGroupCode":"RAW_MATERIAL" # 商品分类 +# } +# } + + +### 线上开关 +# is_train = True # 是否训练 +# is_debug = False # 是否调试 +# is_eta = True # 是否使用eta接口 +# is_timefurture = True # 是否使用时间特征 +# is_fivemodels = False # 是否使用之前保存的最佳的5个模型 +# is_edbcode = False # 特征使用edbcoding列表中的 +# is_edbnamelist = False # 自定义特征,对应上面的edbnamelist +# is_update_report = True # 是否上传报告 + + +### 开关 +is_train = True # 是否训练 +is_debug = False # 是否调试 +is_eta = True # 是否使用eta接口 +is_timefurture = True # 是否使用时间特征 +is_fivemodels = False # 是否使用之前保存的最佳的5个模型 +is_edbcode = False # 特征使用edbcoding列表中的 +is_edbnamelist = False # 自定义特征,对应上面的edbnamelist +is_update_eta = False # 预测结果上传到eta +is_update_report = False # 是否上传报告 + +# 数据截取日期 +end_time = '' # 数据截取日期 +delweekenday = True +is_corr = False # 特征是否参与滞后领先提升相关系数 +add_kdj = False # 是否添加kdj指标 +if add_kdj and is_edbnamelist: + edbnamelist = edbnamelist+['K','D','J'] +### 模型参数 +y = 'PP:拉丝:1102K:市场价:青州:国家能源宁煤(日)' # 原油指标数据的目标变量 +# y = '期货结算价(连续):布伦特原油:前一个观测值' # ineoil的目标变量 +horizon =5 # 预测的步长 +input_size = 40 # 输入序列长度 +train_steps = 50 if is_debug else 1000 # 训练步数,用来限定epoch次数 +val_check_steps = 30 # 评估频率 +early_stop_patience_steps = 5 # 早停的耐心步数 +# --- 交叉验证用的参数 +test_size = 200 # 测试集大小,定义100,后面使用的时候重新赋值 +val_size = test_size # 验证集大小,同测试集大小 + +### 特征筛选用到的参数 +k = 100 # 特征筛选数量,如果是0或者值比特征数量大,代表全部特征 + + + +### 文件 +data_set = 'PP指标数据.xlsx' # 数据集文件 +# data_set = 'INE_OIL(1).csv' +### 文件夹 +dataset = 'dataset' # 数据集文件夹 + +# 数据库名称 +db_name = os.path.join(dataset,'jbsh_juxiting.db') + + +settings = 
f'{input_size}-{horizon}-{train_steps}--{k}-{data_set}-{y}' +# 获取日期时间 +now = datetime.datetime.now().strftime('%Y%m%d%H%M%S') # 获取当前日期时间 +reportname = f'PP--{now}-预测报告.pdf' # 报告文件名 +reportname = reportname.replace(':', '-') # 替换冒号 + +### 邮件配置 +username='1321340118@qq.com' +passwd='wgczgyhtyyyyjghi' +# recv=['liurui_test@163.com','52585119@qq.com'] +recv=['liurui_test@163.com'] +title=reportname +content=y+'预测报告请看附件' +file=os.path.join(dataset,reportname) +# file=os.path.join(dataset,'14-7-50--100-原油指标数据.xlsx-Brent连1合约价格--20240731175936-预测报告.pdf') +ssl=True + + +### 日志配置 + +# 创建日志目录(如果不存在) +log_dir = 'logs' +if not os.path.exists(log_dir): + os.makedirs(log_dir) + +# 配置日志记录器 +logger = logging.getLogger('my_logger') +logger.setLevel(logging.INFO) + +# 配置文件处理器,将日志记录到文件 +file_handler = logging.handlers.RotatingFileHandler(os.path.join(log_dir, 'pricepredict.log'), maxBytes=1024 * 1024, backupCount=5) +file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + +# 配置控制台处理器,将日志打印到控制台 +console_handler = logging.StreamHandler() +console_handler.setFormatter(logging.Formatter('%(message)s')) + +# 将处理器添加到日志记录器 +logger.addHandler(file_handler) +logger.addHandler(console_handler) + +# logger.info('当前配置:'+settings) \ No newline at end of file diff --git a/config_tansuanli.py b/config_tansuanli.py new file mode 100644 index 0000000..128ca34 --- /dev/null +++ b/config_tansuanli.py @@ -0,0 +1,106 @@ +import logging +import os +import logging.handlers + + +# eta 接口token +APPID = "XNLDvxZHHugj7wJ7" +SECRET = "iSeU4s6cKKBVbt94htVY1p0sqUMqb2xa" + +# eta 接口url +sourcelisturl = 'http://10.189.2.78:8108/v1/edb/source/list' +classifylisturl = 'http://10.189.2.78:8108/v1/edb/classify/list?ClassifyType=' +uniquecodedataurl = 'http://10.189.2.78:8108/v1/edb/data?UniqueCode=4991c37becba464609b409909fe4d992&StartDate=2024-02-01' +classifyidlisturl = 'http://10.189.2.78:8108/v1/edb/list?ClassifyId=' +edbcodedataurl = 'http://10.189.2.78:8108/v1/edb/data?EdbCode=' +edbcodelist = ['CO1 Comdty', 'ovx index', 'C2404194834', 'C2404199738', 'dxy curncy', 'C2403128043', 'C2403150124', + 'DOESCRUD Index', 'WTRBM1 EEGC Index', 'FVHCM1 INDEX', 'doedtprd index', 'CFFDQMMN INDEX', + 'C2403083739', 'C2404167878', 'C2403250571', 'ovx index', 'lmcads03 lme comdty', 'GC1 COMB Comdty', + 'C2404171822'] + +# 临时写死用指定的列,与上面的edbcode对应,后面更改 +edbnamelist = [ + 'ds','y', + 'Brent c1-c6','Brent c1-c3','Brent-WTI','美国商业原油库存', + 'DFL','美国汽油裂解价差','ovx index','dxy curncy','lmcads03 lme comdty', + 'C2403128043','C2403150124','FVHCM1 INDEX','doedtprd index','CFFDQMMN INDEX', + 'C2403083739','C2404167878', + # 'ovx index', + 'GC1 COMB Comdty' + ] + +# eta 分类 +# level:3才可以获取到数据,所以需要人工把能源化工下所有的level3级都找到 + # url = 'http://10.189.2.78:8108/v1/edb/list?ClassifyId=1214' + #ParentId ":1160, 能源化工 + # ClassifyId ":1214,原油 + #ParentId ":1214,",就是原油下所有的数据。 +ClassifyId = 1214 + +### 开关 +is_train = True # 是否训练 +is_debug = True # 是否调试 +is_eta = False # 是否使用eta接口 +is_timefurture = False # 是否使用时间特征 +is_fivemodels = False # 是否使用之前保存的最佳的5个模型 +is_edbcode = False # 特征使用edbcoding列表中的 +is_edbnamelist = False # 启用指定的edbname,影响特征选择,只有数据集 原油时 才考虑启用 + +# 数据截取日期 +end_time = '2024-07-30' # 数据截取日期 +delweekenday = True # 是否删除周末数据 + +### 模型参数 +y = '电碳价格' # 原油指标数据的目标变量 +horizon =5 # 预测的步长 +input_size = 10 # 输入序列长度 +train_steps = 10 if is_debug else 1000 # 训练步数,用来限定epoch次数 +val_check_steps = 30 # 评估频率 +early_stop_patience_steps = 5 # 早停的耐心步数 +### --- 交叉验证用的参数 +test_size = 100 # 测试集大小,定义100,后面使用的时候重新赋值 +val_size = test_size # 验证集大小,同测试集大小 + 
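+# 用法示意(假设性示例):下面展示上面的 horizon、input_size、train_steps、
+# val_check_steps、early_stop_patience_steps、test_size、val_size 等参数
+# 通常如何传给 neuralforecast 做训练与滚动交叉验证;实际流程以 main.py 和
+# models 目录下的代码为准,demo_df 为虚构数据,仅作参数含义演示,默认不执行。
+if False:  # 仅示例,保持禁用,避免在配置模块中产生副作用
+    import numpy as np
+    import pandas as pd
+    from neuralforecast import NeuralForecast
+    from neuralforecast.models import NHITS
+
+    # neuralforecast 需要长表格式:unique_id / ds / y 三列
+    demo_df = pd.DataFrame({
+        'unique_id': 1,
+        'ds': pd.date_range('2018-01-01', periods=500, freq='D'),
+        'y': np.random.rand(500),
+    })
+    models = [NHITS(h=horizon, input_size=input_size,
+                    max_steps=train_steps,
+                    val_check_steps=val_check_steps,
+                    early_stop_patience_steps=early_stop_patience_steps)]
+    nf = NeuralForecast(models=models, freq='D')
+    # test_size / val_size 决定滚动评估窗口;test_size 需能被 horizon 整除
+    cv_df = nf.cross_validation(df=demo_df, val_size=val_size,
+                                test_size=test_size, n_windows=None)
+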
+### --- 特征筛选用到的参数 +k = 100 # 特征筛选数量,如果是0或者值比特征数量大,代表全部特征 + +### --- 文件 +data_set = '碳酸锂合并数据.csv' # 数据集文件 +### --- 文件夹 +dataset = 'dataset' # 数据集文件夹 +settings = f'{input_size}-{horizon}-{train_steps}--{k}-{data_set}-{y}' +import datetime +now = datetime.datetime.now().strftime('%Y%m%d%H%M%S') # 获取当前日期时间 +reportname = f'{settings}--{now}-预测报告.pdf' # 报告文件名 +reportname = reportname.replace(':', '-') # 替换冒号 + +### 邮件配置 +username='1321340118@qq.com' +passwd='wgczgyhtyyyyjghi' +# recv=['liurui_test@163.com','52585119@qq.com'] +recv=['liurui_test@163.com'] +title='reportname' +content='brent价格预测报告请看附件' +file=os.path.join(dataset,'reportname') +# file=os.path.join(dataset,'14-7-50--100-原油指标数据.xlsx-Brent连1合约价格--20240731175936-预测报告.pdf') +ssl=True + + +### --- 日志配置 +# 创建日志目录(如果不存在) +log_dir = 'logs' +if not os.path.exists(log_dir): + os.makedirs(log_dir) +# 配置日志记录器 +logger = logging.getLogger('my_logger') +logger.setLevel(logging.INFO) +# 配置文件处理器,将日志记录到文件 +file_handler = logging.handlers.RotatingFileHandler(os.path.join(log_dir, 'pricepredict.log'), maxBytes=1024 * 1024, backupCount=5) +file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) +# 配置控制台处理器,将日志打印到控制台 +console_handler = logging.StreamHandler() +console_handler.setFormatter(logging.Formatter('%(message)s')) +# 将处理器添加到日志记录器 +logger.addHandler(file_handler) +logger.addHandler(console_handler) +logger.info('当前配置:'+settings) diff --git a/lib/__init__.py b/lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/dataread.py b/lib/dataread.py new file mode 100644 index 0000000..3605848 --- /dev/null +++ b/lib/dataread.py @@ -0,0 +1,1391 @@ +# 导入模块 +import pandas as pd +import numpy as np +import datetime +import string +import base64 +import requests +import random +import time +import re +import os +import hmac +import hashlib +import json +import torch +torch.set_float32_matmul_precision("high") +import matplotlib.pyplot as plt +#设置plt显示中文 +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 + +from datetime import timedelta +# from config_jingbo import * +from config_juxiting import * +from sklearn import metrics +from reportlab.pdfbase import pdfmetrics # 注册字体 +from reportlab.pdfbase.ttfonts import TTFont # 字体类 +from reportlab.platypus import Table, SimpleDocTemplate, Paragraph, Image # 报告内容相关类 +from reportlab.lib.pagesizes import letter # 页面的标志尺寸(8.5*inch, 11*inch) +from reportlab.lib.styles import getSampleStyleSheet # 文本样式 +from reportlab.lib import colors # 颜色模块 +from reportlab.graphics.charts.barcharts import VerticalBarChart # 图表类 +from reportlab.graphics.charts.legends import Legend # 图例类 +from reportlab.graphics.shapes import Drawing # 绘图工具 +from reportlab.lib.units import cm # 单位:cm + +# 注册字体(提前准备好字体文件, 如果同一个文件需要多种字体可以注册多个) +pdfmetrics.registerFont(TTFont('SimSun', 'SimSun.ttf')) +#设置plt显示中文 +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 + +# 定义函数 +def loadcsv(filename): + # 读取csv文件 + try: + df = pd.read_csv(filename, encoding='utf-8') + except UnicodeDecodeError: + df = pd.read_csv(filename, encoding='gbk') + return df + +def dateConvert(df, datecol='ds'): + # 将date列转换为datetime类型 + try: + df[datecol] = pd.to_datetime(df[datecol],format=r'%Y-%m-%d') + except: + df[datecol] = pd.to_datetime(df[datecol],format=r'%Y/%m/%d') + return df + + +def calculate_kdj(data, n=9): + ''' + 给传进来的df 添加列: 波动率,最高,最低,k ,d ,j + ''' + data = 
data.sort_values(by='ds', ascending=True) + # 因为没有高开低价格,利用每日波动率模拟当天最高价和最低价 + data['pctchange'] = data['y'].pct_change() + # 收益为0的用0.01 + data['pctchange'] = data['pctchange'].replace(0,0.01) + data.dropna(inplace=True) + # 重置索引 + data.reset_index(drop=True,inplace=True) + data['high'] = data['y']* (1+abs(data['pctchange'])/2) + data['low'] = data['y']* (1-abs(data['pctchange'])/2) + low_list = data['y'].rolling(window=n, min_periods=1).min() + high_list = data['y'].rolling(window=n, min_periods=1).max() + rsv = ((data['y'] - low_list) / (high_list - low_list)) * 100 + k = pd.Series(50, index=data.index) + d = pd.Series(50, index=data.index) + for i in range(1, len(data)): + k[i] = (2/3 * k[i - 1]) + (1/3 * rsv[i]) + d[i] = (2/3 * d[i - 1]) + (1/3 * k[i]) + j = 3 * k - 2 * d + + data['K'] = k + data['D'] = d + data['J'] = j + # 将包含 KDJ 指标的数据保存到新的 CSV 文件 + data.to_csv('stock_data_with_kdj.csv', index=False) + # data = data.dropna() + return data + +# 上传报告 +def get_head_auth_report(): + login_res = requests.post(url=login_pushreport_url, json=login_data, timeout=(3, 5)) + text = json.loads(login_res.text) + if text["status"]: + token = text["data"]["accessToken"] + return token + + +def upload_report_data(token, upload_data): + upload_data = upload_data + headers = {"Authorization": token} + logger.info("报告上传中...") + logger.info(f"token:{token}") + logger.info(f"upload_data:{upload_data}" ) + upload_res = requests.post(url=upload_url, headers=headers, json=upload_data, timeout=(3, 15)) + upload_res = json.loads(upload_res.text) + logger.info(upload_res) + if upload_res: + return upload_res + else: + logger.info("报告上传失败") + return None + + + +# 统计特征频度 +def featurePindu(dataset): + # 读取文件 + df = loadcsv(os.path.join(dataset,'未填充的特征数据.csv')) + df['ds'] = pd.to_datetime(df['ds']) + # 按ds正序排序,重置索引 + df = df.sort_values(by='ds', ascending=True).reset_index(drop=True) + + # 统计特征频度 + # 每列随机抽取10个值,计算出5个时间间隔,统计每个时间间隔的频度 + columns = df.columns.to_list() + columns.remove('ds') + count_dict = {} + for column in columns: + # 获取每列时间间隔 + values = df[[column,'ds']] + values.dropna(inplace=True,axis=0) + values=values.reset_index(drop=True) + + # 抽取20%个值 + value = values.sample(frac=0.2) + index = value.index + next_index = index + 1 + count = [] + for i,j in zip(index, next_index): + #通过索引计算日期差 + try: + count.append((values.loc[j,'ds'] - values.loc[i,'ds']).days) + except: + pass + # 把31 换成 30 + count = [30 if i == 31 else i for i in count] + # 保留count中出现次数最多的数 + try: + count = max(set(count), key=count.count) + except ValueError : + logger.info(f'{column}列数据为空') + continue + # 存储到字典中 + count_dict[column] = count + + df = pd.DataFrame(count_dict,index=['count']).T + pindu_dfs = pd.DataFrame() + # 根据count分组 + # 输出特征频度统计 + pindudict = {'1':'日度','3':'日度','7':'周度','30':'月度','90':'季度','180':'半年度','365':'年度'} + for i in df.groupby('count'): + # 获取 i[1] 的索引值 + index = i[1].index + pindu_df = pd.DataFrame() + try: + pindu_df[pindudict[str(i[0])]+f'({len(i[1])})'] = index + except KeyError : + pindu_df[str(i[0])+f'天({len(i[1])})'] = index + # 合并到pindu_dfs + pindu_dfs = pd.concat([pindu_dfs,pindu_df],axis=1) + # nan替换为 ' ' + pindu_dfs = pindu_dfs.fillna('') + pindu_dfs.to_csv(os.path.join(dataset,'特征频度统计.csv'),index=False) + logger.info(pindu_dfs) + featureInfo = f'特征信息:总共有{len(columns)-2}个' + for i in pindu_dfs.columns: + featureInfo += f',{i}' + + featureInfo += ', 详看 附1、特征列表' + + featureInfo += ''' + 数据特征工程: + 1. 数据日期排序,新日期在最后 + 2. 删除空列,特征数据列没有值,就删除 + 3. 删除近两月不再更新值的指标 + 4. 
非日度数据填充为日度数据,填充规则: + -- 向后填充,举例:假设周五出现一个周度指标数据,那么在这之前的数据用上周五的数据 + -- 向前填充,举例:采集数据开始日期为2018年1月1日,那么周度数据可能是2018年1月3日,那么3日的数据向前填充,使1日2日都有数值 + 数据特征相关性分析: + ''' + logger.info(featureInfo) + with open(os.path.join(dataset,'特征频度统计.txt'), 'w', encoding='utf-8') as f: + f.write(featureInfo) + logger.info('*'*200) + + +def featureAnalysis(df,dataset,y): + # 特征筛选 + import matplotlib.pyplot as plt + # 选择特征和标签列 + X = df.drop(['ds', 'y'], axis=1) # 特征集,排除时间戳和标签列 + yy = df['y'] # 标签集 + + # 标签集自相关函数分析 + from statsmodels.graphics.tsaplots import plot_acf + plot_acf(yy, lags=30) + plt.savefig(os.path.join(dataset,'指标数据自相关图.png')) + plt.close() + + # 标签集偏自相关函数分析 + from statsmodels.graphics.tsaplots import plot_pacf + plot_pacf(yy, lags=30) + plt.savefig(os.path.join(dataset,'指标数据偏自相关图.png')) + plt.close() + + # 画 特征与价格散点图 + # 删除所有*散点图.png + for file in os.listdir(dataset): + if file.endswith("散点图.png"): + os.remove(os.path.join(dataset, file)) + plt.rcParams['font.sans-serif'] = ['SimHei'] + plt.rcParams['axes.unicode_minus'] = False + plt.figure(figsize=(10, 10)) + # 遍历X每一列,和yy画散点图 , + for i, col in enumerate(X.columns): + plt.subplot(2, 2, i%4+1) + plt.scatter(X[col], yy) + plt.xlabel(col) + plt.ylabel(y) + plt.title(col) + if i % 4 == 3 or i == len(X.columns)-1: + plt.tight_layout() + plt.savefig(os.path.join(dataset,f'{i}指标数据特征与价格散点图.png')) + plt.close() + + + +def corr_feature(df): + # 重新命名列名,列名排序,y在第一个 + df.reindex(['y'] + sorted(df.columns.difference(['y']))) + df_test = df.copy() + # 取最后的220行 + df_test = df_test.tail(220) + # 去掉日期列 + df_test = df_test.drop(columns=['ds']) + # 不参与标准化 + df_test_noscaler = df_test.copy() # 滞后处理备份 + df_noscaler = df_test.copy() + # 画出相关性热力图 + df_test.to_csv(os.path.join(dataset,'同步相关性.csv')) + corr = df_test.corr() + # 保存相关系数 + corr.to_csv(os.path.join(dataset,'同步相关性系数.csv')) + # plt.figure(figsize=(10, 10)) + # sns.heatmap(corr, annot=True, cmap='coolwarm') + # plt.savefig('dataset/同步相关性热力图.png') + # plt.show() + + # 读取滞后周期文件,更改特征 + characteristic_period = pd.read_csv('dataset/特征滞后周期.csv',encoding='utf-8') + # 去掉周期为0的行 + characteristic_period = characteristic_period.drop(characteristic_period[characteristic_period['滞后周期'] == 0].index) + for col in df.columns: + # 跳过y列 + if col in ['y']: + continue + # 特征滞后n个周期,计算与y的相关性 + if col in characteristic_period['特征'].values: + # 获取特征对应的周期 + period = characteristic_period[characteristic_period['特征'] == col]['滞后周期'].values[0] + # 滞后处理 + df[col] = df[col].shift(period) + df.to_csv(os.path.join(dataset,'滞后处理后的数据集.csv')) + + + # corr_feture_noscaler = {} # 保存相关性最大的周期 + # 遍历df_test的每一列,计算相关性 + # for col in df_noscaler.columns: + # # 跳过y列 + # if col in ['y']: + # continue + # logger.info('特征:', col) + # # 特征滞后n个周期,计算与y的相关性 + # corr_dict = {} + # try: + # for i in range(0, 200): + # if i == 0: + # df_noscaler[col+'_'+str(i)] = df_noscaler[col] + # else: + # df_noscaler[col+'_'+str(i)] = df_noscaler[col].shift(i) + # corr_dict[col+'_'+str(i)] = abs(df_noscaler[col+'_'+str(i)].corr(df_noscaler['y'])) + # except : + # logger.info('特征:', col, '滑动错误,请查看') + # continue + # 输出相关性最大的特征 + # logger.info(max(corr_dict, key=corr_dict.get), corr_dict[max(corr_dict, key=corr_dict.get)]) + # corr_feture_noscaler[col] = max(corr_dict, key=corr_dict.get).split('_')[-1] + # 画出最相关性最大的特征和y的折线图 + # plt.figure(figsize=(10, 5)) + # plt.plot(df_noscaler[max(corr_dict, key=corr_dict.get)], label=max(corr_dict, key=corr_dict.get)) + # # 设置双坐标轴 + # ax1 = plt.gca() + # ax2 = ax1.twinx() + # ax2.plot(df_noscaler['y'], color='r', label='y') + # plt.legend() + # try: 
+ # plt.savefig('dataset/特征与y的折线图_'+max(corr_dict, key=corr_dict.get)+'.png') + # except : + # # :替换成_ + # plt.savefig('dataset/特征与y的折线图_'+max(corr_dict, key=corr_dict.get).replace(':','_').replace('/','_').replace('(','_').replace(')','_')+'.png') + # plt.close() + # 结果保存到txt文件 + # logger.info('不参与标准化的特征滞后相关性写入txt文件') + # with open('dataset/不参与标准化的特征滞后相关性.txt', 'w') as f: + # for key, value in corr_feture_noscaler.items(): + # f.write('%s:%s\n' % (key, value)) + # 遍历corr_feture_noscaler,更改df + # colnames_noscaler = [] + # for col in corr_feture_noscaler: + # colname = col+'_'+corr_feture_noscaler[col] + # if int(corr_feture_noscaler[col]) == 0: + # continue + # df_test_noscaler[colname] = df_test_noscaler[col].shift(int(corr_feture_noscaler[col])) + # df_test_noscaler = df_test_noscaler.drop(columns=[col]) + # colnames_noscaler.append(colname) + # 去除有空值的行 + # df_test_noscaler = df_test_noscaler.dropna() + # df_test_noscaler.reindex(['y'] + sorted(df_test_noscaler.columns.difference(['y']))) + # df_test_noscaler.to_csv('dataset/不参与标准化的特征滞后相关性.csv', index=False) + # 画出相关性热力图 + # corr = df_test_noscaler.corr() + # 保存相关系数 + # corr.to_csv(os.path.join(dataset,'不参与标准化的特征滞后相关性系数.csv')) + # plt.figure(figsize=(10, 10)) + # sns.heatmap(corr, annot=True, cmap='coolwarm') + # plt.savefig('dataset/不参与标准化的特征滞后相关性热力图.png') + # plt.close() + # # 标准化每列 + # from sklearn.preprocessing import StandardScaler + # scaler = StandardScaler() + # df_test = pd.DataFrame(scaler.fit_transform(df_test), columns=df_test.columns) + # corr_feture = {} # 保存相关性最大的周期 + # # 遍历df_test的每一列,计算相关性 + # for col in df_test.columns: + # # 跳过y列 + # if col == 'y': + # continue + # logger.info('特征:', col) + # # 特征滞后n个周期,计算与y的相关性 + # corr_dict = {} + # try: + # for i in range(0, 200): + # if i == 0: + # df_test[col+'_'+str(i)] = df_test[col] + # else: + # df_test[col+'_'+str(i)] = df_test[col].shift(i) + # corr_dict[col+'_'+str(i)] = abs(df_test[col+'_'+str(i)].corr(df_test['y'])) + # except : + # logger.info('特征:', col, '滑动错误,请查看') + # continue + # # 输出相关性最大的特征 + # logger.info(max(corr_dict, key=corr_dict.get), corr_dict[max(corr_dict, key=corr_dict.get)]) + # corr_feture[col] = max(corr_dict, key=corr_dict.get).split('_')[-1] + + + # # 结果保存到txt文件 + # with open('dataset/标准化的特征滞后相关性.txt', 'w') as f: + # for key, value in corr_feture.items(): + # f.write('%s:%s\n' % (key, value)) + # # 遍历corr_feture,更改df + # colnames = [] + # for col in corr_feture: + # colname = col+'_'+corr_feture[col] + # if int(corr_feture[col]) == 0: + # continue + # df[colname] = df[col].shift(int(corr_feture[col])) + # df = df.drop(columns=[col]) + # colnames.append(colname) + # # 去除有空值的行 + # df = df.dropna() + # df.reindex(['y'] + sorted(df.columns.difference(['y']))) + # df.to_csv('dataset/标准化后的特征滞后相关性.csv', index=False) + # # 画出相关性热力图 + # ds = df['ds'] + # df = df.drop(columns=['ds']) + # corr = df.corr() + # # 保存相关系数 + # corr.to_csv(os.path.join(dataset,'标准化后的特征滞后相关性系数.csv')) + # plt.figure(figsize=(10, 10)) + # sns.heatmap(corr, annot=True, cmap='coolwarm') + # plt.savefig('dataset/标准化后的特征滞后相关性热力图.png') + # plt.show() + # df['ds'] = ds + + # 去除nan值 + df = df.dropna() + return df + + +def calculate_kdj(data, n=9): + ''' + 给传进来的df 添加列: 波动率,最高,最低,k ,d ,j + ''' + data = data.sort_values(by='ds', ascending=True) + # 因为没有高开低价格,利用每日波动率模拟当天最高价和最低价 + data['pctchange'] = data['y'].pct_change() + # 收益为0的用0.01 + data['pctchange'] = data['pctchange'].replace(0,0.01) + data.dropna(inplace=True) + # 重置索引 + data.reset_index(drop=True,inplace=True) + data['high'] = 
data['y']* (1+abs(data['pctchange'])/2) + data['low'] = data['y']* (1-abs(data['pctchange'])/2) + low_list = data['y'].rolling(window=n, min_periods=1).min() + high_list = data['y'].rolling(window=n, min_periods=1).max() + rsv = ((data['y'] - low_list) / (high_list - low_list)) * 100 + k = pd.Series(50, index=data.index) + d = pd.Series(50, index=data.index) + for i in range(1, len(data)): + k[i] = (2/3 * k[i - 1]) + (1/3 * rsv[i]) + d[i] = (2/3 * d[i - 1]) + (1/3 * k[i]) + j = 3 * k - 2 * d + + data['K'] = k + data['D'] = d + data['J'] = j + # 将包含 KDJ 指标的数据保存到新的 CSV 文件 + data.to_csv('dataset\stock_data_with_kdj.csv', index=False) + # data = data.dropna() + return data + + + +def datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y',dataset='dataset',delweekenday=False,add_kdj=False,is_timefurture=False): + df = df_zhibiaoshuju.copy() + if end_time == '': + end_time = datetime.datetime.now().strftime('%Y-%m-%d') + # date转为pddate + df.rename(columns={datecol:'ds'},inplace=True) + + # 重命名预测列 + df.rename(columns={y:'y'},inplace=True) + # 按时间顺序排列 + df.sort_values(by='ds',inplace=True) + df['ds'] = pd.to_datetime(df['ds']) + # 获取2018年到当前日期的数据 + df = df[df['ds'].dt.year >= 2018] + # 获取小于等于当前日期的数据 + df = df[df['ds'] <= end_time] + logger.info(f'删除两月不更新特征前数据量:{df.shape}') + # 去掉近最后数据对应的日期在两月以前的列,删除近2月的数据是常熟的列 + current_date = datetime.datetime.now() + two_months_ago = current_date - timedelta(days=40) + + def check_column(col_name): + if 'ds' in col_name or 'y' in col_name: + return False + df_check_column = df[['ds',col_name]] + df_check_column = df_check_column.dropna() + if len(df_check_column) == 0: + return True + if df_check_column[(df_check_column['ds']>= two_months_ago)].groupby(col_name).ngroups < 2: + return True + corresponding_date = df_check_column.iloc[-1]['ds'] + return corresponding_date < two_months_ago + columns_to_drop = df.columns[df.columns.map(check_column)].tolist() + df = df.drop(columns = columns_to_drop) + + logger.info(f'删除两月不更新特征后数据量:{df.shape}') + + # 删除预测列空值的行 + df = df.dropna(subset=['y']) + logger.info(f'删除预测列为空值的行后数据量:{df.shape}') + df = df.dropna(axis=1, how='all') + logger.info(f'删除全为空值的列后数据量:{df.shape}') + df.to_csv(os.path.join(dataset,'未填充的特征数据.csv'),index=False) + # 去掉指标列表中的columns_to_drop的行 + df_zhibiaoliebiao = df_zhibiaoliebiao[df_zhibiaoliebiao['指标名称'].isin(df.columns.tolist())] + df_zhibiaoliebiao.to_csv(os.path.join(dataset,'特征处理后的指标名称及分类.csv'),index=False) + # 频度分析 + featurePindu(dataset=dataset) + # 向上填充 + df = df.ffill() + # 向下填充 + df = df.bfill() + + # 删除周六日的数据 + if delweekenday: + df = df[df['ds'].dt.weekday < 5] + + if add_kdj: + df = calculate_kdj(df) + + if is_timefurture: + df = addtimecharacteristics(df=df,dataset=dataset) + + featureAnalysis(df,dataset=dataset,y=y) + return df + +def getdata(filename, datecol='date',y='y',dataset='',add_kdj=False,is_timefurture=False,end_time=''): + logger.info('getdata接收:'+filename+' '+datecol+' '+end_time) + # 判断后缀名 csv或excel + if filename.endswith('.csv'): + df = loadcsv(filename) + else: + # 读取excel 指标数据 + df_zhibiaoshuju = pd.read_excel(filename,sheet_name='指标数据') + df_zhibiaoliebiao = pd.read_excel(filename,sheet_name='指标列表') + + # 日期字符串转为datatime + df = datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol,y = y,dataset=dataset,add_kdj=add_kdj,is_timefurture=is_timefurture,end_time=end_time) + + return df + +# def filter_data(ClassifyName,data): +# ''' +# 指标名称保留规则 +# ''' + +# # 包含 关键词 去除, 返回flase +# if any(keyword in data for keyword in ['运费','检修','波动率','地缘政治','股价', +# 
'同比','环比','环差','裂差','4WMA','变频','道琼斯','标普500','纳斯达克', +# '四周均值','名占比','残差','DMA', +# '连7-连9','4周平均','4周均值','滚动相关性','日本']): +# return False + +# # 检查需要的特征 +# # 去掉 分析 分类下的数据 +# if ClassifyName == '分析': +# return False + +# # 保留 库存中特殊关键词 +# if ClassifyName == '库存': +# if any(keyword in data for keyword in ['原油' , '美国' ,'全球' ,'中国' ,'富查伊拉','ARA' ]): +# return True +# else: +# pass +# else: +# pass + +# # 去掉 持仓中不是基金的数据 +# if ClassifyName == '持仓': +# if '基金' not in data: +# return False +# else: +# pass +# else: +# pass + +# # 去掉 航班中不是中国、美国 的数据 +# if ClassifyName == '需求': +# if '航班' in data : +# if '中国' in data or '美国' in data : +# return True +# else: +# return False +# else: +# pass +# else: +# pass + +# # 分类为 期货市场,同质性数据取第一个 +# if ClassifyName == '期货市场': +# # 去掉c1-9 以后的 +# if 'c1-c' in data: +# try: +# c = int(data.split('c1-c')[1]) +# except: +# return False +# if c > 9 : +# return False +# else: +# pass + +# else: +# pass + +# # 判断 同质性数据, 字符串开头 +# strstartdict = {'ICE Brent c':"ICE Brent c14", +# 'NYMWX WTI c':"NYMWX WTI c5", +# 'INE SC c':"INE SC c1", +# 'EFS c':"EFS c", +# 'Dubai Swap c':"Dubai Swap c1", +# 'Oman Swap c':"Oman Swap c1", +# 'DME Oman c':"DME Oman c1", +# 'Murban Futures c':"Murban Futures c1", +# 'Dubai连合约价格':'Dubai连1合约价格', +# '美国RBOB期货月份合约价格':'美国RBOB期货2309月份合约价格', +# 'Brent连合约价格':'Brent连1合约价格', +# 'WTI连合约价格':'WTI连1合约价格', +# '布伦特连合约价格':'Brent连1合约价格', +# 'Brent 连合约价格':'Brent连1合约价格', +# 'Dubai连合约价格':'Dubai连1合约价格', +# 'Brent连':'Brent连1合约价格', +# 'brent连':'Brent连1合约价格', +# } +# # 判断名称字符串开头是否在 strstartdict.keys中 +# match = re.match(r'([a-zA-Z\s]+)(\d+)', data) +# if match: +# part1 = match.group(1) +# part2 = match.group(2) +# if part1 in [i for i in strstartdict.keys()]: +# if data == strstartdict[part1]: +# return True +# else: +# return False +# # data = 'Brent 连7合约价格' +# # 判断名称字符串去掉数字后是否在 strstartdict.keys中 +# match = re.findall(r'\D+', data) +# if match : +# if len(match) == 2: +# part1 = match[0] +# part2 = match[1] +# if part1+part2 in [i for i in strstartdict.keys()]: +# if data == strstartdict[part1+part2]: +# return True +# else: +# return False +# else: +# pass +# elif len(match) == 1: +# match = re.findall(r'\D+', data) +# part1 = match[0] + +# if part1 in [i for i in strstartdict.keys()]: +# if data == strstartdict[part1]: +# return True +# else: +# return False +# else: +# pass +# else: +# pass + +# return True + +def sanitize_filename(filename): + # 使用正则表达式替换不合规的字符 + # 这里我们替换为下划线'_',但你可以根据需要选择其他字符 + sanitized = re.sub(r'[\\/*?:"<>|\s]', '_', filename) + # 移除开头的点(在某些系统中,以点开头的文件可能是隐藏的) + sanitized = re.sub(r'^\.', '', sanitized) + # 如果需要,可以添加更多替换规则 + return sanitized + +class BinanceAPI: + ''' + 获取 Binance API 请求头签名 + ''' + def __init__(self, APPID, SECRET): + self.APPID = APPID + self.SECRET = SECRET + self.get_signature() + + # 生成随机字符串作为 nonce + def generate_nonce(self, length=32): + self.nonce = ''.join(random.choices(string.ascii_letters + string.digits, k=length)) + return self.nonce + + # 获取当前时间戳(秒) + def get_timestamp(self): + return int(time.time()) + + # 构建待签名字符串 + def build_sign_str(self): + return f'appid={self.APPID}&nonce={self.nonce}×tamp={self.timestamp}' + + # 使用 HMAC SHA-256 计算签名 + def calculate_signature(self, secret, message): + return base64.urlsafe_b64encode(hmac.new(secret.encode('utf-8'), message.encode('utf-8'), hashlib.sha256).digest()).decode('utf-8') + + def get_signature(self): + # 调用上述方法生成签名 + self.nonce = self.generate_nonce() + self.timestamp = self.get_timestamp() + self.sign_str = self.build_sign_str() + self.signature = 
self.calculate_signature(self.SECRET, self.sign_str) + # return self.signature +class Graphs: + # 绘制标题 + @staticmethod + def draw_title(title: str): + # 获取所有样式表 + style = getSampleStyleSheet() + # 拿到标题样式 + ct = style['Heading1'] + # 单独设置样式相关属性 + ct.fontName = 'SimSun' # 字体名 + ct.fontSize = 18 # 字体大小 + ct.leading = 50 # 行间距 + ct.textColor = colors.green # 字体颜色 + ct.alignment = 1 # 居中 + ct.bold = True + # 创建标题对应的段落,并且返回 + return Paragraph(title, ct) + + # 绘制小标题 + @staticmethod + def draw_little_title(title: str): + # 获取所有样式表 + style = getSampleStyleSheet() + # 拿到标题样式 + ct = style['Normal'] + # 单独设置样式相关属性 + ct.fontName = 'SimSun' # 字体名 + ct.fontSize = 15 # 字体大小 + ct.leading = 30 # 行间距 + ct.textColor = colors.red # 字体颜色 + # 创建标题对应的段落,并且返回 + return Paragraph(title, ct) + + # 绘制普通段落内容 + @staticmethod + def draw_text(text: str): + # 获取所有样式表 + style = getSampleStyleSheet() + # 获取普通样式 + ct = style['Normal'] + ct.fontName = 'SimSun' + ct.fontSize = 12 + ct.wordWrap = 'CJK' # 设置自动换行 + ct.alignment = 0 # 左对齐 + ct.firstLineIndent = 32 # 第一行开头空格 + ct.leading = 25 + return Paragraph(text, ct) + + # 绘制表格 + @staticmethod + def draw_table(*args): + # 列宽度 + col_width = args[0] + style = [ + ('FONTNAME', (0, 0), (-1, -1), 'SimSun'), # 字体 + ('FONTSIZE', (0, 0), (-1, 0), 12), # 第一行的字体大小 + ('FONTSIZE', (0, 1), (-1, -1), 10), # 第二行到最后一行的字体大小 + ('BACKGROUND', (0, 0), (-1, 0), '#d5dae6'), # 设置第一行背景颜色 + ('ALIGN', (0, 0), (-1, -1), 'CENTER'), # 第一行水平居中 + ('ALIGN', (0, 1), (-1, -1), 'LEFT'), # 第二行到最后一行左右左对齐 + ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'), # 所有表格上下居中对齐 + ('TEXTCOLOR', (0, 0), (-1, -1), colors.darkslategray), # 设置表格内文字颜色 + ('GRID', (0, 0), (-1, -1), 0.5, colors.grey), # 设置表格框线为grey色,线宽为0.5 + # ('SPAN', (0, 1), (0, 2)), # 合并第一列二三行 + # ('SPAN', (0, 3), (0, 4)), # 合并第一列三四行 + # ('SPAN', (0, 5), (0, 6)), # 合并第一列五六行 + # ('SPAN', (0, 7), (0, 8)), # 合并第一列五六行 + ] + table = Table(args[1:], colWidths=col_width, style=style) + return table + + # 创建图表 + @staticmethod + def draw_bar(bar_data: list, ax: list, items: list): + drawing = Drawing(500, 250) + bc = VerticalBarChart() + bc.x = 45 # 整个图表的x坐标 + bc.y = 45 # 整个图表的y坐标 + bc.height = 200 # 图表的高度 + bc.width = 350 # 图表的宽度 + bc.data = bar_data + bc.strokeColor = colors.black # 顶部和右边轴线的颜色 + bc.valueAxis.valueMin = 5000 # 设置y坐标的最小值 + bc.valueAxis.valueMax = 26000 # 设置y坐标的最大值 + bc.valueAxis.valueStep = 2000 # 设置y坐标的步长 + bc.categoryAxis.labels.dx = 2 + bc.categoryAxis.labels.dy = -8 + bc.categoryAxis.labels.angle = 20 + bc.categoryAxis.categoryNames = ax + + # 图示 + leg = Legend() + leg.fontName = 'SimSun' + leg.alignment = 'right' + leg.boxAnchor = 'ne' + leg.x = 475 # 图例的x坐标 + leg.y = 240 + leg.dxTextSpace = 10 + leg.columnMaximum = 3 + leg.colorNamePairs = items + drawing.add(leg) + drawing.add(bc) + return drawing + + # 绘制图片 + @staticmethod + def draw_img(path): + img = Image(path) # 读取指定路径下的图片 + img.drawWidth = 20*cm # 设置图片的宽度 + img.drawHeight = 10*cm # 设置图片的高度 + return img + +# 定义样式函数 +def style_row(row): + if '周' in row['频度']: + return ['background-color: yellow'] * len(row) + else: + return ['background-color: gray'] * len(row) + + + +class EtaReader(): + def __init__(self,signature,classifylisturl,classifyidlisturl,edbcodedataurl,edbcodelist,edbdatapushurl,edbdeleteurl,edbbusinessurl): + # 获取签名 + self.signature = signature + self.classifylisturl = classifylisturl + self.classifyidlisturl = classifyidlisturl + self.edbcodedataurl = edbcodedataurl + self.edbdatapushurl = edbdatapushurl + self.edbcodelist = edbcodelist + self.edbdeleteurl = edbdeleteurl + 
self.edbbusinessurl = edbbusinessurl + pass + + def filter_yuanyou_data(self,ClassifyName,data): + ''' + 指标名称保留规则 + ''' + + # 包含 关键词 去除, 返回flase + if any(keyword in data for keyword in ['运费','检修','波动率','地缘政治','股价', + '同比','环比','环差','裂差','4WMA','变频','道琼斯','标普500','纳斯达克', + '四周均值','名占比','残差','DMA', + '连7-连9','4周平均','4周均值','滚动相关性','日本']): + return False + + # 检查需要的特征 + # 去掉 分析 分类下的数据 + if ClassifyName == '分析': + return False + + # 保留 库存中特殊关键词 + if ClassifyName == '库存': + if any(keyword in data for keyword in ['原油' , '美国' ,'全球' ,'中国' ,'富查伊拉','ARA' ]): + return True + else: + pass + else: + pass + + # 去掉 持仓中不是基金的数据 + if ClassifyName == '持仓': + if '基金' not in data: + return False + else: + pass + else: + pass + + # 去掉 航班中不是中国、美国 的数据 + if ClassifyName == '需求': + if '航班' in data : + if '中国' in data or '美国' in data : + return True + else: + return False + else: + pass + else: + pass + + # 分类为 期货市场,同质性数据取第一个 + if ClassifyName == '期货市场': + # 去掉c1-9 以后的 + if 'c1-c' in data: + try: + c = int(data.split('c1-c')[1]) + except: + return False + if c > 9 : + return False + else: + pass + + else: + pass + + # 判断 同质性数据, 字符串开头 + strstartdict = {'ICE Brent c':"ICE Brent c14", + 'NYMWX WTI c':"NYMWX WTI c5", + 'INE SC c':"INE SC c1", + 'EFS c':"EFS c", + 'Dubai Swap c':"Dubai Swap c1", + 'Oman Swap c':"Oman Swap c1", + 'DME Oman c':"DME Oman c1", + 'Murban Futures c':"Murban Futures c1", + 'Dubai连合约价格':'Dubai连1合约价格', + '美国RBOB期货月份合约价格':'美国RBOB期货2309月份合约价格', + 'Brent连合约价格':'Brent连1合约价格', + 'WTI连合约价格':'WTI连1合约价格', + '布伦特连合约价格':'Brent连1合约价格', + 'Brent 连合约价格':'Brent连1合约价格', + 'Dubai连合约价格':'Dubai连1合约价格', + 'Brent连':'Brent连1合约价格', + 'brent连':'Brent连1合约价格', + } + # 判断名称字符串开头是否在 strstartdict.keys中 + match = re.match(r'([a-zA-Z\s]+)(\d+)', data) + if match: + part1 = match.group(1) + part2 = match.group(2) + if part1 in [i for i in strstartdict.keys()]: + if data == strstartdict[part1]: + return True + else: + return False + # data = 'Brent 连7合约价格' + # 判断名称字符串去掉数字后是否在 strstartdict.keys中 + match = re.findall(r'\D+', data) + if match : + if len(match) == 2: + part1 = match[0] + part2 = match[1] + if part1+part2 in [i for i in strstartdict.keys()]: + if data == strstartdict[part1+part2]: + return True + else: + return False + else: + pass + elif len(match) == 1: + match = re.findall(r'\D+', data) + part1 = match[0] + + if part1 in [i for i in strstartdict.keys()]: + if data == strstartdict[part1]: + return True + else: + return False + else: + pass + else: + pass + + return True + + def filter_pp_data(self,ClassifyName,data): + ''' + 指标名称保留规则 + ''' + + # 包含 关键词 去除, 返回flase + # if any(keyword in data for keyword in ['运费','检修','波动率','地缘政治','股价', + # '同比','环比','环差','裂差','4WMA','变频','道琼斯','标普500','纳斯达克', + # '四周均值','名占比','残差','DMA', + # '连7-连9','4周平均','4周均值','滚动相关性','日本']): + # return False + # 包含 关键词 保留, 返回True + if any(keyword in data for keyword in ['拉丝']): + return True + + + + # 检查需要的特征 + # 去掉 期货市场 分类下的数据 + if ClassifyName == '期货市场': + return False + else: + pass + + # 保留 库存 下所有指标 + if ClassifyName == '库存': + return True + else: + pass + + # 保留 进出口 下所有指标 + if ClassifyName == '进出口': + return True + else: + pass + + # 保留 价差 下所有指标 + if ClassifyName == '价差': + return True + else: + pass + + # 保留 供应 下所有指标 + if ClassifyName == '供应': + return True + else: + pass + + + # 保留 需求 下所有指标 + if ClassifyName == '需求': + return True + else: + pass + + + return True + + # 通过edbcode 获取指标数据 + def edbcodegetdata(self,df,EdbCode,EdbName): + # 根据指标id,获取指标数据 + url = self.edbcodedataurl+str(EdbCode) + # 发送GET请求 + response = requests.get(url, 
headers=self.headers) + + # 检查响应状态码 + if response.status_code == 200: + data = response.json() # 假设接口返回的是JSON数据 + all_data_items = data.get('Data') + # 列表转换为DataFrame + df3 = pd.DataFrame(all_data_items, columns=['DataTime', 'Value', 'UpdateTime']) + # df3 = pd.read_json(all_data_items, orient='records') + + # 去掉UpdateTime 列 + df3 = df3.drop(columns=['UpdateTime']) + # df3.set_index('DataTime') + df3.rename(columns={'Value': EdbName}, inplace=True) + # 将数据存储df1 + df = pd.merge(df, df3, how='outer',on='DataTime',suffixes= ('', '_y')) + return df + + else: + # 请求失败,打印错误信息 + logger.info(f'Error: {response.status_code}, {response.text}') + # 主动抛出异常 + raise Exception(f'Error: {response.status_code}, {response.text}') + + def get_eta_api_yuanyou_data(self,data_set,dataset=''): + + today = datetime.date.today().strftime("%Y-%m-%d") + + # 定义你的headers,这里可以包含多个参数 + self.headers = { + 'nonce': self.signature.nonce, # 例如,一个认证令牌 + 'timestamp': str(self.signature.timestamp), # 自定义的header参数 + 'appid': self.signature.APPID, # 另一个自定义的header参数 + 'signature': self.signature.signature + } + + # 从列表数据中获取指标名称,判断指标名称频度是否为日 ,如果是,则获取UniqueCode,然后获取指标数据,保存到xlat文件中的sheet表。 + + ''' + df = sheetname 指标列表,存储 指标分类-指标名称-指标id-频度 + df1 = sheetname 指标数据 ,存储 时间-指标名称1-指标名称2... + + ''' + + # 构建新的DataFrame df df1 + df = pd.DataFrame(columns=['指标分类', '指标名称', '指标id', '频度']) + df1 = pd.DataFrame(columns=['DataTime']) + + + # 外网环境无法访问,请确认是否为内网环境 + try: + # 发送GET请求 获取指标分类列表 + response = requests.get(self.classifylisturl, headers=self.headers) + except requests.exceptions.RequestException as e: + raise Exception(f"请求失败,请确认是否为内网环境: {e}","\033[0m") + + # 检查响应状态码 + if response.status_code == 200: + # 获取成功, 处理响应内容 + data = response.json() # 假设接口返回的是JSON数据 + + # 请求成功,处理响应内容 + # logger.info(data.get('Data')) + # 定义你想要保留的固定值 + fixed_value = 1214 + + # 遍历列表,只保留那些'category' key的值为固定值的数据项 + filtered_data = [item for item in data.get('Data') if item.get('ParentId') == fixed_value] + + #然后循环filtered_data去获取list数据,才能获取到想要获取的ClassifyId + n = 0 + for item in filtered_data: + n+= 1 + # if n>50: + # break + ClassifyId = item["ClassifyId"] #分类id,分类下的指标列表接口的请求参数 + ClassifyName = item["ClassifyName"] #分类名称,要保存到df的指标分类列 + # 根据分类id,获取指标列表 + url = self.classifyidlisturl+str(ClassifyId) + response = requests.get(url, headers=self.headers) + if response.status_code == 200: + # logger.info(response.text) + data2 = response.json() + Data = data2.get('Data') + for i in Data: + # s+= 1 + EdbCode = i.get('EdbCode') + EdbName = i.get('EdbName') # 指标名称,要保存到df2的指标名称列,df的指标名称列 + Frequency = i.get('Frequency') # 频度,要保存到df的频度列 + # 频度不是 日 或者 周的 跳过 + if Frequency not in ['日度','周度','日','周']: + continue + + # 判断名称是否需要保存 + isSave = self.filter_yuanyou_data(ClassifyName,EdbName) + if isSave: + # 保存到df + # 保存频度 指标名称 分类 指标id 到 df + df2 = pd.DataFrame({'指标分类': ClassifyName, '指标名称': EdbName, '指标id': EdbCode, '频度': Frequency},index=[0]) + + # df = pd.merge(df, df2, how='outer') + df = pd.concat([df, df2]) + df1 = self.edbcodegetdata(df1,EdbCode,EdbName) + else: + logger.info(f'跳过指标 {EdbName}') + + # 找到列表中不在指标列中的指标id,保存成新的list + new_list = [item for item in self.edbcodelist if item not in df['指标id'].tolist()] + logger.info(new_list) + # 遍历new_list,获取指标数据,保存到df1 + for item in new_list: + logger.info(item) + # 将item 加入到 df['指标id']中 + try: + itemname = edbcodenamedict[item] + except: + itemname = item + + df1 = self.edbcodegetdata(df1,item,itemname) + df = pd.concat([df, pd.DataFrame({'指标分类': '其他', '指标名称': itemname, '指标id': item, '频度': '其他'},index=[0])]) + + # 按时间排序 + 
df1.sort_values('DataTime',inplace=True,ascending=False) + df1.rename(columns={'DataTime': 'date'},inplace=True) + # df1.dropna(inplace=True) + # 去掉大于今天日期的行 + df1 = df1[df1['date'] <= datetime.datetime.now().strftime('%Y-%m-%d')] + logger.info(df1.head()) + # logger.info(f'{df1.head()}') + # 保存到xlsx文件的sheet表 + with pd.ExcelWriter(os.path.join(dataset,data_set)) as file: + df1.to_excel(file, sheet_name='指标数据', index=False) + df.to_excel(file, sheet_name='指标列表', index=False) + + df_zhibiaoshuju = df1.copy() + df_zhibiaoliebiao = df.copy() + return df_zhibiaoshuju,df_zhibiaoliebiao + + def get_eta_api_pp_data(self,data_set,dataset=''): + global ClassifyId + today = datetime.date.today().strftime("%Y-%m-%d") + + # 定义你的headers,这里可以包含多个参数 + self.headers = { + 'nonce': self.signature.nonce, # 例如,一个认证令牌 + 'timestamp': str(self.signature.timestamp), # 自定义的header参数 + 'appid': self.signature.APPID, # 另一个自定义的header参数 + 'signature': self.signature.signature + } + + # 从列表数据中获取指标名称,判断指标名称频度是否为日 ,如果是,则获取UniqueCode,然后获取指标数据,保存到xlat文件中的sheet表。 + + ''' + df = sheetname 指标列表,存储 指标分类-指标名称-指标id-频度 + df1 = sheetname 指标数据 ,存储 时间-指标名称1-指标名称2... + + ''' + + # 构建新的DataFrame df df1 + df = pd.DataFrame(columns=['指标分类', '指标名称', '指标id', '频度']) + df1 = pd.DataFrame(columns=['DataTime']) + + + # 外网环境无法访问,请确认是否为内网环境 + try: + # 发送GET请求 获取指标分类列表 + response = requests.get(self.classifylisturl, headers=self.headers) + except requests.exceptions.RequestException as e: + raise Exception(f"请求失败,请确认是否为内网环境: {e}","\033[0m") + + # 检查响应状态码 + if response.status_code == 200: + # 获取成功, 处理响应内容 + data = response.json() # 假设接口返回的是JSON数据 + + # 请求成功,处理响应内容 + # logger.info(data.get('Data')) + # 定义你想要保留的固定值 + fixed_value = ClassifyId + + # 遍历列表,只保留那些'category' key的值为固定值的数据项 + filtered_data = [item for item in data.get('Data') if item.get('ParentId') == fixed_value] + + #然后循环filtered_data去获取list数据,才能获取到想要获取的ClassifyId + n = 0 + for item in filtered_data: + n+= 1 + # if n>50: + # break + ClassifyId = item["ClassifyId"] #分类id,分类下的指标列表接口的请求参数 + ClassifyName = item["ClassifyName"] #分类名称,要保存到df的指标分类列 + # 根据分类id,获取指标列表 + url = self.classifyidlisturl+str(ClassifyId) + response = requests.get(url, headers=self.headers) + if response.status_code == 200: + # logger.info(response.text) + data2 = response.json() + Data = data2.get('Data') + for i in Data: + # s+= 1 + EdbCode = i.get('EdbCode') + EdbName = i.get('EdbName') # 指标名称,要保存到df2的指标名称列,df的指标名称列 + Frequency = i.get('Frequency') # 频度,要保存到df的频度列 + # 频度不是 日 或者 周的 跳过 + if Frequency not in ['日度','周度','日','周']: + continue + + # 判断名称是否需要保存 + isSave = self.filter_pp_data(ClassifyName,EdbName) + if isSave: + # 保存到df + # 保存频度 指标名称 分类 指标id 到 df + df2 = pd.DataFrame({'指标分类': ClassifyName, '指标名称': EdbName, '指标id': EdbCode, '频度': Frequency},index=[0]) + + # df = pd.merge(df, df2, how='outer') + df = pd.concat([df, df2]) + df1 = self.edbcodegetdata(df1,EdbCode,EdbName) + else: + logger.info(f'跳过指标 {EdbName}') + + # 找到列表中不在指标列中的指标id,保存成新的list + new_list = [item for item in self.edbcodelist if item not in df['指标id'].tolist()] + logger.info(new_list) + # 遍历new_list,获取指标数据,保存到df1 + for item in new_list: + logger.info(item) + # 将item 加入到 df['指标id']中 + try: + itemname = edbcodenamedict[item] + except: + itemname = item + + df1 = self.edbcodegetdata(df1,item,itemname) + df = pd.concat([df, pd.DataFrame({'指标分类': '其他', '指标名称': itemname, '指标id': item, '频度': '其他'},index=[0])]) + + # 按时间排序 + df1.sort_values('DataTime',inplace=True,ascending=False) + df1.rename(columns={'DataTime': 'date'},inplace=True) + # 
df1.dropna(inplace=True) + # 去掉大于今天日期的行 + df1 = df1[df1['date'] <= datetime.datetime.now().strftime('%Y-%m-%d')] + logger.info(df1.head()) + # logger.info(f'{df1.head()}') + # 保存到xlsx文件的sheet表 + with pd.ExcelWriter(os.path.join(dataset,data_set)) as file: + df1.to_excel(file, sheet_name='指标数据', index=False) + df.to_excel(file, sheet_name='指标列表', index=False) + + df_zhibiaoshuju = df1.copy() + df_zhibiaoliebiao = df.copy() + return df_zhibiaoshuju,df_zhibiaoliebiao + + def push_data(self,data): + + today = datetime.date.today().strftime("%Y-%m-%d") + + # 定义你的headers,这里可以包含多个参数 + self.headers = { + 'nonce': self.signature.nonce, # 例如,一个认证令牌 + 'timestamp': str(self.signature.timestamp), # 自定义的header参数 + 'appid': self.signature.APPID, # 另一个自定义的header参数 + 'signature': self.signature.signature + } + + # 发送post请求 上传数据 + logger.info('请求参数:',data) + response = requests.post(self.edbdatapushurl, headers=self.headers,data=json.dumps(data)) + + # 检查响应状态码 + if response.status_code == 200: + data = response.json() # 假设接口返回的是JSON数据 + + logger.info('上传成功,响应为:', data) + + else: + # 请求失败,打印错误信息 + logger.info(f'Error: {response.status_code}, {response.text}') + # 主动抛出异常 + raise Exception(f'Error: {response.status_code}, {response.text}') + + def del_zhibiao(self,IndexCodeList): + today = datetime.date.today().strftime("%Y-%m-%d") + + # 定义你的headers,这里可以包含多个参数 + self.headers = { + 'nonce': self.signature.nonce, # 例如,一个认证令牌 + 'timestamp': str(self.signature.timestamp), # 自定义的header参数 + 'appid': self.signature.APPID, # 另一个自定义的header参数 + 'signature': self.signature.signature + } + + data = { + "IndexCodeList": IndexCodeList #指标编码列表 + } + # 发送post请求 上传数据 + response = requests.post(self.edbdeleteurl, headers=self.headers,data=json.dumps(data)) + + + # 检查响应状态码 + if response.status_code == 200: + data = response.json() # 假设接口返回的是JSON数据 + + logger.info('删除成功,响应为:', data) + + else: + # 请求失败,打印错误信息 + logger.info(f'Error: {response.status_code}, {response.text}') + # 主动抛出异常 + raise Exception(f'Error: {response.status_code}, {response.text}') + + def del_business(self,data): + '''' + 接口地址 + https://console-docs.apipost.cn/preview/fce869601d0be1d9/9a637c2f9ed0c589?target_id=d3cafcbf-a68c-42b3-b105-7bbd0e95a9cd + + 请求体 body + { + "IndexCode": "W001067", //指标编码 + "StartDate": "2020-04-20", //指标需要删除的开始日期(>=),如果开始日期和结束日期相等,那么就是删除该日期 + "EndDate": "2024-05-28" //指标需要删除的结束日期(<=),如果开始日期和结束日期相等,那么就是删除该日期 + } + ''' + today = datetime.date.today().strftime("%Y-%m-%d") + + # 定义你的headers,这里可以包含多个参数 + self.headers = { + 'nonce': self.signature.nonce, # 例如,一个认证令牌 + 'timestamp': str(self.signature.timestamp), # 自定义的header参数 + 'appid': self.signature.APPID, # 另一个自定义的header参数 + 'signature': self.signature.signature + } + + + # 发送post请求 上传数据 + response = requests.post(self.edbbusinessurl, headers=self.headers,data=json.dumps(data)) + + + # 检查响应状态码 + if response.status_code == 200: + data = response.json() # 假设接口返回的是JSON数据 + + logger.info('删除成功,响应为:', data) + + else: + # 请求失败,打印错误信息 + logger.info(f'Error: {response.status_code}, {response.text}') + # 主动抛出异常 + raise Exception(f'Error: {response.status_code}, {response.text}') + + +# 时间特征,年,月,一年的多少天,周几,第几周,第几季度,每月的第几天, 每季度的第几天,是否每月的第一天,是否每月的最后一天,是否每季度的第一天,是否每季度的最后一天,是否每年的第一天,是否每年的最后一天 +def addtimecharacteristics(df,dataset): + """ + 为输入的 DataFrame 添加日期相关信息列 + + 参数: + df (pandas.DataFrame): 包含日期列 'ds' 的 DataFrame + + 返回: + pandas.DataFrame: 添加了相关列的 DataFrame + """ + df['year'] = df['ds'].dt.year + df['month'] = df['ds'].dt.month + df['day'] = df['ds'].dt.day + df['dayofweek'] = 
df['ds'].dt.dayofweek + df['weekofyear'] = df['ds'].dt.isocalendar().week + df['dayofyear'] = df['ds'].dt.dayofyear + df['quarternum'] = df['ds'].dt.quarter + # 将ds列转换为季度Period对象 + df['quarter'] = df['ds'].dt.to_period('Q') + # 获取每个季度的开始日期 + df['quarter_start'] = df['quarter'].dt.to_timestamp('s') + # 计算每个日期是所在季度的第几天 + df['dayofquarter'] = (df['ds'] - df['quarter_start']).dt.days + 1 + # 是否月初 + df['is_month_start'] = df['ds'].dt.is_month_start.astype(int) + # 是否月末 + df['is_month_end'] = df['ds'].dt.is_month_end.astype(int) + # 是否季度初 + df['is_quarter_start'] = df['ds'].dt.is_quarter_start.astype(int) + # 是否季度末 + df['is_quarter_end'] = df['ds'].dt.is_quarter_end.astype(int) + # 是否年初 + df['is_year_start'] = df['ds'].dt.is_year_start.astype(int) + # 是否年末 + df['is_year_end'] = df['ds'].dt.is_year_end.astype(int) + # 去掉 quarter_start quarter + df.drop(columns=['quarter_start','quarter'],inplace=True) + df.to_csv(os.path.join(dataset,'指标数据添加时间特征.csv'), index=False) + return df diff --git a/lib/duojinchengpredict.py b/lib/duojinchengpredict.py new file mode 100644 index 0000000..65c40c9 --- /dev/null +++ b/lib/duojinchengpredict.py @@ -0,0 +1,191 @@ +import pandas as pd +import re +import os +import pandas as pd + +import multiprocessing +import time +import joblib +import torch + +# 定义函数 +def loadcsv(filename): + try: + df = pd.read_csv(filename, encoding='utf-8') + except UnicodeDecodeError: + df = pd.read_csv(filename, encoding='gbk') + return df + + +def datachuli(df, datecol='date'): + # 删除空列 + df = df.dropna(axis=1, how='all') + # 向上填充 + df.ffill + # 向下填充 + df.bfill + # date转为pddate + df.rename(columns={datecol: 'ds'}, inplace=True) + # 设置ds为pd.datetime + df['ds'] = pd.to_datetime(df['ds']) + # 重命名预测列 + df.rename(columns={'Brent连1合约价格': 'y'}, inplace=True) + + return df + + +def getdata(filename, datecol='date'): + df = loadcsv(filename) + df = datachuli(df, datecol) + return df + + + + + +# 预测函数 +def predict(X_test, nf,result_list): + df_predict = nf.predict(X_test).reset_index() + result_list.append(df_predict.values.tolist()) + return df_predict + + +def testSetPredict(X_test, nf, columns,dataset): + + # 记录开始时间 + start_time = time.time() + + # 计算每个进程处理的样本数 + num_samples = len(X_test) + num_processes = multiprocessing.cpu_count() + samples_per_process = num_samples // num_processes + + manager = multiprocessing.Manager() + result_list = manager.list() # 创建共享的列表 + # 创建进程池 + with multiprocessing.Pool(num_processes) as pool: + processes = [] + for i in range(num_processes): + # 计算 每个进程需要处理的数据索引 + start_index = i * samples_per_process + end_index = (i + 1) * samples_per_process if i != num_processes - 1 else num_samples + # 按计算的索引切分数据 + X_test_split = X_test[start_index:end_index] + # 添加任务到进程池 + for X in X_test_split: + processes.append(pool.apply_async(predict, args=(X, nf,result_list))) + for process in processes: + process.get() + # 将共享列表中的数据转换回 DataFrame + df_combined = pd.DataFrame() + df_combined2 = pd.DataFrame() + for result in result_list: + try: + df_shared = pd.DataFrame(result, columns=['index', 'ds'] + columns) + df_combined = pd.concat([df_combined, df_shared]).reset_index(drop=True) + except ValueError: + # 如果数据不匹配,就放到另一个 DataFrame 中 + df_shared2 = pd.DataFrame(result, columns=['index', 'ds']+ columns2) + df_combined2 = pd.concat([df_combined2, df_shared2]).reset_index(drop=True) + # df_combined.drop(['index'], axis=1, inplace=True) + df_combined.to_csv(os.path.join(dataset, 'df_combined.csv'), index=False) + # df_combined2.drop(['index'], axis=1, inplace=True) + 
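# Note: df_combined2 collects the prediction batches whose column layout matches
# columns2 (DeepAR-style output with extra quantile columns), kept apart from
# df_combined, which holds the one-column-per-model results; it is saved to the
# working directory rather than the dataset folder.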
df_combined2.to_csv('df_combined.csv', index=False) + end_time = time.time() + # 打印运行时间,转为时分秒 + print("运行时间:", end_time - start_time, "秒") + + +if __name__ == '__main__': + # 记录开始时间 + start_time = time.time() + + + # file = '指标数据处理.csv' + file = 'brentpricepredict.csv' + df = getdata(file) + df.head() + + # 选择特征和标签列 + X = df.drop(['y', 'ds'], axis=1) # 特征集,排除时间戳和标签列 Brent连1合约价格 + y = df['y'] # 标签集 + + # 计算训练集的结束索引,占总数据的80% + split_index = int(0.8 * df.shape[0]) + + # 按照时间顺序划分训练集和测试集 + df_train = df[:split_index] + df_test = df[split_index:] + df_train['unique_id'] = 1 + df_test['unique_id'] = 1 + + df_combined = pd.DataFrame() + df_test = df_test.reindex() + # df_test = df_test[-20:] + + # 读取模型列表,用来预测结果列名 + columns = [ + 'NHITS', + 'Informer', + 'LSTM', + 'iTransformer', + 'TSMixer', + 'TSMixerx', + 'PatchTST', + 'RNN', + 'GRU', + 'TCN', + 'DeepAR', + 'BiTCN', + 'DilatedRNN', + 'MLP', + 'DLinear', + 'NLinear', + 'TFT', + 'FEDformer', + 'StemGNN', + 'MLPMultivariate', + 'TiDE', + 'DeepNPTS', + ] + + # deepar 的预测结果会多 五个列,需要单独处理 + columns2 = [ + 'NHITS', + 'Informer', + 'LSTM', + 'iTransformer', + 'TSMixer', + 'TSMixerx', + 'PatchTST', + 'RNN', + 'GRU', + 'TCN', + 'DeepAR', + 'DeepAR-median', + 'DeepAR-lo-90', + 'DeepAR-lo-80', + 'DeepAR-hi-80', + 'DeepAR-hi-90', + 'BiTCN', + 'DilatedRNN', + 'MLP', + 'DLinear', + 'NLinear', + 'TFT', + 'FEDformer', + 'StemGNN', + 'MLPMultivariate', + 'TiDE', + 'DeepNPT', + ] + + + input_size = 14 + X_test = [] + for i in range(0, len(df_test) - input_size + 1): + X_test.append(df_test.iloc[i:i + input_size]) + + nf = joblib.load('model_reg.joblib') + + testSetPredict(X_test, nf, columns) \ No newline at end of file diff --git a/lib/tools.py b/lib/tools.py new file mode 100644 index 0000000..61024d2 --- /dev/null +++ b/lib/tools.py @@ -0,0 +1,448 @@ +import time +import os +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from config_jingbo import logger +from sklearn import metrics +import random, string, base64, hmac, hashlib +from reportlab.pdfbase import pdfmetrics # 注册字体 +from reportlab.pdfbase.ttfonts import TTFont # 字体类 +from reportlab.platypus import Table, SimpleDocTemplate, Paragraph, Image # 报告内容相关类 +from reportlab.lib.pagesizes import letter # 页面的标志尺寸(8.5*inch, 11*inch) +from reportlab.lib.styles import getSampleStyleSheet # 文本样式 +from reportlab.lib import colors # 颜色模块 +from reportlab.graphics.charts.barcharts import VerticalBarChart # 图表类 +from reportlab.graphics.charts.legends import Legend # 图例类 +from reportlab.graphics.shapes import Drawing # 绘图工具 +from reportlab.lib.units import cm # 单位:cm +import smtplib +from email.mime.text import MIMEText +from email.mime.multipart import MIMEMultipart +import sqlite3 +import tkinter as tk +from tkinter import messagebox + + +def timeit(func): + '''计时装饰器''' + def wrapper(*args, **kwargs): + start_time = time.time() + result = func(*args, **kwargs) + end_time = time.time() + execution_time = end_time - start_time + logger.info(f"{func.__name__} 函数的执行时间为: {execution_time} 秒") + return result + return wrapper + +class BinanceAPI: + ''' + 获取 Binance API 请求头签名 + ''' + def __init__(self, APPID, SECRET): + self.APPID = APPID + self.SECRET = SECRET + self.get_signature() + + # 生成随机字符串作为 nonce + def generate_nonce(self, length=32): + self.nonce = ''.join(random.choices(string.ascii_letters + string.digits, k=length)) + return self.nonce + + # 获取当前时间戳(秒) + def get_timestamp(self): + return int(time.time()) + + # 构建待签名字符串 + def build_sign_str(self): + return 
f'appid={self.APPID}&nonce={self.nonce}×tamp={self.timestamp}' + + # 使用 HMAC SHA-256 计算签名 + def calculate_signature(self, secret, message): + return base64.urlsafe_b64encode(hmac.new(secret.encode('utf-8'), message.encode('utf-8'), hashlib.sha256).digest()).decode('utf-8') + + def get_signature(self): + # 调用上述方法生成签名 + self.nonce = self.generate_nonce() + self.timestamp = self.get_timestamp() + self.sign_str = self.build_sign_str() + self.signature = self.calculate_signature(self.SECRET, self.sign_str) + # return self.signature + + +class Graphs: + ''' + pdf生成类 + ''' + # 绘制标题 + @staticmethod + def draw_title(title: str): + # 获取所有样式表 + style = getSampleStyleSheet() + # 拿到标题样式 + ct = style['Heading1'] + # 单独设置样式相关属性 + ct.fontName = 'SimSun' # 字体名 + ct.fontSize = 18 # 字体大小 + ct.leading = 50 # 行间距 + ct.textColor = colors.green # 字体颜色 + ct.alignment = 1 # 居中 + ct.bold = True + # 创建标题对应的段落,并且返回 + return Paragraph(title, ct) + + # 绘制小标题 + @staticmethod + def draw_little_title(title: str): + # 获取所有样式表 + style = getSampleStyleSheet() + # 拿到标题样式 + ct = style['Normal'] + # 单独设置样式相关属性 + ct.fontName = 'SimSun' # 字体名 + ct.fontSize = 15 # 字体大小 + ct.leading = 30 # 行间距 + ct.textColor = colors.red # 字体颜色 + # 创建标题对应的段落,并且返回 + return Paragraph(title, ct) + + # 绘制普通段落内容 + @staticmethod + def draw_text(text: str): + # 获取所有样式表 + style = getSampleStyleSheet() + # 获取普通样式 + ct = style['Normal'] + ct.fontName = 'SimSun' + ct.fontSize = 12 + ct.wordWrap = 'CJK' # 设置自动换行 + ct.alignment = 0 # 左对齐 + ct.firstLineIndent = 32 # 第一行开头空格 + ct.leading = 25 + return Paragraph(text, ct) + + # 绘制表格 + @staticmethod + def draw_table(col_width,*args): + # 列宽度 + col_width = col_width + style = [ + ('FONTNAME', (0, 0), (-1, -1), 'SimSun'), # 字体 + ('FONTSIZE', (0, 0), (-1, 0), 10), # 第一行的字体大小 + ('FONTSIZE', (0, 1), (-1, -1), 8), # 第二行到最后一行的字体大小 + ('BACKGROUND', (0, 0), (-1, 0), '#d5dae6'), # 设置第一行背景颜色 + ('ALIGN', (0, 0), (-1, -1), 'CENTER'), # 第一行水平居中 + ('ALIGN', (0, 1), (-1, -1), 'LEFT'), # 第二行到最后一行左右左对齐 + ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'), # 所有表格上下居中对齐 + ('TEXTCOLOR', (0, 0), (-1, -1), colors.darkslategray), # 设置表格内文字颜色 + ('GRID', (0, 0), (-1, -1), 0.5, colors.grey), # 设置表格框线为grey色,线宽为0.5 + # ('SPAN', (0, 1), (0, 2)), # 合并第一列二三行 + # ('SPAN', (0, 3), (0, 4)), # 合并第一列三四行 + # ('SPAN', (0, 5), (0, 6)), # 合并第一列五六行 + # ('SPAN', (0, 7), (0, 8)), # 合并第一列五六行 + ] + table = Table(args, colWidths=col_width, style=style) + return table + + # 创建图表 + @staticmethod + def draw_bar(bar_data: list, ax: list, items: list): + drawing = Drawing(500, 250) + bc = VerticalBarChart() + bc.x = 45 # 整个图表的x坐标 + bc.y = 45 # 整个图表的y坐标 + bc.height = 200 # 图表的高度 + bc.width = 350 # 图表的宽度 + bc.data = bar_data + bc.strokeColor = colors.black # 顶部和右边轴线的颜色 + bc.valueAxis.valueMin = 5000 # 设置y坐标的最小值 + bc.valueAxis.valueMax = 26000 # 设置y坐标的最大值 + bc.valueAxis.valueStep = 2000 # 设置y坐标的步长 + bc.categoryAxis.labels.dx = 2 + bc.categoryAxis.labels.dy = -8 + bc.categoryAxis.labels.angle = 20 + bc.categoryAxis.categoryNames = ax + + # 图示 + leg = Legend() + leg.fontName = 'SimSun' + leg.alignment = 'right' + leg.boxAnchor = 'ne' + leg.x = 475 # 图例的x坐标 + leg.y = 240 + leg.dxTextSpace = 10 + leg.columnMaximum = 3 + leg.colorNamePairs = items + drawing.add(leg) + drawing.add(bc) + return drawing + + # 绘制图片 + @staticmethod + def draw_img(path): + img = Image(path) # 读取指定路径下的图片 + img.drawWidth = 20*cm # 设置图片的宽度 + img.drawHeight = 10*cm # 设置图片的高度 + return img + + +# 评估指标不在一个库,这里列出所有用到的指标的公式 + +# MSE +def mse(y_true, y_pred): + + res_mse = metrics.mean_squared_error(y_true, y_pred) + 
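# MSE = mean((y_true - y_pred) ** 2); RMSE, MAE, MAPE and SMAPE below follow the
# standard formulas, with MAPE/SMAPE implemented by hand.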
+ return res_mse +# RMSE +def rmse(y_true, y_pred): + + res_rmse = np.sqrt(metrics.mean_squared_error(y_true, y_pred)) + + return res_rmse + +# MAE +def mae(y_true, y_pred): + + res_mae = metrics.mean_absolute_error(y_true, y_pred) + + return res_mae + +# sklearn的库中没有MAPE和SMAPE,下面根据公式给出算法实现 +# MAPE +def mape(y_true, y_pred): + + res_mape = np.mean(np.abs((y_pred - y_true) / y_true)) * 100 + + return res_mape + +# SMAPE +def smape(y_true, y_pred): + + res_smape = 2.0 * np.mean(np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true))) * 100 + + return res_smape + +# 相关系数绘制 +def plot_corr(data, size=11): + # 去掉ds列 + data.drop(columns=['ds'], inplace=True) + + # 创建一个空的 DataFrame 来保存相关系数 + correlation_df = pd.DataFrame(columns=['Feature', 'Correlation']) + + # 计算各特征与目标列的皮尔逊相关系数,并保存到新的 DataFrame 中 + for col in data.columns: + if col!= 'y': + pearson_correlation = np.corrcoef(data[col], data['y'])[0, 1] + spearman_correlation, _ = spearmanr(data[col], data['y']) + new_row = {'Feature': col, 'Pearson_Correlation': round(pearson_correlation,3), 'Spearman_Correlation': round(spearman_correlation,2)} + correlation_df = correlation_df._append(new_row, ignore_index=True) + # 删除空列 + correlation_df.drop('Correlation', axis=1, inplace=True) + correlation_df.dropna(inplace=True) + correlation_df.to_csv('指标相关性分析.csv', index=False) + + data = correlation_df['Pearson_Correlation'].values.tolist() + # 生成 -1 到 1 的 20 个区间 + bins = np.linspace(-1, 1, 21) + # 计算每个区间的统计数(这里是区间内数据的数量) + hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)] + + #设置画布大小 + plt.figure(figsize=(10, 6)) + # 绘制直方图 + plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0])) + + # 添加标题和坐标轴标签 + plt.title('皮尔逊相关系数分布图') + plt.xlabel('区间') + plt.ylabel('统计数') + plt.savefig('皮尔逊相关性系数.png') + plt.close() + + + #设置画布大小 + plt.figure(figsize=(10, 6)) + data = correlation_df['Spearman_Correlation'].values.tolist() + # 计算每个区间的统计数(这里是区间内数据的数量) + hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)] + + # 绘制直方图 + plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0])) + + # 添加标题和坐标轴标签 + plt.title('斯皮尔曼相关系数分布图') + plt.xlabel('区间') + plt.ylabel('统计数') + plt.savefig('斯皮尔曼相关性系数.png') + plt.close() + + +# 邮件封装 +class SendMail(object): + def __init__(self,username,passwd,recv,title,content, + file=None,ssl=False, + email_host='smtp.qq.com',port=25,ssl_port=465): + ''' + :param username: 用户名 + :param passwd: 密码 + :param recv: 收件人,多个要传list ['a@qq.com','b@qq.com] + :param title: 邮件标题 + :param content: 邮件正文 + :param file: 附件路径,如果不在当前目录下,要写绝对路径,默认没有附件 + :param ssl: 是否安全链接,默认为普通 + :param email_host: smtp服务器地址,默认为163服务器 + :param port: 非安全链接端口,默认为25 + :param ssl_port: 安全链接端口,默认为465 + ''' + self.username = username #用户名 + self.passwd = passwd #密码 + self.recv = recv #收件人,多个要传list ['a@qq.com','b@qq.com] + self.title = title #邮件标题 + self.content = content #邮件正文 + self.file = file #附件路径,如果不在当前目录下,要写绝对路径 + self.email_host = email_host #smtp服务器地址 + self.port = port #普通端口 + self.ssl = ssl #是否安全链接 + self.ssl_port = ssl_port #安全链接端口 + def send_mail(self): + msg = MIMEMultipart() + #发送内容的对象 + if self.file:#处理附件的 + file_name = os.path.split(self.file)[-1]#只取文件名,不取路径 + try: + f = open(self.file, 'rb').read() + except Exception as e: + raise Exception('附件打不开!!!!') + else: + att = MIMEText(f,"base64", "utf-8") + att["Content-Type"] = 'application/octet-stream' + #base64.b64encode(file_name.encode()).decode() + new_file_name='=?utf-8?b?' 
+ base64.b64encode(file_name.encode()).decode() + '?=' + #这里是处理文件名为中文名的,必须这么写 + att["Content-Disposition"] = 'attachment; filename="%s"'%(new_file_name) + msg.attach(att) + msg.attach(MIMEText(self.content))#邮件正文的内容 + msg['Subject'] = self.title # 邮件主题 + msg['From'] = self.username # 发送者账号 + msg['To'] = ','.join(self.recv) # 接收者账号列表 + if self.ssl: + self.smtp = smtplib.SMTP_SSL(self.email_host,port=self.ssl_port) + else: + self.smtp = smtplib.SMTP(self.email_host,port=self.port) + #发送邮件服务器的对象 + self.smtp.login(self.username,self.passwd) + try: + self.smtp.sendmail(self.username,self.recv,msg.as_string()) + pass + except Exception as e: + print('出错了。。',e) + logger.info('邮件服务出错了。。',e) + else: + print('发送成功!') + logger.info('邮件发送成功!') + self.smtp.quit() + +def dateConvert(df, datecol='ds'): + # 将date列转换为datetime类型 + try: + df[datecol] = pd.to_datetime(df[datecol],format=r'%Y-%m-%d') + except: + df[datecol] = pd.to_datetime(df[datecol],format=r'%Y/%m/%d') + return df + + +class SQLiteHandler: + def __init__(self, db_name): + self.db_name = db_name + self.connection = None + self.cursor = None + + def connect(self): + self.connection = sqlite3.connect(self.db_name) + self.cursor = self.connection.cursor() + + def close(self): + if self.connection: + self.connection.close() + self.connection = None + self.cursor = None + + def execute_query(self, query, params=None): + if params: + return self.cursor.execute(query, params) + else: + return self.cursor.execute(query) + + def commit(self): + self.connection.commit() + + def create_table(self, table_name, columns): + query = f"CREATE TABLE IF NOT EXISTS {table_name} ({columns})" + self.execute_query(query) + self.commit() + + def insert_data(self, table_name, values, columns=None): + if columns: + placeholders = ', '.join(['?'] * len(values)) + query = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES ({placeholders})" + else: + placeholders = ', '.join(['?'] * len(values)) + query = f"INSERT INTO {table_name} VALUES ({placeholders})" + self.execute_query(query, values) + self.commit() + + def select_data(self, table_name, columns=None, where_condition=None, order_by=None, limit=None): + query = f"SELECT {', '.join(columns) if columns else '*'} FROM {table_name}" + if where_condition: + query += f" WHERE {where_condition}" + if order_by: + query += f" ORDER BY {order_by}" + if limit: + query += f" LIMIT {limit}" + results = self.execute_query(query).fetchall() + if results: + headers = [description[0] for description in self.execute_query(query).description] + return pd.DataFrame(results, columns=headers) + else: + return pd.DataFrame() + + def update_data(self, table_name, set_values, where_condition): + query = f"UPDATE {table_name} SET {set_values} WHERE {where_condition}" + logger.info('更新数据sql'+ query) + self.execute_query(query) + self.commit() + + def delete_data(self, table_name, where_condition): + query = f"DELETE FROM {table_name} WHERE {where_condition}" + self.execute_query(query) + self.commit() + + def check_table_exists(self, table_name): + query = f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}'" + result = self.execute_query(query).fetchone() + return result is not None + + def add_column_if_not_exists(self, table_name, column_name, column_type): + # 查询表结构 + query = f"PRAGMA table_info({table_name})" + self.execute_query(query) + columns = [column[1] for column in self.cursor.fetchall()] + + # 判断列是否存在 + if column_name not in columns: + # 如果列不存在,则添加列 + query = f"ALTER TABLE {table_name} ADD COLUMN 
{column_name} {column_type}" + self.execute_query(query) + self.commit() + print(f"Column '{column_name}' added to table '{table_name}' successfully.") + else: + print(f"Column '{column_name}' already exists in table '{table_name}'.") + + + + +if __name__ == '__main__': + print('This is a tool, not a script.') \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..e5435f3 --- /dev/null +++ b/main.py @@ -0,0 +1,176 @@ +# 读取配置 +# from config_jingbo import * +# from config_tansuanli import * +from config_juxiting import * +from lib.dataread import * +from lib.tools import * +from models.nerulforcastmodels import ex_Model,model_losss,brent_export_pdf,tansuanli_export_pdf,pp_export_pdf + +import glob +import torch +torch.set_float32_matmul_precision("high") + +sqlitedb = SQLiteHandler(db_name) +sqlitedb.connect() + +def predict_main(): + signature = BinanceAPI(APPID, SECRET) + etadata = EtaReader(signature=signature, + classifylisturl = classifylisturl, + classifyidlisturl=classifyidlisturl, + edbcodedataurl=edbcodedataurl, + edbcodelist=edbcodelist, + edbdatapushurl=edbdatapushurl, + edbdeleteurl=edbdeleteurl, + edbbusinessurl=edbbusinessurl + ) + # 获取数据 + if is_eta: + # eta数据 + logger.info('从eta获取数据...') + signature = BinanceAPI(APPID, SECRET) + etadata = EtaReader(signature=signature, + classifylisturl = classifylisturl, + classifyidlisturl=classifyidlisturl, + edbcodedataurl=edbcodedataurl, + edbcodelist=edbcodelist, + edbdatapushurl=edbdatapushurl, + edbdeleteurl=edbdeleteurl, + edbbusinessurl=edbbusinessurl, + ) + + # df_zhibiaoshuju,df_zhibiaoliebiao = etadata.get_eta_api_yuanyou_data(data_set=data_set,dataset=dataset) # 原始数据,未处理 + df_zhibiaoshuju,df_zhibiaoliebiao = etadata.get_eta_api_pp_data(data_set=data_set,dataset=dataset) # 原始数据,未处理 + + + # 数据处理 + df = datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,y = y,dataset=dataset,add_kdj=add_kdj,is_timefurture=is_timefurture,end_time=end_time) + + else: + logger.info('读取本地数据:'+os.path.join(dataset,data_set)) + df = getdata(filename=os.path.join(dataset,data_set),y=y,dataset=dataset,add_kdj=add_kdj,is_timefurture=is_timefurture,end_time=end_time) # 原始数据,未处理 + + # 更改预测列名称 + df.rename(columns={y:'y'},inplace=True) + + if is_edbnamelist: + df = df[edbnamelist] + df.to_csv(os.path.join(dataset,'指标数据.csv'), index=False) + # 保存最新日期的y值到数据库 + # 取第一行数据存储到数据库中 + first_row = df[['ds','y']].tail(1) + # 将最新真实值保存到数据库 + if not sqlitedb.check_table_exists('trueandpredict'): + first_row.to_sql('trueandpredict',sqlitedb.connection,index=False) + else: + for row in first_row.itertuples(index=False): + row_dict = row._asdict() + check_query = sqlitedb.select_data('trueandpredict',where_condition = f"ds = '{row.ds}'") + if len(check_query) > 0: + set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()]) + sqlitedb.update_data('trueandpredict',set_clause,where_condition = f"ds = '{row.ds}'") + continue + sqlitedb.insert_data('trueandpredict',tuple(row_dict.values()),columns=row_dict.keys()) + + import datetime + # 判断当前日期是不是周一 + is_weekday = datetime.datetime.now().weekday() == 3 + if is_weekday: + logger.info('今天是周一,更新预测模型') + # 计算最近20天预测残差最低的模型名称 + + model_results = sqlitedb.select_data('trueandpredict',order_by = "ds DESC",limit = "20") + model_results = model_results.dropna() + modelnames = model_results.columns.to_list()[2:] + for col in model_results[modelnames].select_dtypes(include=['object']).columns: + model_results[col] = model_results[col].astype(np.float32) + # 计算每个预测值与真实值之间的偏差率 + for model 
in modelnames: + model_results[f'{model}_abs_error_rate'] = abs(model_results['y'] - model_results[model]) / model_results['y'] + + # 获取每行对应的最小偏差率值 + min_abs_error_rate_values = model_results.apply(lambda row: row[[f'{model}_abs_error_rate' for model in modelnames]].min(), axis=1) + # 获取每行对应的最小偏差率值对应的列名 + min_abs_error_rate_column_name = model_results.apply(lambda row: row[[f'{model}_abs_error_rate' for model in modelnames]].idxmin(), axis=1) + # 将列名索引转换为列名 + min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0]) + # 取出现次数最多的模型名称 + most_common_model = min_abs_error_rate_column_name.value_counts().idxmax() + logger.info(f"最近20天预测残差最低的模型名称:{most_common_model}") + + # 保存结果到数据库 + + if not sqlitedb.check_table_exists('most_model'): + sqlitedb.create_table('most_model',columns="ds datetime, most_common_model TEXT") + sqlitedb.insert_data('most_model',(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),most_common_model,),columns=('ds','most_common_model',)) + + + + + + + if is_corr: + df = corr_feature(df=df) + + df1 = df.copy() # 备份一下,后面特征筛选完之后加入ds y 列用 + logger.info(f"开始训练模型...") + row,col = df.shape + + now = datetime.datetime.now().strftime('%Y%m%d%H%M%S') + # ex_Model(df, + # horizon=horizon, + # input_size=input_size, + # train_steps=train_steps, + # val_check_steps=val_check_steps, + # early_stop_patience_steps=early_stop_patience_steps, + # is_debug=is_debug, + # dataset=dataset, + # is_train=is_train, + # is_fivemodels=is_fivemodels, + # val_size=val_size, + # test_size=test_size, + # settings=settings, + # now=now, + # etadata = etadata, + # modelsindex = modelsindex, + # data = data, + # is_eta=is_eta, + # ) + + # # 模型评估 + model_results3 = model_losss(sqlitedb) + # 模型报告 + + title = f'{settings}--{now}-预测报告' # 报告标题 + brent_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time, + reportname=reportname,sqlitedb=sqlitedb), + # pp_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time, + # reportname=reportname), + logger.info('模型训练完成') + + # tansuanli_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,end_time=end_time,reportname=reportname) + + # # LSTM 单变量模型 + # ex_Lstm(df,input_seq_len=input_size,output_seq_len=horizon,is_debug=is_debug,dataset=dataset) + + # # lstm 多变量模型 + # ex_Lstm_M(df,n_days=input_size,out_days=horizon,is_debug=is_debug,datasetpath=dataset) + + # # GRU 模型 + # # ex_GRU(df) + + # 发送邮件 + m = SendMail( + username=username, + passwd=passwd, + recv=recv, + title=title, + content=content, + file=max(glob.glob(os.path.join(dataset,'*.pdf')), key=os.path.getctime), + ssl=ssl, + ) + # m.send_mail() + + +if __name__ == '__main__': + predict_main() \ No newline at end of file diff --git a/maincanshu.py b/maincanshu.py new file mode 100644 index 0000000..6029eef --- /dev/null +++ b/maincanshu.py @@ -0,0 +1,123 @@ +# 读取配置 +from config_jingbo import * +from lib.tools import * +from lib.dataread import * +from models.nerulforcastmodels import ex_Model,model_losss,brent_export_pdf +from models.lstmmodels import ex_Lstm_M,ex_Lstm +from models.grumodels import ex_GRU +import glob +import torch +torch.set_float32_matmul_precision("high") + +if __name__ == '__main__': + signature = BinanceAPI(APPID, SECRET) + + # 遍历参数训练模型 + input_size_list = [14] + horizon_list = [7] + train_steps_list = [500,1000,1500,2000] + k_list = [10,18,25,50,100] + end_time_list = ['2024-07-03'] + is_debug = False + is_fivemodels = False # 是否使用之前保存的最佳的5个模型 + delweekenday = True + # 组合上面三个参数 + for i in 
range(len(input_size_list)): + for h in range(len(horizon_list)): + for j in range(len(train_steps_list)): + for k in range(len(k_list)): + for end_time in end_time_list: + input_size = input_size_list[i] + horizons = horizon_list[h] + train_steps = train_steps_list[j] + K = k_list[k] + settings = f'{input_size}-{horizon_list[h]}-{train_steps}-{K}-{data_set}-{end_time}-{y}' + logger.info(f'当前配置:{settings}') + + # 获取数据 + if is_eta: + etadata = EtaReader(signature=signature, + classifylisturl = classifylisturl, + classifyidlisturl=classifyidlisturl, + edbcodedataurl=edbcodedataurl, + edbcodelist=edbcodelist + ) + df = etadata.get_eta_api_data(data_set=data_set,dataset=dataset) # 原始数据,未处理 + else: + filename = os.path.join(dataset,data_set) + logger.info(f'未启用Eta数据,将读取本地数据{filename}') + df = pd.read_excel(filename,sheet_name='指标数据') + + # 数据处理 + df = datachuli(df=df,dataset=dataset,end_time=end_time,y=y,delweekenday=delweekenday) + + if is_timefurture: + df = addtimecharacteristics(df=df,dataset=dataset) + + # 更改预测列名称 + df.rename(columns={y:'y'},inplace=True) + + logger.info(f"开始训练模型...") + row,col = df.shape + logger.info(f'当前配置:{settings}') + # 获取日期时间 计算今天日期 %Y-%m-%d-%H-%M-%S + from datetime import datetime + now = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + ex_Model(df, + horizon=horizon, + input_size=input_size, + train_steps=train_steps, + val_check_steps=val_check_steps, + early_stop_patience_steps=early_stop_patience_steps, + is_debug=is_debug, + dataset=dataset, + is_train=is_train, + is_fivemodels=is_fivemodels, + val_size=val_size, + test_size=test_size, + settings=settings, + now=now + ) + + # 模型评估 + model_results3 = model_losss(df,dataset=dataset,horizon=horizon) + # 模型报告 + + reportname = f'{settings}--{now}-预测报告.pdf' # 报告文件名 + reportname = reportname.replace(':', '-') # 替换冒号 + title = f'{settings}--{now}-预测报告' # 报告标题 + brent_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time, + reportname=reportname), + + # 发送邮件 + m = SendMail( + username=username, + passwd=passwd, + recv=recv, + title=title, + content=content, + file=max(glob.glob(os.path.join(dataset,reportname)), key=os.path.getctime), + ssl=ssl, + ) + # m.send_mail() + + # # LSTM 单变量模型 + # ex_Lstm(df,input_seq_len=input_size,output_seq_len=horizon,is_debug=is_debug,dataset=dataset) + + # # lstm 多变量模型 + # ex_Lstm_M(df,n_days=input_size,out_days=horizon,is_debug=is_debug,datasetpath=dataset) + + # # GRU 模型 + # # ex_GRU(df) + + # 发送邮件 + # m = SendMail( + # username=username, + # passwd=passwd, + # recv=recv, + # title=title, + # content=content, + # file=max(glob.glob(os.path.join(dataset,'*.pdf')), key=os.path.getctime), + # ssl=ssl, + # ) + # m.send_mail() \ No newline at end of file diff --git a/models/grumodels.py b/models/grumodels.py new file mode 100644 index 0000000..c1d7602 --- /dev/null +++ b/models/grumodels.py @@ -0,0 +1,164 @@ +import os +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import matplotlib as mpl + +mpl.rcParams['font.family'] = 'SimHei' # 设置字体为黑体 +import random +import string +import time +import base64 +import requests +from hashlib import sha256 +from hmac import HMAC +from math import sqrt +from sklearn.preprocessing import MinMaxScaler +from sklearn.metrics import mean_squared_error +from keras.models import Sequential +from keras.layers import GRU, Dense, Dropout +from keras.optimizers import Adam +from keras.callbacks import EarlyStopping + +# 数据获取和预处理部分 + +from sklearn.preprocessing import MinMaxScaler +import pandas as pd + +import 
datetime +import string +import base64 +import requests +import random +import time +import re +import hmac +import hashlib + + +def series_to_supervised(data, n_in=1, n_out=1, dropnan=True): + ''' + 将时间序列数据转换为监督学习数据 + :param data:数据集 + :param n_in: 输入序列长度,默认为1 + :param n_out:输出序列长度,默认为1 + :param dropnan: + :return: + ''' + n_vars = 1 if type(data) is list else data.shape[1] + df = pd.DataFrame(data) + cols, names = list(), list() + # input sequence (t-n, ... t-1) + # 将3组输入数据依次向下移动3,2,1行,将数据加入cols列表(技巧:(n_in, 0, -1)中的-1指倒序循环,步长为1) + for i in range(n_in, 0, -1): + cols.append(df.shift(i)) + names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)] + # forecast sequence (t, t+1, ... t+n) + # 将一组输出数据加入cols列表(技巧:其中i=0) + for i in range(0, n_out): + cols.append(df.shift(-i)) + if i == 0: + names += [('var%d(t)' % (j + 1)) for j in range(n_vars)] + else: + names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)] + # cols列表(list)中现在有四块经过下移后的数据(即:df(-3),df(-2),df(-1),df),将四块数据按列 并排合并 + agg = pd.concat(cols, axis=1) + # 给合并后的数据添加列名 + agg.columns = names + # 删除NaN值列 + if dropnan: + agg.dropna(inplace=True) + return agg + + +def ex_GRU(df): + dataset = df.copy() + dataset.set_index('ds', inplace=True) + values = dataset.values + + # 标准化/放缩 特征值在(0,1)之间 + scaler = MinMaxScaler(feature_range=(0, 1)) + scaled = scaler.fit_transform(values) + + # 数据准备 + n_days = 14 # 使用过去14天的数据 + n_features = scaled.shape[1] # 特征数量根据实际数据调整 + reframed = series_to_supervised(scaled, n_days, 1) + + # 划分训练集和测试集 + values = reframed.values + n_train_days = int(values.shape[0] * 0.8) + train = values[:n_train_days, :] + test = values[n_train_days:, :] + + # 输入输出数据 + n_obs = n_days * n_features + train_X, train_y = train[:, :n_obs], train[:, -n_features] + test_X, test_y = test[:, :n_obs], test[:, -n_features] + + # 输入数据重塑为 [样本数, 时间步长, 特征数] + train_X = train_X.reshape((train_X.shape[0], n_days, n_features)) + test_X = test_X.reshape((test_X.shape[0], n_days, n_features)) + + # 构造GRU模型 + model = Sequential() + model.add(GRU(50, return_sequences=True, input_shape=(n_days, n_features))) + model.add(Dropout(0.2)) + model.add(GRU(50)) + model.add(Dropout(0.2)) + model.add(Dense(1)) + + # 编译模型 + optimizer = Adam(learning_rate=0.001) + model.compile(loss='mean_squared_error', optimizer=optimizer) + + # 定义回调函数 + early_stopping = EarlyStopping(monitor='val_loss', patience=10) + + # 训练模型 + history = model.fit(train_X, train_y, epochs=100, batch_size=72, validation_data=(test_X, test_y), verbose=2, shuffle=False, callbacks=[early_stopping]) + + # 执行预测 + yhat = model.predict(test_X) + test_X = test_X.reshape((test_X.shape[0], n_days * n_features)) + + # 将预测列和真实列数据逆归一化 + inv_yhat = np.concatenate((yhat, test_X[:, -n_features+1:]), axis=1) + inv_yhat = scaler.inverse_transform(inv_yhat) + inv_yhat = inv_yhat[:, 0] + + test_y = test_y.reshape((len(test_y), 1)) + inv_y = np.concatenate((test_y, test_X[:, -n_features+1:]), axis=1) + inv_y = scaler.inverse_transform(inv_y) + inv_y = inv_y[:, 0] + + # 计算RMSE + rmse = sqrt(mean_squared_error(inv_y, inv_yhat)) + print('Test RMSE: %.3f' % rmse) + + # 可视化结果 + n = 150 + time_axis_data = np.array(range(n)) + time_axis_future = np.array(range(n + 7)) + inv_y = inv_y[-n:] + inv_yhat = inv_yhat[-n-7:] + + fig, ax = plt.subplots(2, 1, gridspec_kw={'height_ratios': [5, 4]}) + fig.set_size_inches(8, 6) + + ax[0].plot(time_axis_data, inv_y, label='历史价格') + ax[0].plot(time_axis_future, inv_yhat, linestyle='dashed', label='预测价格') + ax[0].set_xlabel('时间') + ax[0].set_ylabel('价格') + ax[0].legend() + 
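# The lower subplot renders the last 7 predicted values as a table; the y-axis
# range of the price panel is set to 0.4x-1.6x of the recent history.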
ax[0].set_title('布伦特_多价格预测') + ax[0].set_ylim(min(inv_y[-n - 7:]) * 0.4, max(inv_y[-n - 7:]) * 1.6) + + ax[1].axis('off') + table_data = [[f"Day {i + 1}", "{:.2f}".format(val)] for i, val in enumerate(inv_yhat[-7:])] + table = ax[1].table(cellText=table_data, colLabels=['Day', 'Prediction'], loc='center') + table.auto_set_font_size(True) + filename = os.path.basename(__file__).split('.')[0] + + plt.savefig(filename + '.png') + plt.show() diff --git a/models/lstmmodels.py b/models/lstmmodels.py new file mode 100644 index 0000000..57f7ee4 --- /dev/null +++ b/models/lstmmodels.py @@ -0,0 +1,255 @@ +import numpy as np +from sklearn.preprocessing import MinMaxScaler +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import train_test_split +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import LSTM, Dense +import datetime +import matplotlib.pyplot as plt +import pandas as pd +import os +import random +import string +import time +import base64 +from hashlib import sha256 +from hmac import HMAC +import requests +import csv +from numpy import concatenate +from math import sqrt + +def series_to_supervised(data, n_in=1, n_out=1, dropnan=True): + ''' + 将时间序列数据转换为监督学习数据 + :param data:数据集 + :param n_in: 输入序列长度,默认为1 + :param n_out:输出序列长度,默认为1 + :param dropnan: + :return: + ''' + n_vars = 1 if type(data) is list else data.shape[1] + df = pd.DataFrame(data) + cols, names = list(), list() + # input sequence (t-n, ... t-1) + # 将3组输入数据依次向下移动3,2,1行,将数据加入cols列表(技巧:(n_in, 0, -1)中的-1指倒序循环,步长为1) + for i in range(n_in, 0, -1): + cols.append(df.shift(i)) + names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)] + # forecast sequence (t, t+1, ... t+n) + # 将一组输出数据加入cols列表(技巧:其中i=0) + for i in range(0, n_out): + cols.append(df.shift(-i)) + if i == 0: + names += [('var%d(t)' % (j + 1)) for j in range(n_vars)] + else: + names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)] + # cols列表(list)中现在有四块经过下移后的数据(即:df(-3),df(-2),df(-1),df),将四块数据按列 并排合并 + agg = pd.concat(cols, axis=1) + # 给合并后的数据添加列名 + agg.columns = names + # 删除NaN值列 + if dropnan: + agg.dropna(inplace=True) + return agg + +def createXY(dataset,n_past): + dataX = [] + dataY = [] + print(dataset.shape[1]) + for i in range(n_past, len(dataset)): + dataX.append(dataset[i - n_past:i, 0:dataset.shape[1]]) + dataY.append(dataset[i,0]) + return np.array(dataX),np.array(dataY) + +def ex_Lstm_M(df,n_days=14,out_days=7,is_debug=False,datasetpath=''): + # dataset = pd.read_csv('brentpricepredict.csv',encoding='utf-8') + dataset = df.copy() + dataset.set_index('ds', inplace=True) + + values = dataset.values + if is_debug: + # values = values[-1000:] + pass + # 标准化/放缩 特征值在(0,1)之间 + scaler = MinMaxScaler(feature_range=(0, 1)) + scaled = scaler.fit_transform(values) + # 用14天数据预测七天数据 + n_features = dataset.shape[1] + # 构造一个14->7的监督学习型数据 + reframed = series_to_supervised(scaled, n_days, out_days) + + # 切分数据集 + values = reframed.values + # 用80%的数据来训练,20%的数据来测试 + n_train = int(len(dataset) * 0.8) + train = values[:n_train, :] + test = values[n_train:, :] + # 切分输入输出 + n_obs = n_days * n_features + # 倒数第19列作为Y + train_X, train_y = train[:, :n_obs], train[:, -n_features] + test_X, test_y = test[:, :n_obs], test[:, -n_features] + # 将数据转换为3D输入,timesteps=14,14条数据预测7条 [samples, timesteps, features] + train_X = train_X.reshape((train_X.shape[0], n_days, n_features)) + test_X = test_X.reshape((test_X.shape[0], n_days, n_features)) + print(train_X.shape, train_y.shape, test_X.shape, test_y.shape) + + # 设计网络 + model = 
Sequential() + model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2]))) + model.add(Dense(1)) + model.compile(loss='mae', optimizer='adam') + # 拟合网络 + history = model.fit(train_X, train_y, epochs=100, batch_size=72, validation_data=(test_X, test_y), verbose=2, + shuffle=False) + + # 执行预测 + yhat = model.predict(test_X) + # 将数据格式化成 n行 * 24列 + test_X = test_X.reshape((test_X.shape[0], n_days * n_features)) + # 将预测列据和后7列数据拼接,因后续逆缩放时,数据形状要符合 n行*20列 的要求 + inv_yhat = concatenate((yhat, test_X[:, -n_features+1:]), axis=1) + # 对拼接好的数据进行逆缩放 + inv_yhat = scaler.inverse_transform(inv_yhat) + inv_yhat = inv_yhat[:, 0] + print(inv_yhat) + + test_y = test_y.reshape((len(test_y), 1)) + # 将真实列据和后7列数据拼接,因后续逆缩放时,数据形状要符合 n行*20列 的要求 + inv_y = concatenate((test_y, test_X[:, -n_features+1:]), axis=1) + # 对拼接好的数据进行逆缩放 + inv_y = scaler.inverse_transform(inv_y) + inv_y = inv_y[:, 0] + + # 计算RMSE + rmse = sqrt(mean_squared_error(inv_y, inv_yhat)) + print('Test RMSE: %.3f' % rmse) + + # 可视化结果 + # 保留n天历史数据 + n = len(inv_y) - 7 + # 设置要可视化的值 + time_axis_data = np.array(range(n)) + time_axis_future = np.array(range(n + 7)) + inv_y = inv_y[-n:] + inv_yhat = inv_yhat[-n-7:] + + # Plot data and future predictions + fig, ax = plt.subplots(2, 1, gridspec_kw={'height_ratios': [5, 4]}) + # 设置画布大小 + fig.set_size_inches(6, 6) + # 第一个子图画历史价格和预测价格 + ax[0].plot(time_axis_data, inv_y, label='历史价格') + ax[0].plot(time_axis_future, inv_yhat, linestyle='dashed', label='预测价格') + ax[0].set_xlabel('时间') + ax[0].set_ylabel('价格') + ax[0].legend() + # 设置标题 + ax[0].set_title('布伦特_多价格预测') + # 设置y轴范围 + ax[0].set_ylim(50, 120) + + # 第二个子图画表格,展示预测价格 + ax[1].axis('off') + table_data = [[f"Day {i + 1}", "{:.2f}".format(val)] for i, val in enumerate(inv_yhat[-7:])] + table = ax[1].table(cellText=table_data, colLabels=['Day', 'Prediction'], loc='center') + # 设置表格内容居中 + table.auto_set_font_size(True) + # 保存图片 + filename = os.path.basename(__file__).split('.')[0] + + plt.savefig(os.path.join(datasetpath,filename + '_M.png')) + # plt.show() + + +def ex_Lstm(df,input_seq_len=50, output_seq_len=7,is_debug=False,dataset=''): + + # 将日期列转换为 datetime 类型(如果尚未转换) + df['ds'] = pd.to_datetime(df['ds']) + # 分离出数值列(排除日期列) + numeric_df = df.select_dtypes(include=['int64', 'float64']) + + prices = df + # prices = df + # print(data1) + # Remove any NaN values + df = df.drop('ds', axis=1) + prices = np.array(df, dtype=float) # convert to NumPy array of floats + prices = prices[~np.isnan(prices)] + if is_debug: + prices = prices[-300:] + + + # Prepare input sequences + inputs = [] + for i in range(len(prices)-input_seq_len-output_seq_len+1): + inputs.append(prices[i:i+input_seq_len]) + inputs = np.array(inputs) + + # Prepare output sequences + outputs = [] + for i in range(input_seq_len, len(prices)-output_seq_len+1): + outputs.append(prices[i:i+output_seq_len]) + outputs = np.array(outputs) + + # Split dataset into training and testing sets + X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.2) + + # Normalize data + scaler_in = MinMaxScaler() + X_train = scaler_in.fit_transform(X_train) + X_test = scaler_in.transform(X_test) + + scaler_out = MinMaxScaler() + y_train = scaler_out.fit_transform(y_train) + y_test = scaler_out.transform(y_test) + + # Define LSTM model + model = Sequential() + model.add(LSTM(128, activation='relu', input_shape=(input_seq_len, 1))) + model.add(Dense(output_seq_len)) + model.compile(optimizer='adam', loss='mse') + + # Train LSTM model + model.fit(X_train.reshape(-1, input_seq_len, 1), y_train, 
epochs=100, batch_size=64, validation_data=(X_test.reshape(-1, input_seq_len, 1), y_test)) + + # Evaluate LSTM model + mse = model.evaluate(X_test.reshape(-1, input_seq_len, 1), y_test) + + # Make future predictions + future_inputs = np.array([prices[-input_seq_len:]]) + future_inputs = scaler_in.transform(future_inputs) + future_predictions = model.predict(future_inputs.reshape(-1, input_seq_len, 1)) + future_predictions = scaler_out.inverse_transform(future_predictions)[0] + + # Print results + print("MSE: ", mse) + print("Future predictions: ", future_predictions) + + # Generate time axis for data and future predictions + time_axis_data = np.arange(len(prices)) + time_axis_future = np.arange(len(prices), len(prices) + len(future_predictions)) + + # Concatenate time axis and data + time_axis = np.concatenate((time_axis_data, time_axis_future)) + + # Concatenate data and future predictions + data_and_predictions = np.concatenate((prices, future_predictions)) + + # Plot data and future predictions + fig, ax = plt.subplots(2, 1, gridspec_kw={'height_ratios': [3, 1]}) + + # First subplot: Data and Future Predictions + ax[0].plot(time_axis, data_and_predictions, label='Data and Future Predictions') + ax[0].plot(time_axis_future, future_predictions, linestyle='dashed', label='Future Predictions') + ax[0].set_xlabel('Time') + ax[0].set_ylabel('Price') + ax[0].legend() + + # Second subplot: Table for Future Predictions + ax[1].axis('off') + table_data = [[f"Day {i+1}", "{:.2f}".format(val)] for i, val in enumerate(future_predictions)] + table = ax[1].table(cellText=table_data, colLabels=['Day', 'Prediction'], loc='center') + plt.savefig(os.path.join(dataset,'lstmmodels.png')) + # plt.show() \ No newline at end of file diff --git a/models/nerulforcastmodels.py b/models/nerulforcastmodels.py new file mode 100644 index 0000000..009e0db --- /dev/null +++ b/models/nerulforcastmodels.py @@ -0,0 +1,1519 @@ +import os +import pandas as pd +import numpy as np +import tensorflow as tf +import seaborn as sns +import matplotlib.pyplot as plt +import matplotlib.dates as mdates +import datetime +from lib.tools import Graphs,mse,rmse,mae +from lib.dataread import * +from neuralforecast import NeuralForecast +from neuralforecast.models import NHITS,Informer, NBEATSx,LSTM,PatchTST, iTransformer, TSMixer +from neuralforecast.models import RNN, GRU, TCN, DeepAR, DilatedRNN, MLP, NBEATS, DLinear, NLinear, TFT, VanillaTransformer +from neuralforecast.models import Autoformer, PatchTST, FEDformer, StemGNN, HINT, TSMixer, TSMixerx, MLPMultivariate, BiTCN, TiDE, DeepNPTS +from tensorflow.keras.losses import MAE +from scipy.stats import spearmanr +from sklearn.preprocessing import MinMaxScaler +from sklearn.feature_selection import SelectKBest, f_classif +from sklearn.preprocessing import StandardScaler +from sklearn.metrics import r2_score +from sklearn import metrics +from lib.duojinchengpredict import testSetPredict +from reportlab.platypus import Table, SimpleDocTemplate, Paragraph, Image # 报告内容相关类 +from reportlab.lib.pagesizes import letter # 页面的标志尺寸(8.5*inch, 11*inch) +from reportlab.pdfbase import pdfmetrics # 注册字体 +from reportlab.pdfbase.ttfonts import TTFont # 字体类 +from reportlab.platypus import Table, SimpleDocTemplate, Paragraph, Image # 报告内容相关类 +from reportlab.lib.pagesizes import letter # 页面的标志尺寸(8.5*inch, 11*inch) +from reportlab.lib.styles import getSampleStyleSheet # 文本样式 +from reportlab.lib import colors # 颜色模块 +from reportlab.graphics.charts.barcharts import VerticalBarChart # 图表类 +from 
reportlab.graphics.charts.legends import Legend # 图例类 +from reportlab.graphics.shapes import Drawing # 绘图工具 +from reportlab.lib.units import cm # 单位:cm +# # 注册字体(提前准备好字体文件, 如果同一个文件需要多种字体可以注册多个) +pdfmetrics.registerFont(TTFont('SimSun', 'SimSun.ttf')) + + + +def ex_Model(df,horizon,input_size,train_steps,val_check_steps,early_stop_patience_steps, + is_debug,dataset,is_train,is_fivemodels,val_size,test_size,settings,now, + etadata,modelsindex,data,is_eta): + ''' + 模型训练与预测 + :param df: 数据集 + horizon # 预测的步长 + input_size # 输入序列长度 + train_steps # 训练步数,用来限定epoch次数 + val_check_steps # 评估频率 + early_stop_patience_steps # 早停的耐心步数 + :return: 预测结果 + ''' + + # 模型预测列表列名 + # columns2 = [ + # 'NHITS', + # 'Informer', + # 'LSTM', + # 'iTransformer', + # 'TSMixer', + # 'TSMixerx', + # 'PatchTST', + # 'RNN', + # 'GRU', + # 'TCN', + # # 'DeepAR', + # 'DeepAR-median', + # 'DeepAR-lo-90', + # 'DeepAR-lo-80', + # 'DeepAR-hi-80', + # 'DeepAR-hi-90', + # 'BiTCN', + # 'DilatedRNN', + # 'MLP', + # 'DLinear', + # 'NLinear', + # 'TFT', + # 'FEDformer', + # 'StemGNN', + # 'MLPMultivariate', + # 'TiDE', + # 'DeepNPT', + # ] + + df= df.replace(',', '', regex=True) + df = df.rename(columns={'date': 'ds'}) + df['y'] = pd.to_numeric(df['y'], errors='coerce') + df['ds'] = pd.to_datetime(df['ds'], errors='coerce') # 使用errors='coerce'来处理无效日期 + # df 数值列转为 float32 + for col in df.select_dtypes(include=['int']).columns: + df[col] = df[col].astype(np.float32) + + # 设置中文字体 + plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 + plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 + + + + # 不筛选特征用下面的 + df_reg = df + df_reg.sort_values('ds', inplace=True) + if is_debug: + df_reg = df_reg[-1000:-1] + + # 计算训练集的结束索引,占总数据的90% + split_index = int(0.8* len(df_reg)) + + # 按照时间顺序划分训练集和测试集 + df_train = df_reg[:split_index] + df_test = df_reg[-split_index:] + df_train['unique_id'] = 1 + df_test['unique_id'] = 1 + + # 显示划分后的数据集的前几行 + logger.info("Training set head:") + logger.info(df_train.head()) + + logger.info("\nTesting set head:") + logger.info(df_test.head()) + + + models = [ + NHITS (h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', activation='ReLU', early_stop_patience_steps=early_stop_patience_steps), + Informer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps ), + LSTM(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + iTransformer(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + TSMixer(h=horizon, input_size=input_size, n_series = 1, max_steps=train_steps, early_stop_patience_steps=early_stop_patience_steps), + TSMixerx(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, early_stop_patience_steps=early_stop_patience_steps), + PatchTST(h=horizon, input_size=input_size, max_steps=train_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + RNN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + GRU(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + 
TCN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + # DeepAR(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + BiTCN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + DilatedRNN(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + MLP(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + DLinear(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + NLinear(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + TFT(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + FEDformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + StemGNN(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + MLPMultivariate(h=horizon, input_size=input_size,n_series = 1, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + TiDE(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + DeepNPTS(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', early_stop_patience_steps=early_stop_patience_steps), + + # VanillaTransformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', ), //报错了 + # Autoformer(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', ), //报错了 + # NBEATS(h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard', ), + # NBEATSx (h=horizon, input_size=input_size, max_steps=train_steps, val_check_steps=val_check_steps, scaler_type='standard',activation='ReLU', ), //报错 + # HINT(h=horizon), + + ] + + if is_fivemodels: + # 获取之前存好的最好的五个模型 + with open(os.path.join(dataset,'best_modelnames.txt'), 'r',encoding='utf-8') as f: + best_modelnames = f.readlines()[0] + logger.info(f'获取本地最佳模型名称:{best_modelnames}') + + # 重新拼接models + all_models = models + models = [] + for model in all_models: + if model._get_name() in best_modelnames: + models.append(model) + + # 创建NeuralForecast实例并训练模型 + nf = NeuralForecast(models=models, freq="B") + + from joblib import dump, load + if is_train: + # 模型交叉验证 + nf_preds = nf.cross_validation(df=df_train, val_size=val_size, test_size=test_size, n_windows=None) + 
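# cross_validation rolls a cutoff window over df_train and returns, per window,
# the true y together with one prediction column per model; model_losss later
# reads this CSV to compute the evaluation metrics.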
nf_preds.to_csv(os.path.join(dataset,"cross_validation.csv"),index=False) + + nf_preds = nf_preds.reset_index() + # 保存模型 + # 生成文件名,按时间 精确到分 + filename = f'{settings}--{now}.joblib' + #文件名去掉冒号 + filename = filename.replace(':', '-') # 替换冒号 + # dump(nf, os.path.join(dataset,filename)) + else: + # glob获取dataset下最新的joblib文件 + import glob + filename = max(glob.glob(os.path.join(dataset,'*.joblib')), key=os.path.getctime) + # logger.info('读取模型:'+ filename) + nf = load(filename) + # # 测试集预测 + nf_test_preds = nf.cross_validation(df=df_test, val_size=val_size, test_size=test_size, n_windows=None) + # 测试集预测结果保存 + nf_test_preds.to_csv(os.path.join(dataset,"cross_validation.csv"),index=False) + + df_test['ds'] = pd.to_datetime(df_test['ds'], errors='coerce') + + #进行未来时间预测 + df_predict=nf.predict(df_test).reset_index() + df_predict.astype({col: 'float32' for col in df_predict.columns if col not in ['ds'] }) + + # 保存预测值 + df_predict.to_csv(os.path.join(dataset,"predict.csv"),index=False) + + # 把预测值上传到eta + if is_update_eta: + dates = df_predict['ds'].dt.strftime('%Y-%m-%d') + + for m in modelsindex.keys(): + list = [] + for date,value in zip(dates,df_predict[m].round(2)): + list.append({'Date':date,'Value':value}) + data['DataList'] = list + data['IndexCode'] = modelsindex[m] + data['IndexName'] = f'价格预测{m}模型' + data['Remark'] = m + etadata.push_data(data) + + + return nf_test_preds + + +# 计算预测评估指数 +def model_losss(sqlitedb): + global dataset + # 预测数据处理 predict + df_combined = loadcsv(os.path.join(dataset,"cross_validation.csv")) + df_combined = dateConvert(df_combined) + # 删除空列 + df_combined.dropna(axis=1,inplace=True) + # 删除缺失值,预测过程不能有缺失值 + df_combined.dropna(inplace=True) + # 其他列转为数值类型 + df_combined = df_combined.astype({col: 'float32' for col in df_combined.columns if col not in ['cutoff','ds'] }) + # 使用 groupby 和 transform 结合 lambda 函数来获取每个分组中 cutoff 的最小值,并创建一个新的列来存储这个最大值 + df_combined['max_cutoff'] = df_combined.groupby('ds')['cutoff'].transform('max') + + # 然后筛选出那些 cutoff 等于 max_cutoff 的行,这样就得到了每个分组中 cutoff 最大的行,并保留了其他列 + df_combined = df_combined[df_combined['cutoff'] == df_combined['max_cutoff']] + # 删除模型生成的cutoff列 + df_combined.drop(columns=['cutoff', 'max_cutoff'], inplace=True) + # 获取模型名称 + modelnames = df_combined.columns.to_list()[2:] + if 'y' in modelnames: + modelnames.remove('y') + df_combined3 = df_combined.copy() # 备份df_combined,后面画图需要 + + # 计算波动率 + df_combined3['volatility'] = df_combined3['y'].pct_change().round(4) + # 计算近60日的波动率 10% 90%分位数 + df_combined3['quantile_10'] = df_combined3['volatility'].rolling(60).quantile(0.1) + df_combined3['quantile_90'] = df_combined3['volatility'].rolling(60).quantile(0.9) + df_combined3 = df_combined3.round(4) + # 计算分位数对应的价格,并移动到第二天 + df_combined3['quantile_10_price'] = df_combined3['y'] * (1 + df_combined3['quantile_10']) + # df_combined3['quantile_10_price'] = df_combined3['quantile_10_price'].shift(1) + df_combined3['quantile_90_price'] = df_combined3['y'] * (1 + df_combined3['quantile_90']) + # df_combined3['quantile_90_price'] = df_combined3['quantile_90_price'].shift(1) + + # 遍历行 + def find_min_max_within_quantile(row): + # 获取分位数10%和90%的值 + q10 = row['quantile_10_price'] + q90 = row['quantile_90_price'] + + # 判断flot值是否为空值 + if pd.isna(q10) or pd.isna(q90): + return pd.Series([None, None, None, None], index=['min_within_quantile','max_within_quantile','min_model','max_model']) + + # 初始化最小和最大值为None + min_value = None + max_value = None + min_value_model = '' + max_value_model = '' + + + # 遍历指定列,找出在分位数范围内的最大最小值 + for model in modelnames: + 
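# For each model prediction on this row, track the smallest and largest value
# falling inside the [quantile_10_price, quantile_90_price] band, together with
# the model names that produced them.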
value = row[model] + if value >= q10 and value <= q90: + if min_value is None or value < min_value: + min_value = value + min_value_model = model + + if max_value is None or value > max_value: + max_value = value + max_value_model = model + + # 返回最大最小值 + return pd.Series([min_value, max_value,min_value_model,max_value_model], index=['min_within_quantile', 'max_within_quantile','min_model','max_model']) + + # 应用函数到每一行 + df_combined3[['min_within_quantile', 'max_within_quantile','min_model','max_model']] = df_combined3.apply(find_min_max_within_quantile, axis=1) + + # 去除有空值的行 + df_combined3.dropna(inplace=True) + # 保存到数据库 + df_combined3.to_sql('testandpredict_groupby', sqlitedb.connection, if_exists='replace', index=False) + df_combined3.to_csv(os.path.join(dataset,"testandpredict_groupby.csv"),index=False) + + + # 空的列表存储每个模型的MSE、RMSE、MAE、MAPE、SMAPE + cellText = [] + + # 遍历模型名称,计算模型评估指标 + for model in modelnames: + modelmse = mse(df_combined['y'], df_combined[model]) + modelrmse = rmse(df_combined['y'], df_combined[model]) + modelmae = mae(df_combined['y'], df_combined[model]) + # modelmape = mape(df_combined['y'], df_combined[model]) + # modelsmape = smape(df_combined['y'], df_combined[model]) + # modelr2 = r2_score(df_combined['y'], df_combined[model]) + cellText.append([model,round(modelmse, 3), round(modelrmse, 3), round(modelmae, 3)]) + + model_results3 = pd.DataFrame(cellText,columns=['模型(Model)','平均平方误差(MSE)', '均方根误差(RMSE)', '平均绝对误差(MAE)']) + # 按MSE降序排列 + model_results3 = model_results3.sort_values(by='平均平方误差(MSE)', ascending=True) + model_results3.to_csv(os.path.join(dataset,"model_evaluation.csv"),index=False) + modelnames = model_results3['模型(Model)'].tolist() + allmodelnames = modelnames.copy() + # 保存5个最佳模型的名称 + if len(modelnames) > 5: + modelnames = modelnames[0:5] + with open(os.path.join(dataset,"best_modelnames.txt"), 'w') as f: + f.write(','.join(modelnames) + '\n') + + # 预测值与真实值对比图 + plt.rcParams['font.sans-serif'] = ['SimHei'] + plt.figure(figsize=(15, 10)) + # 设置有5个子图的画布 + for n,model in enumerate(modelnames): + plt.subplot(3, 2, n+1) + plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值') + plt.plot(df_combined3['ds'], df_combined3[model], label=model) + plt.legend() + plt.xlabel('日期') + plt.ylabel('价格') + plt.title(model+'拟合') + plt.subplots_adjust(hspace=0.5) + plt.savefig(os.path.join(dataset,'预测值与真实值对比图.png'), bbox_inches='tight') + plt.close() + + # 历史数据+预测数据 + # 拼接未来时间预测 + df_predict = loadcsv(os.path.join(dataset,'predict.csv')) + df_predict.drop('unique_id',inplace=True,axis=1) + df_predict.dropna(axis=1,inplace=True) + + try: + df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y-%m-%d') + except ValueError : + df_predict['ds'] = pd.to_datetime(df_predict['ds'],format=r'%Y/%m/%d') + + # 取第一行数据存储到数据库中 + first_row = df_predict.head(1) + first_row['ds'] = first_row['ds'].dt.strftime('%Y-%m-%d 00:00:00') + # 将预测结果保存到数据库 + if not sqlitedb.check_table_exists('trueandpredict'): + first_row.to_sql('trueandpredict',sqlitedb.connection,index=False) + else: + for row in first_row.itertuples(index=False): + row_dict = row._asdict() + columns=row_dict.keys() + for col in columns: + sqlitedb.add_column_if_not_exists('trueandpredict',col,'TEXT') + check_query = sqlitedb.select_data('trueandpredict',where_condition = f"ds = '{row.ds}'") + if len(check_query) > 0: + set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()]) + sqlitedb.update_data('trueandpredict',set_clause,where_condition = f"ds = '{row.ds}'") + continue + 
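# ds not present yet: insert a new row (simple upsert keyed on ds).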
sqlitedb.insert_data('trueandpredict',tuple(row_dict.values()),columns=columns) + + + + # 最多频率的模型名称 + min_model_max_frequency_model = df_combined3['min_model'].value_counts().idxmax() + max_model_max_frequency_model = df_combined3['max_model'].value_counts().idxmax() + df_predict['min_model'] = min_model_max_frequency_model + df_predict['max_model'] = max_model_max_frequency_model + df_predict['min_within_quantile'] = df_predict[min_model_max_frequency_model] + df_predict['max_within_quantile'] = df_predict[max_model_max_frequency_model] + + df_predict2 = df_predict.copy() + df_predict2['ds'] = df_predict2['ds'].dt.strftime('%Y-%m-%d 00:00:00') + + + # 将预测结果保存到数据库 + # 判断表存在 + if not sqlitedb.check_table_exists('testandpredict_groupby'): + df_predict2.to_sql('testandpredict_groupby',sqlitedb.connection,index=False) + else: + for row in df_predict2.itertuples(index=False): + row_dict = row._asdict() + check_query = sqlitedb.select_data('testandpredict_groupby',where_condition = f"ds = '{row.ds}'") + if len(check_query) > 0: + set_clause = ", ".join([f"{key} = '{value}'" for key, value in row_dict.items()]) + sqlitedb.update_data('testandpredict_groupby',set_clause,where_condition = f"ds = '{row.ds}'") + continue + sqlitedb.insert_data('testandpredict_groupby',tuple(row_dict.values()),columns=row_dict.keys()) + + + + # 计算每个预测值与真实值之间的偏差率 + for model in allmodelnames: + df_combined3[f'{model}_abs_error_rate'] = abs(df_combined3['y'] - df_combined3[model]) / df_combined3['y'] + + # 获取每行对应的最小偏差率值 + min_abs_error_rate_values = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].min(), axis=1) + # 获取每行对应的最小偏差率值对应的列名 + min_abs_error_rate_column_name = df_combined3.apply(lambda row: row[[f'{model}_abs_error_rate' for model in allmodelnames]].idxmin(), axis=1) + # 将列名索引转换为列名 + min_abs_error_rate_column_name = min_abs_error_rate_column_name.map(lambda x: x.split('_')[0]) + # 获取最小偏差率对应的模型的预测值 + min_abs_error_rate_predictions = df_combined3.apply(lambda row: row[min_abs_error_rate_column_name[row.name]], axis=1) + # 将最小偏差率对应的模型的预测值添加到DataFrame中 + df_combined3['min_abs_error_rate_prediction'] = min_abs_error_rate_predictions + df_combined3['min_abs_error_rate_column_name'] = min_abs_error_rate_column_name + df_combined3 = pd.concat([df_combined3, df_predict]).reset_index(drop=True) + # 判断 df 的数值列转为float + for col in df_combined3.columns: + try: + if col != 'ds': + df_combined3[col] = df_combined3[col].astype(float) + df_combined3[col] = df_combined3[col].round(2) + except ValueError: + pass + df_combined3.to_csv(os.path.join(dataset,"df_combined3.csv"),index=False) + + # 历史价格+预测价格 + df_combined3 = df_combined3[-50:] # 取50个数据点画图 + # 历史价格 + plt.figure(figsize=(20, 10)) + plt.plot(df_combined3['ds'], df_combined3['y'], label='真实值') + # 颜色填充 + plt.fill_between(df_combined3['ds'], df_combined3['min_within_quantile'], df_combined3['max_within_quantile'], alpha=0.2) + # plt.plot(df_combined3['ds'], df_combined3['min_abs_error_rate_prediction'], label='最小绝对误差', linestyle='--', color='orange') + # 网格 + plt.grid(True) + # 显示历史值 + for i, j in zip(df_combined3['ds'], df_combined3['y']): + plt.text(i, j, str(j), ha='center', va='bottom') + + # 数据库查询最佳模型名称 + most_model = [sqlitedb.select_data('most_model',columns=['most_common_model'],order_by='ds desc',limit=1).values[0][0]] + + for model in most_model: + plt.plot(df_combined3['ds'], df_combined3[model], label=model,marker='o') + # 当前日期画竖虚线 + plt.axvline(x=df_combined3['ds'].iloc[-horizon], color='r', linestyle='--') + 
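# The dashed vertical line above marks the boundary between history and the
# forecast horizon (the last `horizon` points are future predictions).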
plt.legend() + plt.xlabel('日期') + plt.ylabel('价格') + + plt.savefig(os.path.join(dataset,'历史价格-预测值.png'), bbox_inches='tight') + plt.close() + + # 预测值表格 + fig, ax = plt.subplots(figsize=(20, 6)) + ax.axis('off') # 关闭坐标轴 + # 数值保留2位小数 + df_combined3 = df_combined3.round(2) + df_combined3 = df_combined3[-horizon:] + df_combined3['Day'] = [f'Day_{i}' for i in range(1,horizon+1)] + # Day列放到最前面 + df_combined3 = df_combined3[['Day'] + list(df_combined3.columns[:-1])] + table = ax.table(cellText=df_combined3.values, colLabels=df_combined3.columns, loc='center') + #加宽表格 + table.auto_set_font_size(False) + table.set_fontsize(10) + + # 设置表格样式,列数据最小的用绿色标识 + plt.savefig(os.path.join(dataset,'预测值表格.png'), bbox_inches='tight') + plt.close() + # plt.show() + + # 可视化评估结果 + plt.rcParams['font.sans-serif'] = ['SimHei'] + fig, ax = plt.subplots(figsize=(20, 10)) + ax.axis('off') # 关闭坐标轴 + table = ax.table(cellText=model_results3.values, colLabels=model_results3.columns, loc='center') + # 加宽表格 + table.auto_set_font_size(False) + table.set_fontsize(10) + + # 设置表格样式,列数据最小的用绿色标识 + plt.savefig(os.path.join(dataset,'模型评估.png'), bbox_inches='tight') + plt.close() + return model_results3 + +def brent_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inputsize=5,dataset='dataset',time = '2024-07-30',reportname='report.pdf',sqlitedb='jbsh_yuanyou.db'): + global y + # 创建内容对应的空列表 + content = list() + + ### 添加标题 + content.append(Graphs.draw_title(f'{y}{time}预测报告')) + + ### 预测结果 + content.append(Graphs.draw_little_title('一、预测结果:')) + # 添加图片 + # 找出后缀是历史价格-预测值.png的图片 + # import glob + # imgs = glob.glob(os.path.join(dataset,'*历史价格-预测值.png')) + # for img in imgs: + # content.append(Graphs.draw_img(img)) + content.append(Graphs.draw_img(os.path.join(dataset,'历史价格-预测值.png'))) + + # 取df中y列为空的行 + import pandas as pd + df = pd.read_csv(os.path.join(dataset,'predict.csv'),encoding='gbk') + df_true = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') # 获取预测日期对应的真实值 + df_true = df_true[['ds','y']] + eval_df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8') + # 按评估指标排序,取前五 + fivemodels_list = eval_df['模型(Model)'].values # 列表形式,后面当作列名索引使用 + # 取 fivemodels_list 和 ds 列 + df = df[['ds'] + fivemodels_list.tolist() ] + # 拼接预测日期对应的真实值 + df = pd.merge(df, df_true, on='ds', how='left') + # 删除全部为nan的列 + df = df.dropna(how='all', axis=1) + # 选择除 'ds' 列外的数值列,并进行类型转换和四舍五入 + num_cols = [col for col in df.columns if col!= 'ds' and pd.api.types.is_numeric_dtype(df[col])] + for col in num_cols: + df[col] = df[col].astype(float).round(2) + # 添加最大值、最小值、平均值三列 + df['平均值'] = df[num_cols].mean(axis=1).round(2) + df['最大值'] = df[num_cols].max(axis=1) + df['最小值'] = df[num_cols].min(axis=1) + # df转置 + df = df.T + # df重置索引 + df = df.reset_index() + # 添加预测值表格 + data = df.values.tolist() + col_width = 500/len(df.columns) + content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:')) + df = pd.read_csv(os.path.join(dataset,'testandpredict_groupby.csv'),encoding='utf-8') + df4 = df.copy() # 计算偏差率使用 + # 计算模型偏差率 + #计算各列对于y列的差值百分比 + df3 = pd.DataFrame() # 存储偏差率 + + # 删除有null的行 + df4 = df4.dropna() + df3['ds'] = df4['ds'] + for col in fivemodels_list: + df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100,2) + # 找出决定系数前五的偏差率 + df3 = df3[['ds']+fivemodels_list.tolist()][-inputsize:] + # 找出上一预测区间的时间 + stime = df3['ds'].iloc[0] + etime = df3['ds'].iloc[-1] + # 添加偏差率表格 + fivemodels = '、'.join(eval_df['模型(Model)'].values[:5]) # 字符串形式,后面写入字符串使用 + 
content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime}的偏差率(%)分别是:')) + # # 添加偏差率表格 + df3 = df3.T + df3 = df3.reset_index() + data = df3.values.tolist() + col_width = 500/len(df3.columns) + content.append(Graphs.draw_table(col_width,*data)) + + + content.append(Graphs.draw_little_title('三、预测过程解析:')) + ### 特征、模型、参数配置 + content.append(Graphs.draw_little_title('模型选择:')) + content.append(Graphs.draw_text(f'本次预测使用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型:')) + content.append(Graphs.draw_text(f'使用40天的数据预测未来{inputsize}天的数据。')) + content.append(Graphs.draw_little_title('指标情况:')) + with open(os.path.join(dataset,'特征频度统计.txt'),encoding='utf-8') as f: + for line in f.readlines(): + content.append(Graphs.draw_text(line)) + + data = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') # 计算相关系数用 + df_zhibiaofenlei = loadcsv(os.path.join(dataset,'特征处理后的指标名称及分类.csv')) # 气泡图用 + df_zhibiaoshuju = data.copy() # 气泡图用 + + # 绘制特征相关气泡图 + content.append(Graphs.draw_text('气泡图中,横轴为指标名称,纵轴为指标名称,面积越大表示相关性越大,面积越小表示相关性越小。')) + grouped = df_zhibiaofenlei.groupby('指标分类') + for name, group in grouped: + cols = group['指标名称'].tolist() + for n in range(0, len(cols), 10): + logger.info(f'开始绘制{name}类指标{n}的气泡图') + cols_subset = cols[n:n+10] + feature_names = ['y'] + cols_subset + correlation_matrix = df_zhibiaoshuju[feature_names].corr() + plt.figure(figsize=(10, 10)) + for i in range(len(feature_names)): + for j in range(len(feature_names)): + plt.scatter(i, j, s=abs(correlation_matrix.iloc[i, j]) * 1000, c=correlation_matrix.iloc[i, j], cmap='coolwarm', marker='o') + for i in range(len(feature_names)): + for j in range(len(feature_names)): + plt.text(i, j, f'{correlation_matrix.iloc[i, j]:.2f}', ha='center', va='center', color='black') + plt.xticks(range(len(feature_names)), feature_names, rotation=90) + plt.yticks(range(len(feature_names)), feature_names) + plt.title(f'{name}类指标{n}') + plt.xlabel('指标名称') + plt.ylabel('指标名称') + plt.savefig(os.path.join(dataset, f'{name}{n}气泡图.png'), bbox_inches='tight') + plt.close() + content.append(Graphs.draw_img(os.path.join(dataset,f'{name}{n}气泡图.png'))) + logger.info(f'绘制指标相关性气泡图结束') + + # 计算特征相关性 + data.rename(columns={y: 'y'}, inplace=True) + data['ds'] = pd.to_datetime(data['ds']) + data.drop(columns=['ds'], inplace=True) + # 创建一个空的 DataFrame 来保存相关系数 + correlation_df = pd.DataFrame(columns=['Feature', 'Correlation']) + # 计算各特征与目标列的皮尔逊相关系数,并保存到新的 DataFrame 中 + for col in data.columns: + if col!= 'y': + pearson_correlation = np.corrcoef(data[col], data['y'])[0, 1] + spearman_correlation, _ = spearmanr(data[col], data['y']) + new_row = {'Feature': col, 'Pearson_Correlation': round(pearson_correlation,3), 'Spearman_Correlation': round(spearman_correlation,2)} + correlation_df = correlation_df._append(new_row, ignore_index=True) + + correlation_df.drop('Correlation', axis=1, inplace=True) + correlation_df.dropna(inplace=True) + correlation_df.to_csv(os.path.join(dataset,'指标相关性分析.csv'), index=False) + + data = correlation_df['Pearson_Correlation'].values.tolist() + # 生成 -1 到 1 的 20 个区间 + bins = np.linspace(-1, 1, 21) + # 计算每个区间的统计数(这里是区间内数据的数量) + hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)] + + #设置画布大小 + plt.figure(figsize=(10, 6)) + # 绘制直方图 + plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0])) + + # 添加标题和坐标轴标签 + plt.title('皮尔逊相关系数分布图') + plt.xlabel('区间') + plt.ylabel('统计数') + plt.savefig(os.path.join(dataset, '皮尔逊相关性系数.png')) + plt.close() + + + 
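Note: the loop above computes each feature's Pearson and Spearman correlation with y one column at a time and grows correlation_df through the private DataFrame._append. The same table can be produced with two vectorized calls; a minimal sketch (correlation_table is a new name, not in the original code), assuming data holds only numeric feature columns plus y:

import pandas as pd

def correlation_table(data: pd.DataFrame) -> pd.DataFrame:
    # Pearson and Spearman correlation of every feature column against 'y', computed column-wise.
    feats = data.drop(columns=['y'])
    return pd.DataFrame({
        'Feature': feats.columns,
        'Pearson_Correlation': feats.corrwith(data['y'], method='pearson').round(3).values,
        'Spearman_Correlation': feats.corrwith(data['y'], method='spearman').round(2).values,
    }).dropna()

The histogram counts built below could likewise come from a single call, e.g. hist_values, _ = np.histogram(data, bins=np.linspace(-1, 1, 21)).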
#设置画布大小 + plt.figure(figsize=(10, 6)) + data = correlation_df['Spearman_Correlation'].values.tolist() + # 计算每个区间的统计数(这里是区间内数据的数量) + hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)] + + # 绘制直方图 + plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0])) + + # 添加标题和坐标轴标签 + plt.title('斯皮尔曼相关系数分布图') + plt.xlabel('区间') + plt.ylabel('统计数') + plt.savefig(os.path.join(dataset, '斯皮尔曼相关性系数.png')) + plt.close() + content.append(Graphs.draw_text(f'指标相关性分析--皮尔逊相关系数:')) + # 皮尔逊正相关 不相关 负相关 的表格 + content.append(Graphs.draw_img(os.path.join(dataset,'皮尔逊相关性系数.png'))) + content.append(Graphs.draw_text('''皮尔逊相关系数说明:''')) + content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。''')) + content.append(Graphs.draw_text(''' + 相关系数为1:表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。''')) + content.append(Graphs.draw_text('''当前特征中正相关前十的有:''')) + top10_columns = correlation_df.sort_values(by='Pearson_Correlation',ascending=False).head(10)['Feature'].to_list() + top10 = ','.join(top10_columns) + content.append(Graphs.draw_text(f'''{top10}''')) + # 获取特征的近一月值 + feature_data_df = pd.read_csv(f'dataset/指标数据添加时间特征.csv', parse_dates=['ds']).tail(20) + feature_df = feature_data_df[['ds','y']+top10_columns] + # 遍历X每一列,和yy画散点图 , + for i, col in enumerate(feature_df.columns): + print(f'正在绘制第{i+1}个特征{col}与价格散点图...') + if col not in ['ds', 'y']: + fig, ax1 = plt.subplots(figsize=(10, 6)) + # 在第一个坐标轴上绘制数据 + ax1.plot(feature_df['ds'], feature_df['y'], 'b-') + ax1.set_xlabel('日期') + ax1.set_ylabel('y', color='b') + ax1.tick_params('y', colors='b') + # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(1,len(feature_df),2): + value = feature_df['y'].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10) + # 创建第二个坐标轴 + ax2 = ax1.twinx() + # 在第二个坐标轴上绘制数据 + line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-') + ax2.set_ylabel(col, color='r') + ax2.tick_params('y', colors='r') + # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(0,len(feature_df),2): + value = feature_df[col].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10) + # 添加标题 + plt.title(col) + # 设置横坐标为日期格式并自动调整 + locator = mdates.AutoDateLocator() + formatter = mdates.AutoDateFormatter(locator) + ax1.xaxis.set_major_locator(locator) + ax1.xaxis.set_major_formatter(formatter) + # 文件名特殊字符处理 + col = col.replace('*', '-') + col = col.replace(':', '-') + plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png')) + content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png'))) + plt.close() + + content.append(Graphs.draw_text('''相关系数为-1:表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的''')) + content.append(Graphs.draw_text('''当前特征中负相关前十的有:''')) + tail10_columns = correlation_df.sort_values(by='Pearson_Correlation',ascending=True).head(10)['Feature'].to_list() + top10 = ','.join(tail10_columns) + content.append(Graphs.draw_text(f'''{top10}''')) + # 获取特征的近一周值 + feature_df = feature_data_df[['ds','y']+tail10_columns] + # 遍历X每一列,和yy画散点图 , + for i, col in enumerate(feature_df.columns): + print(f'正在绘制第{i+1}个特征{col}与价格散点图...') + if col not in ['ds', 'y']: + fig, ax1 = plt.subplots(figsize=(10, 6)) + # 在第一个坐标轴上绘制数据 + ax1.plot(feature_df['ds'], feature_df['y'], 'b-') + ax1.set_xlabel('日期') + ax1.set_ylabel('y', color='b') + ax1.tick_params('y', colors='b') + # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in 
range(len(feature_df)): + if j%2 == 1: + value = feature_df['y'].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10) + # 创建第二个坐标轴 + ax2 = ax1.twinx() + # 在第二个坐标轴上绘制数据 + line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-') + ax2.set_ylabel(col, color='r') + ax2.tick_params('y', colors='r') + # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(1,len(feature_df),2): + value = feature_df[col].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10) + # 添加标题 + plt.title(col) + # 设置横坐标为日期格式并自动调整 + locator = mdates.AutoDateLocator() + formatter = mdates.AutoDateFormatter(locator) + ax1.xaxis.set_major_locator(locator) + ax1.xaxis.set_major_formatter(formatter) + # 文件名特殊字符处理 + col = col.replace('*', '-') + col = col.replace(':', '-') + col = col.replace('/', '-') + plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png')) + content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png'))) + plt.close() + content.append(Graphs.draw_text('''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。''')) + content.append(Graphs.draw_text(f'指标相关性分析--斯皮尔曼相关系数:')) + # 皮尔逊正相关 不相关 负相关 的表格 + content.append(Graphs.draw_img(os.path.join(dataset,'斯皮尔曼相关性系数.png'))) + content.append(Graphs.draw_text('斯皮尔曼相关系数(Spearmans rank correlation coefficient)是一种用于衡量两个变量之间的单调关系(不一定是线性关系)的统计指标。')) + content.append(Graphs.draw_text('它的计算基于变量的秩次(即变量值的排序位置)而非变量的原始值。')) + content.append(Graphs.draw_text('斯皮尔曼相关系数的取值范围在 -1 到 1 之间。')) + content.append(Graphs.draw_text('当系数为 1 时,表示两个变量之间存在完全正的单调关系;')) + content.append(Graphs.draw_text('''当前特征中正单调关系前十的有:''')) + top10_columns = correlation_df.sort_values(by='Spearman_Correlation',ascending=False).head(10)['Feature'].to_list() + top10 = ','.join(top10_columns) + content.append(Graphs.draw_text(f'''{top10}''')) + + feature_df = feature_data_df[['ds','y']+top10_columns] + # 遍历X每一列,和yy画散点图 , + for i, col in enumerate(feature_df.columns): + print(f'正在绘制第{i+1}个特征{col}与价格散点图...') + if col not in ['ds', 'y']: + fig, ax1 = plt.subplots(figsize=(10, 6)) + # 在第一个坐标轴上绘制数据 + ax1.plot(feature_df['ds'], feature_df['y'], 'b-') + ax1.set_xlabel('日期') + ax1.set_ylabel('y', color='b') + ax1.tick_params('y', colors='b') + # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(1,len(feature_df),2): + value = feature_df['y'].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10) + # 创建第二个坐标轴 + ax2 = ax1.twinx() + # 在第二个坐标轴上绘制数据 + line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-') + ax2.set_ylabel(col, color='r') + ax2.tick_params('y', colors='r') + # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(0,len(feature_df),2): + value = feature_df[col].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10) + # 添加标题 + plt.title(col) + # 设置横坐标为日期格式并自动调整 + locator = mdates.AutoDateLocator() + formatter = mdates.AutoDateFormatter(locator) + ax1.xaxis.set_major_locator(locator) + ax1.xaxis.set_major_formatter(formatter) + # 文件名特殊字符处理 + col = col.replace('*', '-') + col = col.replace(':', '-') + plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png')) + content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png'))) + plt.close() + + content.append(Graphs.draw_text('当系数为 -1 
时,表示存在完全负的单调关系;')) + content.append(Graphs.draw_text('''当前特征中负单调关系前十的有:''')) + tail10_columns = correlation_df.sort_values(by='Spearman_Correlation',ascending=True).head(10)['Feature'].to_list() + top10 = ','.join(tail10_columns) + content.append(Graphs.draw_text(f'''{top10}''')) + # 获取特征的近一周值 + feature_df = feature_data_df[['ds','y']+tail10_columns] + # 遍历X每一列,和yy画散点图 , + for i, col in enumerate(feature_df.columns): + print(f'正在绘制第{i+1}个特征{col}与价格散点图...') + if col not in ['ds', 'y']: + fig, ax1 = plt.subplots(figsize=(10, 6)) + # 在第一个坐标轴上绘制数据 + ax1.plot(feature_df['ds'], feature_df['y'], 'b-') + ax1.set_xlabel('日期') + ax1.set_ylabel('y', color='b') + ax1.tick_params('y', colors='b') + # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(len(feature_df)): + if j%2 == 1: + value = feature_df['y'].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10) + # 创建第二个坐标轴 + ax2 = ax1.twinx() + # 在第二个坐标轴上绘制数据 + line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-') + ax2.set_ylabel(col, color='r') + ax2.tick_params('y', colors='r') + # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(1,len(feature_df),2): + value = feature_df[col].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10) + # 添加标题 + plt.title(col) + # 设置横坐标为日期格式并自动调整 + locator = mdates.AutoDateLocator() + formatter = mdates.AutoDateFormatter(locator) + ax1.xaxis.set_major_locator(locator) + ax1.xaxis.set_major_formatter(formatter) + # 文件名特殊字符处理 + col = col.replace('*', '-') + col = col.replace(':', '-') + plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png')) + content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png'))) + plt.close() + content.append(Graphs.draw_text('当系数为 0 时,表示两个变量之间不存在单调关系。')) + content.append(Graphs.draw_text('与皮尔逊相关系数相比,斯皮尔曼相关系数对于数据中的异常值不敏感,更适用于处理非线性关系或存在极端值的数据。')) + content.append(Graphs.draw_little_title('模型选择:')) + content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练拟合,通过评估指标MAE从小到大排列,前5个模型的简介如下:')) + + ### 读取模型简介 + with open(os.path.join(dataset,'model_introduction.txt'), 'r', encoding='utf-8') as f: + for line in f: + line_split = line.strip().split('--') + if line_split[0] in fivemodels_list: + for introduction in line_split: + content.append(Graphs.draw_text(introduction)) + + content.append(Graphs.draw_little_title('模型评估:')) + + df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8') + # 判断 df 的数值列转为float + for col in eval_df.columns: + if col not in ['模型(Model)']: + eval_df[col] = eval_df[col].astype(float) + eval_df[col] = eval_df[col].round(3) + # 筛选 fivemodels_list.tolist() 的行 + eval_df = eval_df[eval_df['模型(Model)'].isin(fivemodels_list)] + # df转置 + eval_df = eval_df.T + # df重置索引 + eval_df = eval_df.reset_index() + eval_df = eval_df.T + # # 添加表格 + data = eval_df.values.tolist() + col_width = 500/len(eval_df.columns) + content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_text('评估指标释义:')) + content.append(Graphs.draw_text('1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text('2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text('3. 
平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text('模型拟合:')) + # 添加图片 + content.append(Graphs.draw_img(os.path.join(dataset,'预测值与真实值对比图.png'))) + + # 附1,特征列表 + content.append(Graphs.draw_little_title('附1、特征列表:')) + df_fuyi = pd.read_csv(os.path.join(dataset,'特征频度统计.csv'),encoding='utf-8') + for col in df_fuyi.columns: + fuyi = df_fuyi[col] + fuyi = fuyi.dropna() + content.append(Graphs.draw_text(f'{col}:')) + for i in range(len(fuyi)): + content.append(Graphs.draw_text(f'{i+1}、{fuyi[i]}')) + + + + ### 生成pdf文件 + doc = SimpleDocTemplate(os.path.join(dataset,reportname), pagesize=letter) + # doc = SimpleDocTemplate(os.path.join(dataset,'reportname.pdf'), pagesize=letter) + doc.build(content) + # pdf 上传到数字化信息平台 + # 读取pdf并转为base64 + try: + if is_update_report: + with open(os.path.join(dataset,reportname), 'rb') as f: + base64_data = base64.b64encode(f.read()).decode('utf-8') + upload_data["data"]["fileBase64"] = base64_data + upload_data["data"]["fileName"] = reportname + token = get_head_auth_report() + upload_report_data(token, upload_data) + except TimeoutError as e: + print(f"请求超时: {e}") + +def pp_export_pdf(num_indicators=475,num_models=21, num_dayindicator=202,inputsize=5,dataset='dataset',time = '2024-07-30',reportname='report.pdf'): + global y + # 创建内容对应的空列表 + content = list() + + ### 添加标题 + content.append(Graphs.draw_title(f'{y}{time}预测报告')) + + ### 预测结果 + content.append(Graphs.draw_little_title('一、预测结果:')) + # 添加图片 + # 找出后缀是历史价格-预测值.png的图片 + # import glob + # imgs = glob.glob(os.path.join(dataset,'*历史价格-预测值.png')) + # for img in imgs: + # content.append(Graphs.draw_img(img)) + content.append(Graphs.draw_img(os.path.join(dataset,'历史价格-预测值.png'))) + + # 取df中y列为空的行 + import pandas as pd + df = pd.read_csv(os.path.join(dataset,'predict.csv'),encoding='gbk') + df_true = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') # 获取预测日期对应的真实值 + df_true = df_true[['ds','y']] + eval_df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8') + # 按评估指标排序,取前五 + fivemodels_list = eval_df['模型(Model)'].values # 列表形式,后面当作列名索引使用 + # 取 fivemodels_list 和 ds 列 + df = df[['ds'] + fivemodels_list.tolist() ] + # 拼接预测日期对应的真实值 + df = pd.merge(df, df_true, on='ds', how='left') + # 删除全部为nan的列 + df = df.dropna(how='all', axis=1) + # 选择除 'ds' 列外的数值列,并进行类型转换和四舍五入 + num_cols = [col for col in df.columns if col!= 'ds' and pd.api.types.is_numeric_dtype(df[col])] + for col in num_cols: + df[col] = df[col].astype(float).round(2) + # 添加最大值、最小值、平均值三列 + df['平均值'] = df[num_cols].mean(axis=1).round(2) + df['最大值'] = df[num_cols].max(axis=1) + df['最小值'] = df[num_cols].min(axis=1) + # df转置 + df = df.T + # df重置索引 + df = df.reset_index() + # 添加预测值表格 + data = df.values.tolist() + col_width = 500/len(df.columns) + content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:')) + df = pd.read_csv(os.path.join(dataset,'testandpredict_groupby.csv'),encoding='utf-8') + df4 = df.copy() # 计算偏差率使用 + # 计算模型偏差率 + #计算各列对于y列的差值百分比 + df3 = pd.DataFrame() # 存储偏差率 + + # 删除有null的行 + df4 = df4.dropna() + df3['ds'] = df4['ds'] + for col in df.columns: + if col not in ['y','ds','index']: + df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100,2) + # 找出决定系数前五的偏差率 + df3 = df3[['ds']+fivemodels_list.tolist()][-inputsize:] + # 找出上一预测区间的时间 + stime = df3['ds'].iloc[0] + etime = df3['ds'].iloc[-1] + # 添加偏差率表格 + fivemodels = '、'.join(eval_df['模型(Model)'].values[:5]) # 字符串形式,后面写入字符串使用 + 
content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime}的偏差率(%)分别是:')) + # # 添加偏差率表格 + df3 = df3.T + df3 = df3.reset_index() + data = df3.values.tolist() + col_width = 500/len(df3.columns) + content.append(Graphs.draw_table(col_width,*data)) + + + content.append(Graphs.draw_little_title('三、预测过程解析:')) + ### 特征、模型、参数配置 + content.append(Graphs.draw_little_title('模型选择:')) + content.append(Graphs.draw_text(f'本次预测使用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型:')) + content.append(Graphs.draw_text(f'使用40天的数据预测未来{inputsize}天的数据。')) + content.append(Graphs.draw_little_title('指标情况:')) + with open(os.path.join(dataset,'特征频度统计.txt'),encoding='utf-8') as f: + for line in f.readlines(): + content.append(Graphs.draw_text(line)) + + + + ### 特征工程 + # 计算特征相关性 + # 读取数据 + from scipy.stats import spearmanr + data = pd.read_csv(os.path.join(dataset,'指标数据添加时间特征.csv'),encoding='utf-8') + # 重命名预测列 + data.rename(columns={y: 'y'}, inplace=True) # 修改 + data['ds'] = pd.to_datetime(data['ds']) # 修改 + # 去掉ds列 + data.drop(columns=['ds'], inplace=True) + # 创建一个空的 DataFrame 来保存相关系数 + correlation_df = pd.DataFrame(columns=['Feature', 'Correlation']) + # 计算各特征与目标列的皮尔逊相关系数,并保存到新的 DataFrame 中 + for col in data.columns: + if col!= 'y': + pearson_correlation = np.corrcoef(data[col], data['y'])[0, 1] + spearman_correlation, _ = spearmanr(data[col], data['y']) + new_row = {'Feature': col, 'Pearson_Correlation': round(pearson_correlation,3), 'Spearman_Correlation': round(spearman_correlation,2)} + correlation_df = correlation_df._append(new_row, ignore_index=True) + + # 删除空列 + correlation_df.drop('Correlation', axis=1, inplace=True) + correlation_df.dropna(inplace=True) + correlation_df.to_csv(os.path.join(dataset,'指标相关性分析.csv'), index=False) + + data = correlation_df['Pearson_Correlation'].values.tolist() + # 生成 -1 到 1 的 20 个区间 + bins = np.linspace(-1, 1, 21) + # 计算每个区间的统计数(这里是区间内数据的数量) + hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)] + + #设置画布大小 + plt.figure(figsize=(10, 6)) + # 绘制直方图 + plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0])) + + # 添加标题和坐标轴标签 + plt.title('皮尔逊相关系数分布图') + plt.xlabel('区间') + plt.ylabel('统计数') + plt.savefig(os.path.join(dataset, '皮尔逊相关性系数.png')) + plt.close() + + + #设置画布大小 + plt.figure(figsize=(10, 6)) + data = correlation_df['Spearman_Correlation'].values.tolist() + # 计算每个区间的统计数(这里是区间内数据的数量) + hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)] + + # 绘制直方图 + plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0])) + + # 添加标题和坐标轴标签 + plt.title('斯皮尔曼相关系数分布图') + plt.xlabel('区间') + plt.ylabel('统计数') + plt.savefig(os.path.join(dataset, '斯皮尔曼相关性系数.png')) + plt.close() + content.append(Graphs.draw_text(f'指标相关性分析--皮尔逊相关系数:')) + # 皮尔逊正相关 不相关 负相关 的表格 + content.append(Graphs.draw_img(os.path.join(dataset,'皮尔逊相关性系数.png'))) + content.append(Graphs.draw_text('''皮尔逊相关系数说明:''')) + content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。''')) + content.append(Graphs.draw_text(''' + 相关系数为1:表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。''')) + content.append(Graphs.draw_text('''当前特征中正相关前十的有:''')) + top10_columns = correlation_df.sort_values(by='Pearson_Correlation',ascending=False).head(10)['Feature'].to_list() + top10 = ','.join(top10_columns) + content.append(Graphs.draw_text(f'''{top10}''')) + # 获取特征的近一月值 + feature_data_df = pd.read_csv(f'dataset/填充后的特征数据.csv', parse_dates=['ds']).tail(20) + feature_df = feature_data_df[['ds','y']+top10_columns] + # 
feature_df['ds'] = pd.to_datetime(df['ds'], format = '%Y-%m-%d' ) + # 遍历X每一列,和yy画散点图 , + for i, col in enumerate(feature_df.columns): + print(f'正在绘制第{i+1}个特征{col}与价格散点图...') + if col not in ['ds', 'y']: + fig, ax1 = plt.subplots(figsize=(10, 6)) + # 在第一个坐标轴上绘制数据 + ax1.plot(feature_df['ds'], feature_df['y'], 'b-') + ax1.set_xlabel('日期') + ax1.set_ylabel('y', color='b') + ax1.tick_params('y', colors='b') + # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(1,len(feature_df),2): + value = feature_df['y'].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10) + # 创建第二个坐标轴 + ax2 = ax1.twinx() + # 在第二个坐标轴上绘制数据 + line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-') + ax2.set_ylabel(col, color='r') + ax2.tick_params('y', colors='r') + # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(0,len(feature_df),2): + value = feature_df[col].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10) + # 添加标题 + plt.title(col) + # 设置横坐标为日期格式并自动调整 + locator = mdates.AutoDateLocator() + formatter = mdates.AutoDateFormatter(locator) + ax1.xaxis.set_major_locator(locator) + ax1.xaxis.set_major_formatter(formatter) + # 文件名特殊字符处理 + col = col.replace('*', '-') + col = col.replace(':', '-') + plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png')) + content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png'))) + plt.close() + + content.append(Graphs.draw_text('''相关系数为-1:表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的''')) + content.append(Graphs.draw_text('''当前特征中负相关前十的有:''')) + tail10_columns = correlation_df.sort_values(by='Pearson_Correlation',ascending=True).head(10)['Feature'].to_list() + top10 = ','.join(tail10_columns) + content.append(Graphs.draw_text(f'''{top10}''')) + # 获取特征的近一周值 + feature_df = feature_data_df[['ds','y']+tail10_columns] + # 遍历X每一列,和yy画散点图 , + for i, col in enumerate(feature_df.columns): + print(f'正在绘制第{i+1}个特征{col}与价格散点图...') + if col not in ['ds', 'y']: + fig, ax1 = plt.subplots(figsize=(10, 6)) + # 在第一个坐标轴上绘制数据 + ax1.plot(feature_df['ds'], feature_df['y'], 'b-') + ax1.set_xlabel('日期') + ax1.set_ylabel('y', color='b') + ax1.tick_params('y', colors='b') + # 在 ax1 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(len(feature_df)): + if j%2 == 1: + value = feature_df['y'].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax1.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='b', fontsize=10) + # 创建第二个坐标轴 + ax2 = ax1.twinx() + # 在第二个坐标轴上绘制数据 + line2 = ax2.plot(feature_df['ds'], feature_df[col], 'r-') + ax2.set_ylabel(col, color='r') + ax2.tick_params('y', colors='r') + # 在 ax2 上添加文本显示值,添加一定的偏移避免值与曲线重叠 + for j in range(1,len(feature_df),2): + value = feature_df[col].iloc[j] + date = feature_df['ds'].iloc[j] + offset = 1.001 + ax2.text(date, value * offset, str(round(value, 2)), ha='center', va='bottom', color='r', fontsize=10) + # 添加标题 + plt.title(col) + # 设置横坐标为日期格式并自动调整 + locator = mdates.AutoDateLocator() + formatter = mdates.AutoDateFormatter(locator) + ax1.xaxis.set_major_locator(locator) + ax1.xaxis.set_major_formatter(formatter) + # 文件名特殊字符处理 + col = col.replace('*', '-') + col = col.replace(':', '-') + plt.savefig(os.path.join(dataset, f'{col}与价格散点图.png')) + content.append(Graphs.draw_img(os.path.join(dataset, f'{col}与价格散点图.png'))) + plt.close() + 
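Note: this dual-axis feature-versus-price plot is repeated almost verbatim several times in this file (positive/negative Pearson, positive/negative Spearman), and only some of the copies strip '/' from the file name before saving, which can make savefig fail for indicator names containing a slash. A sketch of a shared helper; plot_feature_vs_price is a new name, and the per-point value annotations are omitted for brevity:

import os
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

def plot_feature_vs_price(feature_df, col, dataset):
    # Price (y) on the left axis, the feature column on the right; returns the saved image path.
    fig, ax1 = plt.subplots(figsize=(10, 6))
    ax1.plot(feature_df['ds'], feature_df['y'], 'b-')
    ax1.set_xlabel('日期')
    ax1.set_ylabel('y', color='b')
    ax1.tick_params('y', colors='b')
    ax2 = ax1.twinx()
    ax2.plot(feature_df['ds'], feature_df[col], 'r-')
    ax2.set_ylabel(col, color='r')
    ax2.tick_params('y', colors='r')
    plt.title(col)
    locator = mdates.AutoDateLocator()
    ax1.xaxis.set_major_locator(locator)
    ax1.xaxis.set_major_formatter(mdates.AutoDateFormatter(locator))
    # Sanitize every special character consistently before building the file name.
    safe_col = col.replace('*', '-').replace(':', '-').replace('/', '-')
    img_path = os.path.join(dataset, f'{safe_col}与价格散点图.png')
    plt.savefig(img_path, bbox_inches='tight')
    plt.close(fig)
    return img_path

# Each loop above would then reduce to (hypothetical usage):
# for col in feature_df.columns:
#     if col not in ('ds', 'y'):
#         content.append(Graphs.draw_img(plot_feature_vs_price(feature_df, col, dataset)))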
content.append(Graphs.draw_text('''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。''')) + content.append(Graphs.draw_text(f'指标相关性分析--斯皮尔曼相关系数:')) + # 皮尔逊正相关 不相关 负相关 的表格 + content.append(Graphs.draw_img(os.path.join(dataset,'斯皮尔曼相关性系数.png'))) + content.append(Graphs.draw_text('斯皮尔曼相关系数(Spearmans rank correlation coefficient)是一种用于衡量两个变量之间的单调关系(不一定是线性关系)的统计指标。')) + content.append(Graphs.draw_text('它的计算基于变量的秩次(即变量值的排序位置)而非变量的原始值。')) + content.append(Graphs.draw_text('斯皮尔曼相关系数的取值范围在 -1 到 1 之间。')) + content.append(Graphs.draw_text('当系数为 1 时,表示两个变量之间存在完全正的单调关系;')) + content.append(Graphs.draw_text('''当前特征中正单调关系前十的有:''')) + top10 = ','.join(correlation_df.sort_values(by='Spearman_Correlation',ascending=False).head(10)['Feature']) + content.append(Graphs.draw_text(f'''{top10}''')) + content.append(Graphs.draw_text('当系数为 -1 时,表示存在完全负的单调关系;')) + content.append(Graphs.draw_text('''当前特征中负单调关系前十的有:''')) + top10 = ','.join(correlation_df.sort_values(by='Spearman_Correlation',ascending=True).head(10)['Feature']) + content.append(Graphs.draw_text(f'''{top10}''')) + content.append(Graphs.draw_text('当系数为 0 时,表示两个变量之间不存在单调关系。')) + content.append(Graphs.draw_text('与皮尔逊相关系数相比,斯皮尔曼相关系数对于数据中的异常值不敏感,更适用于处理非线性关系或存在极端值的数据。')) + content.append(Graphs.draw_little_title('模型选择:')) + content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练拟合,通过评估指标MAE从小到大排列,前5个模型的简介如下:')) + + ### 读取模型简介 + with open(os.path.join(dataset,'model_introduction.txt'), 'r', encoding='utf-8') as f: + for line in f: + line_split = line.strip().split('--') + if line_split[0] in fivemodels_list: + for introduction in line_split: + content.append(Graphs.draw_text(introduction)) + + content.append(Graphs.draw_little_title('模型评估:')) + + df = pd.read_csv(os.path.join(dataset,'model_evaluation.csv'),encoding='utf-8') + # 判断 df 的数值列转为float + for col in eval_df.columns: + if col not in ['模型(Model)']: + eval_df[col] = eval_df[col].astype(float) + eval_df[col] = eval_df[col].round(3) + # 筛选 fivemodels_list.tolist() 的行 + eval_df = eval_df[eval_df['模型(Model)'].isin(fivemodels_list)] + # df转置 + eval_df = eval_df.T + # df重置索引 + eval_df = eval_df.reset_index() + eval_df = eval_df.T + # # 添加表格 + data = eval_df.values.tolist() + col_width = 500/len(eval_df.columns) + content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_text('评估指标释义:')) + content.append(Graphs.draw_text('1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text('2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text('3. 
平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text('模型拟合:')) + # 添加图片 + content.append(Graphs.draw_img(os.path.join(dataset,'预测值与真实值对比图.png'))) + + # 附1,特征列表 + content.append(Graphs.draw_little_title('附1、特征列表:')) + df_fuyi = pd.read_csv(os.path.join(dataset,'特征频度统计.csv'),encoding='utf-8') + for col in df_fuyi.columns: + fuyi = df_fuyi[col] + fuyi = fuyi.dropna() + content.append(Graphs.draw_text(f'{col}:')) + for i in range(len(fuyi)): + content.append(Graphs.draw_text(f'{i+1}、{fuyi[i]}')) + + ### 生成pdf文件 + doc = SimpleDocTemplate(os.path.join(dataset,reportname), pagesize=letter) + # doc = SimpleDocTemplate(os.path.join(dataset,'reportname.pdf'), pagesize=letter) + doc.build(content) + # pdf 上传到数字化信息平台 + # 读取pdf并转为base64 + try: + if is_update_report: + with open(os.path.join(dataset,reportname), 'rb') as f: + base64_data = base64.b64encode(f.read()).decode('utf-8') + upload_data["data"]["fileBase64"] = base64_data + upload_data["data"]["fileName"] = reportname + token = get_head_auth_report() + upload_report_data(token, upload_data) + except TimeoutError as e: + print(f"请求超时: {e}") + + + + +def tansuanli_export_pdf(num_indicators=475,num_models=22, num_dayindicator=202,inputsize=5,dataset='dataset',y='电碳价格',end_time='2024-07-30',reportname='tansuanli.pdf'): + # 创建内容对应的空列表 + content = list() + ### 添加标题 + content.append(Graphs.draw_title(f'{y}{end_time}预测报告')) + ### 预测结果 + content.append(Graphs.draw_little_title('一、预测结果:')) + content.append(Graphs.draw_img(os.path.join(dataset,'历史价格-预测值.png'))) + # 取df中y列为空的行 + from lib.dataread import loadcsv + df = loadcsv(os.path.join(dataset,'predict.csv')) + df_true = loadcsv(os.path.join(dataset,'指标数据添加时间特征.csv')) # 获取预测日期对应的真实值 + df_true = df_true[['ds','y']] + eval_df = loadcsv(os.path.join(dataset,'model_evaluation.csv')) + # 按评估指标排序,取前五 + fivemodels_list = eval_df['模型(Model)'].values[:5] # 列表形式,后面当作列名索引使用 + # 取 fivemodels_list 和 ds 列 + df = df[['ds'] + fivemodels_list.tolist() ] + # 拼接预测日期对应的真实值 + df = pd.merge(df, df_true, on='ds', how='left') + # 删除全部为nan的列 + df = df.dropna(how='all', axis=1) + # 选择除 'ds' 列外的数值列,并进行类型转换和四舍五入 + num_cols = [col for col in df.columns if col!= 'ds' and pd.api.types.is_numeric_dtype(df[col])] + for col in num_cols: + df[col] = df[col].astype(float).round(2) + # 添加预测每日的最大值、最小值、平均值三列 + df['平均值'] = df[num_cols].mean(axis=1).round(2) + df['最大值'] = df[num_cols].max(axis=1) + df['最小值'] = df[num_cols].min(axis=1) + # 添加模型预测周期内的最大值、最小值、平均值三行 + # 计算列的统计值 + mean_values = df[num_cols].mean(axis=0).round(2) + max_values = df[num_cols].max(axis=0) + min_values = df[num_cols].min(axis=0) + # 创建一个新的 DataFrame 来存储统计行 + stats_row = pd.DataFrame([mean_values, max_values, min_values], index=[0,1,2]) + stats_row['ds'] = ['平均值', '最大值', '最小值'] + # 将统计行添加到原始 DataFrame + df = pd.concat([df, stats_row], axis=0) + # df替换nan 为 '--' + df = df.fillna('--') + # df转置 + df = df.T + # df重置索引 + df = df.reset_index() + # 添加预测值表格 + data = df.values.tolist() + col_width = 500/len(df.columns) + content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_little_title('二、上一预测周期偏差率分析:')) + df = loadcsv(os.path.join(dataset,'testandpredict_groupby.csv')) + df4 = df.copy() # 计算偏差率使用 + # 计算模型偏差率 + #计算各列对于y列的差值百分比 + df3 = pd.DataFrame() # 存储偏差率 + + # 删除有null的行 + df4 = df4.dropna() + df3['ds'] = df4['ds'] + for col in df.columns: + if col not in ['y','ds','index']: + df3[col] = round(abs(df4[col] - df4['y']) / df4['y'] * 100,2) + # 找出决定系数前五的偏差率 + df3 = df3[['ds']+fivemodels_list.tolist()][-inputsize:] 
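For reference, the deviation rate tabulated below is simply the absolute percentage error of each of the selected models against the realised price over the last prediction window. A worked example with illustrative numbers (not taken from the data):

y_true, y_pred = 80.0, 82.4                        # illustrative values only
round(abs(y_pred - y_true) / y_true * 100, 2)      # -> 3.0, i.e. a deviation rate of 3.0%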
+ # 找出上一预测区间的时间 + stime = df3['ds'].iloc[0] + etime = df3['ds'].iloc[-1] + # 添加偏差率表格 + fivemodels = '、'.join(eval_df['模型(Model)'].values[:5]) # 字符串形式,后面写入字符串使用 + content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练,使用评估结果MAE前五的模型分别是 {fivemodels} ,模型上一预测区间 {stime} -- {etime}的偏差率(%)分别是:')) + # # 添加偏差率表格 + df3 = df3.T + df3 = df3.reset_index() + df3 = df3.T + data = df3.values.tolist() + col_width = 500/len(df3.columns) + content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_little_title('三、预测过程解析:')) + ### 特征、模型、参数配置 + content.append(Graphs.draw_text(f'本次预测使用了给定的28个指标(列名重复的排除后)作为特征,应用了一个专门收集时间序列的NeuralForecast库中的{num_models}个模型。')) + content.append(Graphs.draw_text(f'使用10天的数据预测未来{inputsize}天的数据。')) + content.append(Graphs.draw_little_title('指标情况:')) + content.append(Graphs.draw_text(' 指标频度包括')) + # 添加频度统计表格 + pindu_df = loadcsv(os.path.join(dataset,'特征频度统计.csv')) + pindu_df.fillna('-', inplace=True) + pindu_df = pindu_df.T + pindu_df = pindu_df.reset_index() + pindu_df = pindu_df.T + data = pindu_df.values.tolist() + col_width = 500/len(pindu_df.columns) + content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_text(f'从指标特征的频度信息来看,月度指标占比最高,而我们需要进行预测的指标为日度的,所以本数据集中月度和周度指标需要进行插值处理。')) + content.append(Graphs.draw_text(' 数据特征工程:')) + content.append(Graphs.draw_text('1. 数据日期排序,新日期在最后')) + content.append(Graphs.draw_text('2. 删除空列,特征数据列没有值,就删除')) + content.append(Graphs.draw_text('3. 周度、月度特征填充为日度数据,填充规则:')) + content.append(Graphs.draw_text(' -- 向后填充,举例:假设周五出现一个周度指标数据,那么在这之前的数据用上周五的数据')) + content.append(Graphs.draw_text(' -- 向前填充,举例:采集数据开始日期为2018年1月1日,那么周度数据可能是2018年1月3日,那么3日的数据向前填充,使1日2日都有数值')) + content.append(Graphs.draw_text(f'以上处理其实并不合理,但结合我们想要的结果,我们选择了这种处理方式。')) + content.append(Graphs.draw_text(f'一般来讲,指标数据的频度和预测列是一致的,我们可以考虑预测月度的目标列,不过这样的话,月度数据太少了,不足以用来训练模型。')) + + ### 特征工程 + # 预测列分析 + content.append(Graphs.draw_text(' 电碳价格自相关ACF和偏自相关PACF分析:')) + content.append(Graphs.draw_img(os.path.join(dataset,'指标数据自相关图.png'))) + content.append(Graphs.draw_img(os.path.join(dataset,'指标数据偏自相关图.png'))) + content.append(Graphs.draw_text(' 解读:')) + content.append(Graphs.draw_text(' 自相关函数的取值范围为 [-1, 1]。正值表示信号在不同时间点之间具有正相关性,负值表示信号具有负相关性,而 0 表示信号在不同时间点之间不相关。 ')) + content.append(Graphs.draw_text(' 偏自相关函数(PACF)则是在控制了中间的滞后项影响后,特定滞后项与当前项的相关性。 ')) + content.append(Graphs.draw_text(' 当前目标列表现出的 ACF 呈现出拖尾的特征,而 PACF 在1个滞后阶数后截尾,这说明目标值适合使用自回归(AR)模型 ')) + content.append(Graphs.draw_text(' 数据特征可视化分析:')) + # 找出所有后缀为散点图.png的文件 + import glob + scatter_files = glob.glob(os.path.join(dataset,'*散点图.png')) + for file in scatter_files: + content.append(Graphs.draw_img(file)) + content.append(Graphs.draw_text(' 解读:')) + content.append(Graphs.draw_text(' 观察特征与目标列的散点图,我们可以直观的感受到特征与我们要预测的列没有明显的趋势相关,需要考虑选取的特征合理。 ')) + content.append(Graphs.draw_text(' 数据特征相关性分析:')) + # 计算特征相关性 + # 读取数据 + from scipy.stats import spearmanr + data = loadcsv(os.path.join(dataset,'指标数据添加时间特征.csv')) + # 重命名预测列 + data.rename(columns={y: 'y'}, inplace=True) # 修改 + from lib.tools import dateConvert + data = dateConvert(data) # 修改 + # 去掉ds列 + data.drop(columns=['ds'], inplace=True) + # 创建一个空的 DataFrame 来保存相关系数 + correlation_df = pd.DataFrame(columns=['Feature', 'Correlation']) + # 计算各特征与目标列的皮尔逊相关系数,并保存到新的 DataFrame 中 + for col in data.columns: + if col!= 'y': + pearson_correlation = np.corrcoef(data[col], data['y'])[0, 1] + spearman_correlation, _ = spearmanr(data[col], data['y']) + new_row = {'Feature': col, 'Pearson_Correlation': round(pearson_correlation,3), 'Spearman_Correlation': 
round(spearman_correlation,2)} + correlation_df = correlation_df._append(new_row, ignore_index=True) + + # 删除空列 + correlation_df.drop('Correlation', axis=1, inplace=True) + correlation_df.dropna(inplace=True) + correlation_df.to_csv(os.path.join(dataset,'指标相关性分析.csv'), index=False) + data = correlation_df['Pearson_Correlation'].values.tolist() + # 生成 -1 到 1 的 20 个区间 + bins = np.linspace(-1, 1, 21) + # 计算每个区间的统计数(这里是区间内数据的数量) + hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)] + #设置画布大小 + plt.figure(figsize=(10, 6)) + # 绘制直方图 + plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0])) + # 添加标题和坐标轴标签 + plt.title('皮尔逊相关系数分布图') + plt.xlabel('区间') + plt.ylabel('统计数') + plt.savefig(os.path.join(dataset, '皮尔逊相关性系数.png')) + plt.close() + #设置画布大小 + plt.figure(figsize=(10, 6)) + data = correlation_df['Spearman_Correlation'].values.tolist() + # 计算每个区间的统计数(这里是区间内数据的数量) + hist_values = [np.sum((data >= bins[i]) & (data < bins[i + 1])) for i in range(len(bins) - 1)] + # 绘制直方图 + plt.bar(bins[:-1], hist_values, width=(bins[1] - bins[0])) + # 添加标题和坐标轴标签 + plt.title('斯皮尔曼相关系数分布图') + plt.xlabel('区间') + plt.ylabel('统计数') + plt.savefig(os.path.join(dataset, '斯皮尔曼相关性系数.png')) + plt.close() + content.append(Graphs.draw_text(f'指标相关性分析--皮尔逊相关系数:')) + # 皮尔逊正相关 不相关 负相关 的表格 + content.append(Graphs.draw_img(os.path.join(dataset,'皮尔逊相关性系数.png'))) + content.append(Graphs.draw_text('''皮尔逊相关系数说明:''')) + content.append(Graphs.draw_text('''衡量两个特征之间的线性相关性。''')) + content.append(Graphs.draw_text(''' + 相关系数为1:表示两个变量之间存在完全正向的线性关系,即当一个变量增加时,另一个变量也相应增加,且变化是完全一致的。''')) + content.append(Graphs.draw_text('''当前特征中正相关前十的有:''')) + top10 = ','.join(correlation_df.sort_values(by='Pearson_Correlation',ascending=False).head(10)['Feature']) + content.append(Graphs.draw_text(f'''{top10}''')) + content.append(Graphs.draw_text('''相关系数为-1:表示两个变量之间存在完全负向的线性关系,即当一个变量增加时,另一个变量会相应减少,且变化是完全相反的''')) + content.append(Graphs.draw_text('''当前特征中负相关前十的有:''')) + top10 = ','.join(correlation_df.sort_values(by='Pearson_Correlation',ascending=True).head(10)['Feature']) + content.append(Graphs.draw_text(f'''{top10}''')) + content.append(Graphs.draw_text('''相关系数接近0:表示两个变量之间不存在线性关系,即它们的变化不会随着对方的变化而变化。''')) + content.append(Graphs.draw_text(f'指标相关性分析--斯皮尔曼相关系数:')) + # 皮尔逊正相关 不相关 负相关 的表格 + content.append(Graphs.draw_img(os.path.join(dataset,'斯皮尔曼相关性系数.png'))) + content.append(Graphs.draw_text('斯皮尔曼相关系数(Spearmans rank correlation coefficient)是一种用于衡量两个变量之间的单调关系(不一定是线性关系)的统计指标。')) + content.append(Graphs.draw_text('它的计算基于变量的秩次(即变量值的排序位置)而非变量的原始值。')) + content.append(Graphs.draw_text('斯皮尔曼相关系数的取值范围在 -1 到 1 之间。')) + content.append(Graphs.draw_text('当系数为 1 时,表示两个变量之间存在完全正的单调关系;')) + content.append(Graphs.draw_text('''当前特征中正单调关系前十的有:''')) + top10 = ','.join(correlation_df.sort_values(by='Spearman_Correlation',ascending=False).head(10)['Feature']) + content.append(Graphs.draw_text(f'''{top10}''')) + content.append(Graphs.draw_text('当系数为 -1 时,表示存在完全负的单调关系;')) + content.append(Graphs.draw_text('''当前特征中负单调关系前十的有:''')) + top10 = ','.join(correlation_df.sort_values(by='Spearman_Correlation',ascending=True).head(10)['Feature']) + content.append(Graphs.draw_text(f'''{top10}''')) + content.append(Graphs.draw_text('当系数为 0 时,表示两个变量之间不存在单调关系。')) + content.append(Graphs.draw_text('与皮尔逊相关系数相比,斯皮尔曼相关系数对于数据中的异常值不敏感,更适用于处理非线性关系或存在极端值的数据。')) + content.append(Graphs.draw_little_title('模型选择:')) + content.append(Graphs.draw_text(f'预测使用了{num_models}个模型进行训练拟合,模型的简介如下:')) + + ### 读取模型简介 + with open(os.path.join(dataset,'model_introduction.txt'), 
'r', encoding='utf-8') as f: + for line in f: + line_split = line.strip().split('--') + # if line_split[0] in fivemodels_list: + for introduction in line_split: + content.append(Graphs.draw_text(introduction)) + + content.append(Graphs.draw_little_title('模型评估:')) + content.append(Graphs.draw_text(f'通过评估指标MAE从小到大排列,前5个模型的评估详情如下:')) + df = loadcsv(os.path.join(dataset,'model_evaluation.csv')) + # 判断 df 的数值列转为float + for col in eval_df.columns: + if col not in ['模型(Model)']: + eval_df[col] = eval_df[col].astype(float) + eval_df[col] = eval_df[col].round(3) + # 筛选 fivemodels_list.tolist() 的行 + eval_df = eval_df[eval_df['模型(Model)'].isin(fivemodels_list)] + # df转置 + eval_df = eval_df.T + # df重置索引 + eval_df = eval_df.reset_index() + eval_df = eval_df.T + # # 添加表格 + data = eval_df.values.tolist() + col_width = 500/len(eval_df.columns) + content.append(Graphs.draw_table(col_width,*data)) + content.append(Graphs.draw_text('评估指标释义:')) + content.append(Graphs.draw_text('1. 均方根误差(RMSE):均方根误差是衡量预测值与实际值之间误差的一种方法,先计算预测值与真实值的差值的平方,然后对这些平方差求平均值,最后取平均值的平方根。取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text('2. 平均绝对误差(MAE):平均绝对误差是衡量预测值与实际值之间误差的一种方法,对预测值与真实值之间差值的绝对值进行求和,然后除以样本数量。取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text('3. 平均平方误差(MSE):平均平方误差是衡量预测值与实际值之间误差的一种方法,先计算预测值与真实值之差的平方,然后对这些平方差求平均值。取值越小,误差越小,预测效果越好。')) + content.append(Graphs.draw_text('模型拟合:')) + # 添加图片 + content.append(Graphs.draw_img(os.path.join(dataset,'预测值与真实值对比图.png'))) + ### 生成pdf文件 + doc = SimpleDocTemplate(os.path.join(dataset,reportname), pagesize=letter) + doc.build(content) diff --git a/pushdata.py b/pushdata.py new file mode 100644 index 0000000..73c422a --- /dev/null +++ b/pushdata.py @@ -0,0 +1,104 @@ +# 读取配置 +from config_jingbo import * +# from config_tansuanli import * +from lib.tools import * +from lib.dataread import * +from models.nerulforcastmodels import ex_Model,model_losss,brent_export_pdf,tansuanli_export_pdf +from models.lstmmodels import ex_Lstm_M,ex_Lstm +from models.grumodels import ex_GRU +import glob +import torch +torch.set_float32_matmul_precision("high") + +if __name__ == '__main__': + + signature = BinanceAPI(APPID, SECRET) + etadata = EtaReader(signature=signature, + classifylisturl = classifylisturl, + classifyidlisturl=classifyidlisturl, + edbcodedataurl=edbcodedataurl, + edbcodelist=edbcodelist, + edbdatapushurl = edbdatapushurl, + edbdeleteurl = edbdeleteurl, + edbbusinessurl = edbbusinessurl + ) + + models = [ + 'NHITS', + 'Informer', + 'LSTM', + 'iTransformer', + 'TSMixer', + 'TSMixerx', + 'PatchTST', + 'RNN', + 'GRU', + 'TCN', + 'BiTCN', + 'DilatedRNN', + 'MLP', + 'DLinear', + 'NLinear', + 'TFT', + 'FEDformer', + 'StemGNN', + 'MLPMultivariate', + 'TiDE', + 'DeepNPT'] + + # eta自由数据指标编码 + modelsindex = { + 'NHITS': 'SELF0000001', + 'Informer':'SELF0000057', + 'LSTM':'SELF0000058', + 'iTransformer':'SELF0000059', + 'TSMixer':'SELF0000060', + 'TSMixerx':'SELF0000061', + 'PatchTST':'SELF0000062', + 'RNN':'SELF0000063', + 'GRU':'SELF0000064', + 'TCN':'SELF0000065', + 'BiTCN':'SELF0000066', + 'DilatedRNN':'SELF0000067', + 'MLP':'SELF0000068', + 'DLinear':'SELF0000069', + 'NLinear':'SELF0000070', + 'TFT':'SELF0000071', + 'FEDformer':'SELF0000072', + 'StemGNN':'SELF0000073', + 'MLPMultivariate':'SELF0000074', + 'TiDE':'SELF0000075', + 'DeepNPT':'SELF0000076' + } + + # df_predict = pd.read_csv('dataset/predict.csv',encoding='gbk') + # # df_predict.rename(columns={'ds':'Date'},inplace=True) + # for m in modelsindex.keys(): + # list = [] + # for date,value in zip(df_predict['ds'],df_predict[m]): + # 
list.append({'Date':date,'Value':value}) + # data['DataList'] = list + # data['IndexCode'] = modelsindex[m] + # data['IndexName'] = f'价格预测{m}模型' + # data['Remark'] = m + # # print(data['DataList']) + # etadata.push_data(data) + + # 删除指标 + # IndexCodeList = ['SELF0000055'] + # for i in range(1,57): + # if i < 10 : i = f'0{i}' + # IndexCodeList.append(f'SELF00000{i}') + # print(IndexCodeList) + # etadata.del_zhibiao(IndexCodeList) + + # 删除特定日期的值 + indexcodelist = modelsindex.values() + for indexcode in indexcodelist: + data = { + "IndexCode": indexcode, #指标编码 + "StartDate": "2020-04-20", #指标需要删除的开始日期(>=),如果开始日期和结束日期相等,那么就是删除该日期 + "EndDate": "2024-05-28" #指标需要删除的结束日期(<=),如果开始日期和结束日期相等,那么就是删除该日期 + } + + # etadata.del_business(data) \ No newline at end of file diff --git a/八个维度demo copy.py b/八个维度demo copy.py new file mode 100644 index 0000000..6f06890 --- /dev/null +++ b/八个维度demo copy.py @@ -0,0 +1,62 @@ +import logging +import os + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from neuralforecast import NeuralForecast +from neuralforecast.models import NHITS +from neuralforecast.utils import AirPassengersPanel +from mlforecast.utils import PredictionIntervals +from neuralforecast.losses.pytorch import DistributionLoss, MAE + +os.environ['NIXTLA_ID_AS_COL'] = '1' + + +AirPassengersPanel_train = AirPassengersPanel[AirPassengersPanel['ds'] < AirPassengersPanel['ds'].values[-12]].reset_index(drop=True) +AirPassengersPanel_test = AirPassengersPanel[AirPassengersPanel['ds'] >= AirPassengersPanel['ds'].values[-12]].reset_index(drop=True) +AirPassengersPanel_test['y'] = np.nan +AirPassengersPanel_test['y_[lag12]'] = np.nan + + +horizon = 12 +input_size = 24 + +prediction_intervals = PredictionIntervals() + +models = [NHITS(h=horizon, input_size=input_size, max_steps=100, loss=MAE(), scaler_type="robust"), + NHITS(h=horizon, input_size=input_size, max_steps=100, loss=DistributionLoss("Normal", level=[90]), scaler_type="robust")] +nf = NeuralForecast(models=models, freq='ME') +nf.fit(AirPassengersPanel_train, prediction_intervals=prediction_intervals) + + +preds = nf.predict(futr_df=AirPassengersPanel_test, level=[90]) + +fig, (ax1, ax2) = plt.subplots(2, 1, figsize = (20, 7)) +plot_df = pd.concat([AirPassengersPanel_train, preds]) + +plot_df = plot_df[plot_df['unique_id']=='Airline1'].drop(['unique_id','trend','y_[lag12]'], axis=1).iloc[-50:] + +ax1.plot(plot_df['ds'], plot_df['y'], c='black', label='True') +ax1.plot(plot_df['ds'], plot_df['NHITS'], c='blue', label='median') +ax1.fill_between(x=plot_df['ds'][-12:], + y1=plot_df['NHITS-lo-90'][-12:].values, + y2=plot_df['NHITS-hi-90'][-12:].values, + alpha=0.4, label='level 90') +ax1.set_title('AirPassengers Forecast - Uncertainty quantification using Conformal Prediction', fontsize=18) +ax1.set_ylabel('Monthly Passengers', fontsize=15) +ax1.set_xticklabels([]) +ax1.legend(prop={'size': 10}) +ax1.grid() + +ax2.plot(plot_df['ds'], plot_df['y'], c='black', label='True') +ax2.plot(plot_df['ds'], plot_df['NHITS1'], c='blue', label='median') +ax2.fill_between(x=plot_df['ds'][-12:], + y1=plot_df['NHITS1-lo-90'][-12:].values, + y2=plot_df['NHITS1-hi-90'][-12:].values, + alpha=0.4, label='level 90') +ax2.set_title('AirPassengers Forecast - Uncertainty quantification using Normal distribution', fontsize=18) +ax2.set_ylabel('Monthly Passengers', fontsize=15) +ax2.set_xlabel('Timestamp [t]', fontsize=15) +ax2.legend(prop={'size': 10}) +ax2.grid() diff --git a/八个维度demo.py b/八个维度demo.py new file mode 100644 index 0000000..74f60e5 --- 
/dev/null +++ b/八个维度demo.py @@ -0,0 +1,200 @@ +import pandas as pd +from datasetsforecast.long_horizon import LongHorizon + +# Change this to your own data to try the model +Y_df, _, _ = LongHorizon.load(directory='./', group='ETTm2') +Y_df['ds'] = pd.to_datetime(Y_df['ds']) + +# For this excercise we are going to take 20% of the DataSet +n_time = len(Y_df.ds.unique()) +val_size = int(.2 * n_time) +test_size = int(.2 * n_time) + +Y_df.groupby('unique_id').head(2) + +import matplotlib.pyplot as plt + +# We are going to plot the temperature of the transformer +# and marking the validation and train splits +u_id = 'HUFL' +x_plot = pd.to_datetime(Y_df[Y_df.unique_id==u_id].ds) +y_plot = Y_df[Y_df.unique_id==u_id].y.values + +x_val = x_plot[n_time - val_size - test_size] +x_test = x_plot[n_time - test_size] + +fig = plt.figure(figsize=(10, 5)) +fig.tight_layout() + +plt.plot(x_plot, y_plot) +plt.xlabel('Date', fontsize=17) +plt.ylabel('HUFL [15 min temperature]', fontsize=17) + +plt.axvline(x_val, color='black', linestyle='-.') +plt.axvline(x_test, color='black', linestyle='-.') +plt.text(x_val, 5, ' Validation', fontsize=12) +plt.text(x_test, 5, ' Test', fontsize=12) + +plt.grid() + + +from ray import tune +from neuralforecast.auto import AutoNHITS +from neuralforecast.core import NeuralForecast + +horizon = 96 # 24hrs = 4 * 15 min. + +# Use your own config or AutoNHITS.default_config +nhits_config = { + "learning_rate": tune.choice([1e-3]), # Initial Learning rate + "max_steps": tune.choice([1000]), # Number of SGD steps + "input_size": tune.choice([5 * horizon]), # input_size = multiplier * horizon + "batch_size": tune.choice([7]), # Number of series in windows + "windows_batch_size": tune.choice([256]), # Number of windows in batch + "n_pool_kernel_size": tune.choice([[2, 2, 2], [16, 8, 1]]), # MaxPool's Kernelsize + "n_freq_downsample": tune.choice([[168, 24, 1], [24, 12, 1], [1, 1, 1]]), # Interpolation expressivity ratios + "activation": tune.choice(['ReLU']), # Type of non-linear activation + "n_blocks": tune.choice([[1, 1, 1]]), # Blocks per each 3 stacks + "mlp_units": tune.choice([[[512, 512], [512, 512], [512, 512]]]), # 2 512-Layers per block for each stack + "interpolation_mode": tune.choice(['linear']), # Type of multi-step interpolation + "val_check_steps": tune.choice([100]), # Compute validation every 100 epochs + "random_seed": tune.randint(1, 10), + } + +tft_config = { + "input_size": tune.choice([horizon]), + "hidden_size": tune.choice([32]), + "n_head": tune.choice([2]), + "learning_rate": tune.loguniform(1e-4, 1e-1), + "scaler_type": tune.choice(['robust', 'standard']), + "max_steps": tune.choice([500, 1000]), + "windows_batch_size": tune.choice([32]), + "check_val_every_n_epoch": tune.choice([100]), + "random_seed": tune.randint(1, 20), +} + + +tsmixer_config = { + "input_size": input_size, # Size of input window + "max_steps": tune.choice([500, 1000, 2000]), # Number of training iterations + "val_check_steps": 100, # Compute validation every x steps + "early_stop_patience_steps": 5, # Early stopping steps + "learning_rate": tune.loguniform(1e-4, 1e-2), # Initial Learning rate + "n_block": tune.choice([1, 2, 4, 6, 8]), # Number of mixing layers + "dropout": tune.uniform(0.0, 0.99), # Dropout + "ff_dim": tune.choice([32, 64, 128]), # Dimension of the feature linear layer + "scaler_type": 'identity', + } + +tsmixerx_config = tsmixer_config.copy() +tsmixerx_config['futr_exog_list'] = ['ex_1', 'ex_2', 'ex_3', 'ex_4'] + +models = [AutoNHITS(h=horizon, + config=nhits_config, + 
num_samples=5), + AutoTFT(h=horizon, + loss=MAE(), + config=tft_config, + num_samples=3), + TSMixer(h=horizon, + input_size=input_size, + n_series=7, + max_steps=1000, + val_check_steps=100, + early_stop_patience_steps=5, + scaler_type='identity', + valid_loss=MAE(), + random_seed=12345678, + ), + TSMixerx(h=horizon, + input_size=input_size, + n_series=7, + max_steps=1000, + val_check_steps=100, + early_stop_patience_steps=5, + scaler_type='identity', + dropout=0.7, + valid_loss=MAE(), + random_seed=12345678, + futr_exog_list=['ex_1', 'ex_2', 'ex_3', 'ex_4'], + ), + MLPMultivariate(h=horizon, + input_size=input_size, + n_series=7, + max_steps=1000, + val_check_steps=100, + early_stop_patience_steps=5, + scaler_type='standard', + hidden_size=256, + valid_loss=MAE(), + random_seed=12345678, + ), + NHITS(h=horizon, + input_size=horizon, + max_steps=1000, + val_check_steps=100, + early_stop_patience_steps=5, + scaler_type='robust', + valid_loss=MAE(), + random_seed=12345678, + ), + AutoTSMixer(h=horizon, + n_series=7, + loss=MAE(), + config=tsmixer_config, + num_samples=10, + search_alg=HyperOptSearch(), + backend='ray', + valid_loss=MAE()) , + AutoTSMixerx(h=horizon, + n_series=7, + loss=MAE(), + config=tsmixerx_config, + num_samples=10, + search_alg=HyperOptSearch(), + backend='ray', + valid_loss=MAE()) ] + +nf = NeuralForecast( + models=models, + freq='15min') + +Y_hat_df = nf.cross_validation(df=Y_df, val_size=val_size, + test_size=test_size, n_windows=None) +nf.models[0].results.get_best_result().config +y_true = Y_hat_df.y.values +y_hat = Y_hat_df['AutoNHITS'].values + +n_series = len(Y_df.unique_id.unique()) + +y_true = y_true.reshape(n_series, -1, horizon) +y_hat = y_hat.reshape(n_series, -1, horizon) + +print('Parsed results') +print('2. y_true.shape (n_series, n_windows, n_time_out):\t', y_true.shape) +print('2. y_hat.shape (n_series, n_windows, n_time_out):\t', y_hat.shape) + + +fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(10, 11)) +fig.tight_layout() + +series = ['HUFL','HULL','LUFL','LULL','MUFL','MULL','OT'] +series_idx = 3 + +for idx, w_idx in enumerate([200, 300, 400]): + axs[idx].plot(y_true[series_idx, w_idx,:],label='True') + axs[idx].plot(y_hat[series_idx, w_idx,:],label='Forecast') + axs[idx].grid() + axs[idx].set_ylabel(series[series_idx]+f' window {w_idx}', + fontsize=17) + if idx==2: + axs[idx].set_xlabel('Forecast Horizon', fontsize=17) +plt.legend() +plt.show() +plt.close() + + +from neuralforecast.losses.numpy import mae, mse + +print('MAE: ', mae(y_hat, y_true)) +print('MSE: ', mse(y_hat, y_true)) diff --git a/原油预测定时任务,请勿关闭.py b/原油预测定时任务,请勿关闭.py new file mode 100644 index 0000000..ab89aa6 --- /dev/null +++ b/原油预测定时任务,请勿关闭.py @@ -0,0 +1,14 @@ +# 定时执行cmd命令 +import os +import time +from main import predict_main + +while True: + try: + print(time.strftime('%H:%M')) + # 判断是不是工作日且 是17:00 7:00 才执行 + if time.strftime('%A') not in ['Saturday', 'Sunday'] and time.strftime('%H:%M') in [ '18:00']: + predict_main() + time.sleep(60) + except: + pass \ No newline at end of file
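Note on 原油预测定时任务,请勿关闭.py: the polling loop above sleeps 60 seconds inside the try block, so a predict_main run that lasts longer than a minute can make the 18:00 check miss its window, and the bare except: pass silently swallows any failure. A stdlib-only sketch of a more defensive version, assuming predict_main is importable from main exactly as in the original:

import datetime
import logging
import time

from main import predict_main

logging.basicConfig(level=logging.INFO)
last_run_date = None

while True:
    now = datetime.datetime.now()
    is_workday = now.weekday() < 5                  # Monday..Friday
    at_run_time = now.strftime('%H:%M') == '18:00'
    if is_workday and at_run_time and last_run_date != now.date():
        try:
            predict_main()
            last_run_date = now.date()              # run at most once per day
        except Exception:
            logging.exception('预测任务执行失败')      # keep the traceback instead of swallowing it
    time.sleep(30)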