数据处理添加删除起始日期没有数据的列;
This commit is contained in:
		
							parent
							
								
									488a591819
								
							
						
					
					
						commit
						52e8638a10
					
				
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -58,6 +58,7 @@ cover/ | ||||
| *.mo | ||||
| *.pot | ||||
| *.pdf | ||||
| *.png | ||||
| # Django stuff: | ||||
| *.log | ||||
| local_settings.py | ||||
|  | ||||
| @ -790,9 +790,31 @@ def datachuli(df_zhibiaoshuju, df_zhibiaoliebiao, datecol='date', end_time='', y | ||||
|     # 按时间顺序排列 | ||||
|     df.sort_values(by='ds', inplace=True) | ||||
|     df['ds'] = pd.to_datetime(df['ds']) | ||||
|      | ||||
|     # 获取 start_year 对应年份的第一天日期 | ||||
|     start_date = datetime.datetime(config.start_year, 1, 1).strftime('%Y-%m-%d') | ||||
| 
 | ||||
|     # 删除全为空值的列 | ||||
|     df = df.dropna(axis=1, how='all') | ||||
| 
 | ||||
|     for col in df.columns: | ||||
|         if col == 'ds':  # 跳过 'ds' 列 | ||||
|             continue | ||||
|         # 找到第一个非空值的索引 | ||||
|         first_valid_index = df[col].first_valid_index() | ||||
|         if first_valid_index is not None: | ||||
|             # 判断对应的 'ds' 是否大于 start_date | ||||
|             if df.loc[first_valid_index, 'ds'] > start_date: | ||||
|                 df.drop(columns=[col], inplace=True) | ||||
| 
 | ||||
| 
 | ||||
|     # 获取start_year年到end_time的数据 | ||||
|     df = df[df['ds'].dt.year >= config.start_year] | ||||
|     df = df[df['ds'] <= end_time] | ||||
| 
 | ||||
|     # # 删除第一行没有数据的列 | ||||
|     # df = df.loc[:, ~df.iloc[0].isna()] | ||||
| 
 | ||||
|     # last_update_times_df,y_last_update_time = create_feature_last_update_time(df) | ||||
|     # config.logger.info(f'删除预警的特征前数据量:{df.shape}') | ||||
|     # columns_to_drop = last_update_times_df[last_update_times_df['warning_date'] < y_last_update_time ]['feature'].values.tolist() | ||||
|  | ||||
| @ -252,6 +252,7 @@ def predict_main(): | ||||
|     reportname = f'Brent原油大模型预测--{end_time}.pdf' # 报告文件名 | ||||
|     reportname = reportname.replace(':', '-') # 替换冒号 | ||||
|     brent_export_pdf(dataset=dataset,num_models = 5 if is_fivemodels else 22,time=end_time, | ||||
|                         inputsize = global_config['horizon'], | ||||
|                 reportname=reportname,sqlitedb=sqlitedb), | ||||
| 
 | ||||
|     logger.info('制作报告end') | ||||
|  | ||||
| @ -415,8 +415,12 @@ def predict_main(): | ||||
|     title = f'{settings}--{end_time}-预测报告'  # 报告标题 | ||||
|     reportname = f'Brent原油大模型日度预测--{end_time}.pdf'  # 报告文件名 | ||||
|     reportname = reportname.replace(':', '-')  # 替换冒号 | ||||
|     brent_export_pdf(dataset=dataset, num_models=5 if is_fivemodels else 22, time=end_time, | ||||
|                      reportname=reportname, sqlitedb=sqlitedb), | ||||
|     brent_export_pdf(dataset=dataset,  | ||||
|                         num_models=5 if is_fivemodels else 22, time=end_time, | ||||
|                         reportname=reportname,  | ||||
|                         inputsize = global_config['horizon'], | ||||
|                         sqlitedb=sqlitedb | ||||
|                      ), | ||||
| 
 | ||||
|     logger.info('制作报告end') | ||||
|     logger.info('模型训练完成') | ||||
|  | ||||
| @ -429,7 +429,10 @@ def predict_main(): | ||||
|     reportname = f'Brent原油大模型月度预测--{end_time}.pdf'  # 报告文件名 | ||||
|     reportname = reportname.replace(':', '-')  # 替换冒号 | ||||
|     brent_export_pdf(dataset=dataset, num_models=5 if is_fivemodels else 22, time=end_time, | ||||
|                      reportname=reportname, sqlitedb=sqlitedb), | ||||
|                         reportname=reportname, | ||||
|                         inputsize = global_config['horizon'], | ||||
|                         sqlitedb=sqlitedb | ||||
|                       ), | ||||
| 
 | ||||
|     logger.info('制作报告end') | ||||
|     logger.info('模型训练完成') | ||||
|  | ||||
| @ -410,8 +410,13 @@ def predict_main(): | ||||
|     title = f'{settings}--{end_time}-预测报告'  # 报告标题 | ||||
|     reportname = f'Brent原油大模型周度预测--{end_time}.pdf'  # 报告文件名 | ||||
|     reportname = reportname.replace(':', '-')  # 替换冒号 | ||||
|     brent_export_pdf(dataset=dataset, num_models=5 if is_fivemodels else 22, time=end_time, | ||||
|                      reportname=reportname, sqlitedb=sqlitedb), | ||||
|     brent_export_pdf(dataset=dataset, | ||||
|                         num_models=5 if is_fivemodels else 22, | ||||
|                         time=end_time, | ||||
|                         reportname=reportname, | ||||
|                         inputsize = global_config['horizon'], | ||||
|                         sqlitedb=sqlitedb | ||||
|                       ), | ||||
| 
 | ||||
|     logger.info('制作报告end') | ||||
|     logger.info('模型训练完成') | ||||
|  | ||||
							
								
								
									
										180
									
								
								models/implementARIMA.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										180
									
								
								models/implementARIMA.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,180 @@ | ||||
| from __future__ import annotations | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| import matplotlib.pyplot as plt | ||||
| import statsmodels.api as sm | ||||
| from statsmodels.tsa.stattools import adfuller as ADF | ||||
| from statsmodels.stats.diagnostic import acorr_ljungbox | ||||
| from statsmodels.graphics.tsaplots import plot_acf, plot_pacf #ACF与PACF | ||||
| from statsmodels.tsa.arima.model import ARIMA #ARIMA模型 | ||||
| from statsmodels.graphics.api import qqplot  #qq图 | ||||
| from scipy import stats | ||||
| 
 | ||||
| import warnings | ||||
| warnings.filterwarnings("ignore") | ||||
| 
 | ||||
| 
 | ||||
| # 绘图设置(适用于mac) | ||||
| plt.rcParams['font.sans-serif'] = ['SimHei'] | ||||
| # plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  # 用来正常显示中文标签 | ||||
| plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号 | ||||
| 
 | ||||
| # 模型检测 | ||||
| def Model_checking(model) -> None: | ||||
|     # 残差检验:检验残差是否服从正态分布,画图查看,然后检验 | ||||
| 
 | ||||
|     print('------------残差检验-----------') | ||||
|     # model.resid:残差 = 实际观测值 – 模型预测值 | ||||
|     print(stats.normaltest(model.resid)) | ||||
| 
 | ||||
|     # QQ图看正态性 | ||||
|     qqplot(model.resid, line="q", fit=True) | ||||
|     plt.title("Q-Q图") | ||||
|     plt.show() | ||||
|     # 绘制直方图 | ||||
|     plt.hist(model.resid, bins=50) | ||||
|     plt.show() | ||||
| 
 | ||||
|     # 进行Jarque-Bera检验:判断数据是否符合总体正态分布 | ||||
|     jb_test = sm.stats.stattools.jarque_bera(model.resid) | ||||
|     print("==================================================") | ||||
|     print('------------Jarque-Bera检验-----------') | ||||
|     print('Jarque-Bera test:') | ||||
|     print('JB:', jb_test[0]) | ||||
|     print('p-value:', jb_test[1]) | ||||
|     print('Skew:', jb_test[2]) | ||||
|     print('Kurtosis:', jb_test[3]) | ||||
| 
 | ||||
|     # 残差序列自相关:残差序列是否独立 | ||||
|     print("==================================================") | ||||
|     print('------DW检验:残差序列自相关----') | ||||
|     print(sm.stats.stattools.durbin_watson(model.resid.values)) | ||||
| 
 | ||||
| # 使用BIC矩阵计算p和q的值 | ||||
| def cal_pqValue(D_data, diff_num=0) -> List[float]: | ||||
|     # 定阶 | ||||
|     pmax = int(len(D_data) / 10)  # 一般阶数不超过length/10 | ||||
|     qmax = int(len(D_data) / 10)  # 一般阶数不超过length/10 | ||||
|     bic_matrix = []  # BIC矩阵 | ||||
|     # 差分阶数 | ||||
|     diff_num = 2 | ||||
| 
 | ||||
|     for p in range(pmax + 1): | ||||
|         tmp = [] | ||||
|         for q in range(qmax + 1): | ||||
|             try: | ||||
|                 tmp.append(ARIMA(D_data, order=(p, diff_num, q)).fit().bic) | ||||
|             except Exception as e: | ||||
|                 print(e) | ||||
|                 tmp.append(None) | ||||
|         bic_matrix.append(tmp) | ||||
| 
 | ||||
|     bic_matrix = pd.DataFrame(bic_matrix)  # 从中可以找出最小值 | ||||
|     p, q = bic_matrix.stack().idxmin()  # 先用stack展平,然后用idxmin找出最小值位置。 | ||||
|     print(u'BIC最小的p值和q值为:%s、%s' % (p, q)) | ||||
|     return p, q | ||||
| 
 | ||||
| # 计算时序序列模型 | ||||
| def cal_time_series(data, forecast_num=3) -> None: | ||||
|     # 绘制时序图 | ||||
|     data.plot() | ||||
|     # 存储图片 | ||||
|     plt.savefig('1.png') | ||||
|     plt.show() | ||||
| 
 | ||||
|     # 绘制自相关图 | ||||
|     plot_acf(data).show() | ||||
|     plot_acf(data).savefig('自相关图.png') | ||||
|     # 绘制偏自相关图 | ||||
|     plot_pacf(data).show() | ||||
|     plot_pacf(data).savefig('偏自相关图.png') | ||||
| 
 | ||||
|     # 时序数据平稳性检测 | ||||
|     original_ADF = ADF(data[u'deal_data']) | ||||
|     print(u'原始序列的ADF检验结果为:', original_ADF) | ||||
| 
 | ||||
|     # 对数序数据进行d阶差分运算,化为平稳时间序列 | ||||
|     diff_num = 0 # 差分阶数 | ||||
|     diff_data = data     # 差分数序数据 | ||||
|     ADF_p_value = ADF(data[u'deal_data'])[1] | ||||
|     while  ADF_p_value > 0.01: | ||||
|         diff_data = diff_data.diff(periods=1).dropna() | ||||
|         diff_num = diff_num + 1 | ||||
|         ADF_result = ADF(diff_data[u'deal_data']) | ||||
|         ADF_p_value = ADF_result[1] | ||||
|         print("ADF_p_value:{ADF_p_value}".format(ADF_p_value=ADF_p_value)) | ||||
|         print(u'{diff_num}差分的ADF检验结果为:'.format(diff_num = diff_num), ADF_result ) | ||||
| 
 | ||||
|     # 白噪声检测 | ||||
|     print(u'差分序列的白噪声检验结果为:', acorr_ljungbox(diff_data, lags=1))  # 返回统计量和p值 | ||||
| 
 | ||||
|     # 使用AIC和BIC准则定阶q和p的值(推荐) | ||||
|     AIC = sm.tsa.stattools.arma_order_select_ic(diff_data, max_ar=4, max_ma=4, ic='aic')['aic_min_order'] | ||||
|     BIC = sm.tsa.stattools.arma_order_select_ic(diff_data, max_ar=4, max_ma=4, ic='bic')['bic_min_order'] | ||||
|     print('---AIC与BIC准则定阶---') | ||||
|     print('the AIC is{}\nthe BIC is{}\n'.format(AIC, BIC), end='') | ||||
|     p = BIC[0] | ||||
|     q = BIC[1] | ||||
| 
 | ||||
|     # 使用BIC矩阵来计算q和p的值 | ||||
|     # pq_result = cal_pqValue(diff_data, diff_num) | ||||
|     # p = pq_result[0] | ||||
|     # q = pq_result[1] | ||||
| 
 | ||||
|     # 构建时间序列模型 | ||||
|     model = ARIMA(data, order=(p, diff_num, q)).fit()  # 建立ARIMA(p, diff+num, q)模型 | ||||
|     print('模型报告为:\n', model.summary()) | ||||
|     print("预测结果:\n", model.forecast(forecast_num)) | ||||
| 
 | ||||
|     print("预测结果(详细版):\n") | ||||
|     forecast = model.get_forecast(steps=forecast_num) | ||||
|     table = pd.DataFrame(forecast.summary_frame()) | ||||
|     print(table) | ||||
| 
 | ||||
|     # 绘制残差图 | ||||
|     diff_data.plot(color='orange', title='残差图') | ||||
|     model.resid.plot(figsize=(10, 3)) | ||||
|     plt.title("残差图") | ||||
|     plt.savefig('2.png') | ||||
|     plt.show() | ||||
| 
 | ||||
|     # 模型检查 | ||||
|     Model_checking(model) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     # 数据测试1: | ||||
|     # need_data = {'2016-02': 44964.03, '2016-03': 56825.51, '2016-04': 49161.98, '2016-05': 45859.35, | ||||
|     #              '2016-06': 45098.56, | ||||
|     #              '2016-07': 45522.17, '2016-08': 57133.18, '2016-09': 49037.29, '2016-10': 43157.36, | ||||
|     #              '2016-11': 48333.17, | ||||
|     #              '2016-12': 22900.94, | ||||
|     #              '2017-01': 67057.29, '2017-02': 49985.29, '2017-03': 49771.47, '2017-04': 35757.0, '2017-05': 42914.27, | ||||
|     #              '2017-06': 44507.03, '2017-07': 40596.51, '2017-08': 52111.75, '2017-09': 49711.18, | ||||
|     #              '2017-10': 45766.09, | ||||
|     #              '2017-11': 45273.82, '2017-12': 22469.57, | ||||
|     #              '2018-01': 71032.23, '2018-02': 37874.38, '2018-03': 44312.24, '2018-04': 39742.02, | ||||
|     #              '2018-05': 43118.38, | ||||
|     #              '2018-06': 33859.69, '2018-07': 38910.89, '2018-08': 39138.42, '2018-09': 37175.03, | ||||
|     #              '2018-10': 44159.96, | ||||
|     #              '2018-11': 46321.72, '2018-12': 22410.88, | ||||
|     #              '2019-01': 61241.94, '2019-02': 31698.6, '2019-03': 44170.62, '2019-04': 47627.13, '2019-05': 54407.37, | ||||
|     #              '2019-06': 50231.68, '2019-07': 61010.29, '2019-08': 59782.19, '2019-09': 57245.15, | ||||
|     #              '2019-10': 61162.55, | ||||
|     #              '2019-11': 52398.25, '2019-12': 15482.64, | ||||
|     #              '2020-01': 38383.97, '2020-02': 26943.55, '2020-03': 57200.32, '2020-04': 49449.95, | ||||
|     #              '2020-05': 47009.84, | ||||
|     #              '2020-06': 49946.25, '2020-07': 56383.23, '2020-08': 60651.07} | ||||
|     # data = {'time_data': list(need_data.keys()), 'deal_data': list(need_data.values())} | ||||
|     # df = pd.DataFrame(data) | ||||
|     # df.set_index(['time_data'], inplace=True)  # 设置索引 | ||||
|     # cal_time_series(df, 7) # 模型调用 | ||||
|     # 数据测试2(从excel中读取): | ||||
|     path = r'D:\code\PriceForecast-svn\yuanyoudataset\指标数据.csv' | ||||
|     y = 'Brent活跃合约' | ||||
|     df = pd.read_csv(path) | ||||
|     df.rename(columns={f'{y}': 'deal_data'}, inplace=True) | ||||
|     df = df[['ds', f'{y}']] | ||||
|     print(df.tail()) | ||||
|     df.set_index(['ds'], inplace=True)  # 设置索引 | ||||
|     cal_time_series(df, 7) # 模型调用 | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user