from statsmodels.tools.eval_measures import mse, rmse from pandas import Series, DataFrame import cufflinks as cf from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error import pickle import warnings from sklearn.model_selection import GridSearchCV from sklearn.metrics import mean_absolute_error from xgboost import plot_importance, plot_tree import xgboost as xgb import plotly.graph_objects as go import plotly.express as px import statsmodels.api as sm from xgboost import XGBRegressor from sklearn.linear_model import Lasso import sklearn.datasets as datasets from sklearn import preprocessing from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot from plotly import __version__ import time import random import seaborn as sn import matplotlib.pyplot as plt import numpy as np import pandas as pd import requests import json from datetime import datetime, timedelta # 变量定义 login_url = "http://10.200.32.39/jingbo-api/api/server/login" search_url = "http://10.200.32.39/jingbo-api/api/warehouse/dwDataItem/queryByItemNos" queryDataListItemNos_url = "http://10.200.32.39/jingbo-api/api/warehouse/dwDataItem/queryDataListItemNos" login_push_url = "http://10.200.32.39/jingbo-api/api/server/login" upload_url = "http://10.200.32.39/jingbo-api/api/dw/dataValue/pushDataValueList" login_data = { "data": { "account": "api_dev", "password": "ZTEwYWRjMzk0OWJhNTlhYmJlNTZlMDU3ZjIwZjg4M2U=", "tenantHashCode": "8a4577dbd919675758d57999a1e891fe", "terminal": "API" }, "funcModule": "API", "funcOperation": "获取token" } login_push_data = { "data": { "account": "api_dev", "password": "ZTEwYWRjMzk0OWJhNTlhYmJlNTZlMDU3ZjIwZjg4M2U=", "tenantHashCode": "8a4577dbd919675758d57999a1e891fe", "terminal": "API" }, "funcModule": "API", "funcOperation": "获取token" } read_file_path_name = "液化气数据.xlsx" one_cols = [] two_cols = [] # 导入机器学习算法模型 try: from keras.preprocessing.sequence import TimeseriesGenerator except: from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator # 切割训练数据和样本数据 # 用于模型评分 le = preprocessing.LabelEncoder() # print(__version__) # requires version >= 1.9.0 cf.go_offline() random.seed(100) # 数据获取 def get_head_auth(): login_res = requests.post(url=login_url, json=login_data, timeout=(3, 5)) text = json.loads(login_res.text) if text["status"]: token = text["data"]["accessToken"] print('获取的token:', token) return token else: print("获取认证失败") return None def get_data_value(token, dataItemNoList, date): search_data = { "data": { "date": date, "dataItemNoList": dataItemNoList }, "funcModule": "数据项", "funcOperation": "查询" } headers = {"Authorization": token} search_res = requests.post( url=search_url, headers=headers, json=search_data, timeout=(3, 5)) print('数据项查询参数search_data:') print(search_data) print('数据项查询结果search_res:') print(search_res.text) try: search_value = json.loads(search_res.text)["data"] print("数据项查询结果:", search_value) except json.JSONDecodeError as e: print(f"Error decoding JSON: {e}") print("Response content:", search_res.text) return None if search_value: return search_value else: print("今天没有新数据") return search_value def get_head_push_auth(): login_res = requests.post( url=login_push_url, json=login_push_data, timeout=(3, 5)) text = json.loads(login_res.text) if text["status"]: token = text["data"]["accessToken"] return token else: print("获取认证失败") return None def upload_data_to_system(token_push, date): data = { "funcModule": "数据表信息列表", "funcOperation": "新增", "data": [ {"dataItemNo": "250855713|Forecast_Price|ACN", "dataDate": getNow(date=date)[0], "dataStatus": "add", "dataValue": forecast_price() } ] } headers = {"Authorization": token_push} res = requests.post(url=upload_url, headers=headers, json=data, timeout=(3, 5)) print(res.text) print('预测值:', data['data'][0]['dataValue']) price_list = [] def forecast_price(): # df_test = pd.read_csv('定价模型数据收集0212.csv') df_test = pd.read_excel('液化气数据.xlsx') df_test.drop([0], inplace=True) try: df_test['Date'] = pd.to_datetime( df_test['Date'], format='%m/%d/%Y', infer_datetime_format=True) except: df_test['Date'] = pd.to_datetime( df_test['Date'], format=r'%Y-%m-%d', infer_datetime_format=True) df_test_1 = df_test df_test_1 = df_test_1.fillna(df_test.ffill()) df_test_1 = df_test_1.fillna(df_test_1.bfill()) # 选择用于模型训练的列名称 col_for_training = df_test_1.columns import joblib Best_model_DalyLGPrice = joblib.load("日度价格预测_液化气最佳模型.pkl") # 最新的一天为最后一行的数据 df_test_1_Day = df_test_1.tail(1) # 移除不需要的列 df_test_1_Day.index = df_test_1_Day["Date"] df_test_1_Day = df_test_1_Day.drop(["Date"], axis=1) df_test_1_Day = df_test_1_Day.drop('Price', axis=1) df_test_1_Day = df_test_1_Day.dropna() for col in df_test_1_Day.columns: df_test_1_Day[col] = pd.to_numeric(df_test_1_Day[col], errors='coerce') # 预测今日价格,显示至小数点后两位 Ypredict_Today = Best_model_DalyLGPrice.predict(df_test_1_Day) df_test_1_Day['日度预测价格'] = Ypredict_Today print(df_test_1_Day['日度预测价格']) a = df_test_1_Day['日度预测价格'] a = a[0] a = float(a) a = round(a, 2) price_list.append(a) return a def optimize_Model(): from sklearn.model_selection import train_test_split from sklearn.impute import SimpleImputer from sklearn.preprocessing import OrdinalEncoder from sklearn.feature_selection import SelectFromModel from sklearn.metrics import mean_squared_error, r2_score import pandas as pd pd.set_option('display.max_rows', 40) pd.set_option('display.max_columns', 40) df_test = pd.read_excel('液化气数据.xlsx') df_test.drop([0], inplace=True) try: df_test['Date'] = pd.to_datetime( df_test['Date'], format='%m/%d/%Y', infer_datetime_format=True) except: df_test['Date'] = pd.to_datetime( df_test['Date'], format=r'%Y-%m-%d', infer_datetime_format=True) # 将缺失值补为前一个或者后一个数值 df_test_1 = df_test df_test_1 = df_test_1.fillna(df_test.ffill()) df_test_1 = df_test_1.fillna(df_test_1.bfill()) df_test_1["Date"] = pd.to_datetime(df_test_1["Date"]) df_test_1.index = df_test_1["Date"] df_test_1 = df_test_1.drop(["Date"], axis=1) df_test_1 = df_test_1.astype('float') import numpy as np import pandas as pd from pandas import Series, DataFrame import matplotlib.pyplot as plt import sklearn.datasets as datasets # 导入机器学习算法模型 from sklearn.linear_model import Lasso from xgboost import XGBRegressor import statsmodels.api as sm try: from keras.preprocessing.sequence import TimeseriesGenerator except: from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator import plotly.express as px import plotly.graph_objects as go import xgboost as xgb from xgboost import plot_importance, plot_tree from sklearn.metrics import mean_absolute_error from statsmodels.tools.eval_measures import mse, rmse from sklearn.model_selection import GridSearchCV from xgboost import XGBRegressor import warnings import pickle from sklearn.metrics import mean_squared_error # 切割训练数据和样本数据 from sklearn.model_selection import train_test_split # 用于模型评分 from sklearn.metrics import r2_score dataset1 = df_test_1.drop('Price', axis=1) # .astype(float) y = df_test_1['Price'] x = dataset1 train = x target = y # 切割数据样本集合测试集 X_train, x_test, y_train, y_true = train_test_split( train, target, test_size=0.2, random_state=0) # 模型缩写 Lasso = Lasso(random_state=0) XGBR = XGBRegressor(random_state=0) # 训练模型 Lasso.fit(X_train, y_train) XGBR.fit(X_train, y_train) # 模型拟合 y_pre_Lasso = Lasso.predict(x_test) y_pre_XGBR = XGBR.predict(x_test) # 计算Lasso、XGBR、RandomForestR、AdaBoostR、GradientBoostingR、BaggingRegressor各模型的R² Lasso_score = r2_score(y_true, y_pre_Lasso) XGBR_score = r2_score(y_true, y_pre_XGBR) # 计算Lasso、XGBR的MSE和RMSE Lasso_MSE = mean_squared_error(y_true, y_pre_Lasso) XGBR_MSE = mean_squared_error(y_true, y_pre_XGBR) Lasso_RMSE = np.sqrt(Lasso_MSE) XGBR_RMSE = np.sqrt(XGBR_MSE) # 将不同模型的不同误差值整合成一个表格 model_results = pd.DataFrame([['Lasso', Lasso_RMSE, Lasso_score], ['XgBoost', XGBR_RMSE, XGBR_score]], columns=['模型(Model)', '均方根误差(RMSE)', 'R^2 score']) # 将模型名称(Model)列设置为索引 model_results1 = model_results.set_index('模型(Model)') model_results1 # 定义plot_feature_importance函数,该函数用于计算特征重要性。此部分代码无需调整 def plot_feature_importance(importance, names, model_type): feature_importance = np.array(importance) feature_names = np.array(names) data = {'feature_names': feature_names, 'feature_importance': feature_importance} fi_df = pd.DataFrame(data) fi_df.sort_values(by=['feature_importance'], ascending=False, inplace=True) plt.figure(figsize=(10, 8)) sn.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names']) plt.title(model_type + " "+'FEATURE IMPORTANCE') plt.xlabel('FEATURE IMPORTANCE') plt.ylabel('FEATURE NAMES') from pylab import mpl mpl.rcParams['font.sans-serif'] = ['SimHei'] # Xgboost 模型参数优化-初步 # 参考: https://juejin.im/post/6844903661013827598 # 每次调参时,备选参数数值以同数量级的1、3、10设置即可(比如设置1、3、10,或0.1、0.3、1.0,或0.01,0.03,0.10即可) from xgboost import XGBRegressor from sklearn.model_selection import GridSearchCV estimator = XGBRegressor(random_state=0, nthread=4, seed=0 ) parameters = { 'max_depth': range(2, 11, 2), # 树的最大深度 'n_estimators': range(50, 101, 10), # 迭代次数 'learning_rate': [0.01, 0.03, 0.1, 0.3, 0.5, 1] } grid_search_XGB = GridSearchCV( estimator=estimator, param_grid=parameters, # n_jobs = 10, cv=3, verbose=True ) grid_search_XGB.fit(X_train, y_train) # 如果电脑在此步骤报错,可能是因为计算量太大,超过硬件可支持程度,可注释掉“n_jobs=10”一行 best_parameters = grid_search_XGB.best_estimator_.get_params() y_pred = grid_search_XGB.predict(x_test) op_XGBR_score = r2_score(y_true, y_pred) op_XGBR_MSE = mean_squared_error(y_true, y_pred) op_XGBR_RMSE = np.sqrt(op_XGBR_MSE) model_results2 = pd.DataFrame([['Optimized_Xgboost', op_XGBR_RMSE, op_XGBR_score]], columns=['模型(Model)', '均方根误差(RMSE)', 'R^2 score']) model_results2 = model_results2.set_index('模型(Model)') try: results = model_results1.append(model_results2, ignore_index=False) except: results = pd.concat( [model_results1, model_results2], ignore_index=True) import pickle Pkl_Filename = "日度价格预测_液化气最佳模型.pkl" with open(Pkl_Filename, 'wb') as file: pickle.dump(grid_search_XGB, file) def read_xls_data(): """获取特征项ID""" global one_cols, two_cols # 使用pandas读取Excel文件 df = pd.read_excel(read_file_path_name, header=None) # 不自动识别列名 # 获取第二行数据(索引为1) one_cols = df.iloc[1].tolist()[1:] print(f'获取到的数据项ID{one_cols}') def start(date=''): """获取当日数据""" read_xls_data() token = get_head_auth() if not token: return cur_time, cur_time2 = getNow(date) print(f"获取{cur_time}数据") datas = get_data_value(token, one_cols, date=cur_time) if not datas: return append_rows = [cur_time2] dataItemNo_dataValue = {} for data_value in datas: if "dataValue" not in data_value: print(data_value) dataItemNo_dataValue[data_value["dataItemNo"]] = "" else: dataItemNo_dataValue[data_value["dataItemNo"] ] = data_value["dataValue"] for value in one_cols: if value in dataItemNo_dataValue: append_rows.append(dataItemNo_dataValue[value]) else: append_rows.append("") print('添加的行:', append_rows) save_xls_2(append_rows) def getNow(date='', offset=0): """生成指定日期的两种格式字符串 Args: date: 支持多种输入类型: - datetime对象 - 字符串格式(支持'%Y-%m-%d'和'%Y%m%d') - 空字符串表示当前日期 offset: 日期偏移天数 Returns: tuple: (紧凑日期字符串, 标准日期字符串) """ # 日期解析逻辑 from datetime import datetime, timedelta if isinstance(date, datetime): now = date else: now = datetime.now() if date: # 尝试多种日期格式解析 for fmt in ('%Y-%m-%d', '%Y%m%d', '%Y/%m/%d'): try: now = datetime.strptime(str(date), fmt) break except ValueError: continue else: raise ValueError(f"无法解析的日期格式: {date}") # 应用日期偏移 now = now - timedelta(days=offset) # 统一格式化输出 date_str = now.strftime("%Y-%m-%d") compact_date = date_str.replace("-", "") return compact_date, date_str def start_1(date=''): """补充昨日数据""" read_xls_data() token = get_head_auth() if not token: return cur_time, cur_time2 = getNow(date, offset=1) print(f"补充{cur_time}数据") datas = get_data_value(token, one_cols, date=cur_time) if not datas: print(f"{cur_time}没有数据") return append_rows = [cur_time2] dataItemNo_dataValue = {} for data_value in datas: if "dataValue" not in data_value: print(data_value) dataItemNo_dataValue[data_value["dataItemNo"]] = "" else: dataItemNo_dataValue[data_value["dataItemNo"] ] = data_value["dataValue"] for value in one_cols: if value in dataItemNo_dataValue: append_rows.append(dataItemNo_dataValue[value]) else: append_rows.append("") print('添加的行:', append_rows) save_xls_2(append_rows) def save_xls_2(append_rows): """保存或更新数据到Excel文件 参数: append_rows (list): 需要追加/更新的数据行,格式为[日期, 数据项1, 数据项2,...] """ try: # 读取现有数据(假设第一行为列名) df = pd.read_excel('液化气数据.xlsx', sheet_name=0) # 转换append_rows为DataFrame append_rows = pd.DataFrame([append_rows], columns=df.columns) # 创建新数据行 new_date = append_rows['Date'].values[0] dates = df['Date'].to_list() # 判断日期是否存在 if new_date in dates: # 找到日期所在行的索引 date_mask = df['Date'] == new_date # 存在则更新数据 df.loc[date_mask] = append_rows.values print(f"更新 {new_date} 数据") else: # 不存在则追加数据 df = pd.concat([df, append_rows], ignore_index=True) print(df.head()) print(df.tail()) print(f"插入 {new_date} 新数据") # 保存更新后的数据 df.to_excel('液化气数据.xlsx', index=False, engine='openpyxl') except FileNotFoundError: # 如果文件不存在则创建新文件 pd.DataFrame([append_rows]).to_excel( '液化气数据.xlsx', index=False, engine='openpyxl') except Exception as e: print(f"保存数据时发生错误: {str(e)}") def check_data(dataItemNo): token = get_head_auth() if not token: return datas = get_data_value(token, dataItemNo) if not datas: return def get_queryDataListItemNos_value(token, url, dataItemNoList, dateStart, dateEnd): search_data = { "funcModule": "数据项", "funcOperation": "查询", "data": { "dateStart": dateStart, "dateEnd": dateEnd, "dataItemNoList": dataItemNoList # 数据项编码,代表 brent最低价和最高价 } } headers = {"Authorization": token} search_res = requests.post( url=url, headers=headers, json=search_data, timeout=(3, 5)) search_value = json.loads(search_res.text)["data"] if search_value: return search_value else: return None def save_queryDataListItemNos_xls(data_df, dataItemNoList): from datetime import datetime, timedelta current_year_month = datetime.now().strftime('%Y-%m') grouped = data_df.groupby("dataDate") # 使用openpyxl打开xlsx文件 from openpyxl import load_workbook workbook = load_workbook('液化气数据.xlsx') # 创建新工作簿 new_workbook = load_workbook('液化气数据.xlsx') for sheetname in workbook.sheetnames: sheet = workbook[sheetname] new_sheet = new_workbook[sheetname] current_year_month_row = 0 # 查找当前月份数据起始行 for row_idx, row in enumerate(sheet.iter_rows(values_only=True), 1): if str(row[0]).startswith(current_year_month): current_year_month_row += 1 # 追加新数据 if sheetname == workbook.sheetnames[0]: start_row = sheet.max_row - current_year_month_row + 1 for row_idx, (date, group) in enumerate(grouped, start=start_row): new_sheet.cell(row=row_idx, column=1, value=date) for j, dataItemNo in enumerate(dataItemNoList, start=2): if group[group["dataItemNo"] == dataItemNo]["dataValue"].values: new_sheet.cell(row=row_idx, column=j, value=group[group["dataItemNo"] == dataItemNo]["dataValue"].values[0]) # 保存修改后的xlsx文件 new_workbook.save("液化气数据.xlsx") def queryDataListItemNos(date=None, token=None): df = pd.read_excel('液化气数据.xlsx') dataItemNoList = df.iloc[0].tolist()[1:] if token is None: token = get_head_auth() if not token: print('token获取失败') return # 获取当前日期 if date is None: current_date = datetime.now() else: current_date = date # 获取当月1日 first_day_of_month = current_date.replace(day=1) # 格式化为 YYYYMMDD 格式 dateEnd = current_date.strftime('%Y%m%d') dateStart = first_day_of_month.strftime('%Y%m%d') # dateStart = '20250604' search_value = get_queryDataListItemNos_value( token, queryDataListItemNos_url, dataItemNoList, dateStart, dateEnd) data_df = pd.DataFrame(search_value) data_df["dataDate"] = pd.to_datetime(data_df["dataDate"]) data_df["dataDate"] = data_df["dataDate"].dt.strftime('%Y-%m-%d') save_queryDataListItemNos_xls(data_df, dataItemNoList) print('当月数据更新完成') def main(start_date=None, token=None, token_push=None): from datetime import datetime, timedelta if start_date is None: start_date = datetime.now() if token is None: token = get_head_auth() if token_push is None: token_push = get_head_push_auth() date = start_date.strftime('%Y%m%d') print(date) try: # 更新当月数据 queryDataListItemNos(start_date, token) except: print('当月数据更新失败,单日更新') start(date) # 更新当日数据,批量日期更新时打开 # start(date) # 训练模型 optimize_Model() # 预测&上传预测结果 upload_data_to_system(token_push, start_date) # forecast_price() if __name__ == "__main__": print("运行中ing...") # 遍历2024-11-25 到 2024-12-3 之间的工作日日期 # for i_time in pd.date_range('2025-7-1', '2025-7-8', freq='D'): # # try: # print(i_time) # main(start_date=i_time) # except Exception as e: # continue main()