from statsmodels.tools.eval_measures import mse, rmse from pandas import Series, DataFrame import cufflinks as cf from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error import pickle import warnings from sklearn.model_selection import GridSearchCV from sklearn.metrics import mean_absolute_error from xgboost import plot_importance, plot_tree import xgboost as xgb import plotly.graph_objects as go import plotly.express as px import statsmodels.api as sm from xgboost import XGBRegressor from sklearn.linear_model import Lasso import sklearn.datasets as datasets from sklearn import preprocessing from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot from plotly import __version__ import random import seaborn as sn import matplotlib.pyplot as plt import numpy as np import pandas as pd import requests import json import xlrd import xlwt from datetime import datetime, timedelta import time # 变量定义 login_url = "http://10.200.32.39/jingbo-api/api/server/login" search_url = "http://10.200.32.39/jingbo-api/api/warehouse/dwDataItem/queryByItemNos" login_push_url = "http://10.200.32.39/jingbo-api/api/server/login" upload_url = "http://10.200.32.39/jingbo-api/api/dw/dataValue/pushDataValueList" queryDataListItemNos_url = "http://10.200.32.39/jingbo-api//api/warehouse/dwDataItem/queryDataListItemNos" login_data = { "data": { "account": "api_dev", "password": "ZTEwYWRjMzk0OWJhNTlhYmJlNTZlMDU3ZjIwZjg4M2U=", "tenantHashCode": "8a4577dbd919675758d57999a1e891fe", "terminal": "API" }, "funcModule": "API", "funcOperation": "获取token" } login_push_data = { "data": { "account": "api_dev", "password": "ZTEwYWRjMzk0OWJhNTlhYmJlNTZlMDU3ZjIwZjg4M2U=", "tenantHashCode": "8a4577dbd919675758d57999a1e891fe", "terminal": "API" }, "funcModule": "API", "funcOperation": "获取token" } read_file_path_name = "纯苯数据项.xls" one_cols = [] two_cols = [] # 导入机器学习算法模型 try: from keras.preprocessing.sequence import TimeseriesGenerator except: from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator # 切割训练数据和样本数据 # 用于模型评分 le = preprocessing.LabelEncoder() # print(__version__) # requires version >= 1.9.0 cf.go_offline() random.seed(100) # 数据获取 def get_head_auth(): login_res = requests.post(url=login_url, json=login_data, timeout=(3, 5)) text = json.loads(login_res.text) if text["status"]: token = text["data"]["accessToken"] return token else: print("获取认证失败") return None def get_data_value(token, dataItemNoList, date): search_data = { "data": { "date": getNow(date)[0], "dataItemNoList": dataItemNoList }, "funcModule": "数据项", "funcOperation": "查询" } headers = {"Authorization": token} search_res = requests.post( url=search_url, headers=headers, json=search_data, timeout=(3, 5)) search_value = json.loads(search_res.text)["data"] if search_value: return search_value else: print("今天没有新数据") return search_value # xls文件处理 def write_xls(data, date): # 创建一个Workbook对象 workbook = xlwt.Workbook() # 创建一个Sheet对象,可指定名称 sheet = workbook.load('Sheet1') # 写入数据行 for row_index, row_data in enumerate(data): for col_index, cell_data in enumerate(row_data): sheet.write(row_index, col_index, cell_data) # 保存Workbook到文件 workbook.save(get_cur_time(date)[0] + '.xls') def getNow(date='', offset=0): """生成指定日期的两种格式字符串 Args: date: 支持多种输入类型: - datetime对象 - 字符串格式(支持'%Y-%m-%d'和'%Y%m%d') - 空字符串表示当前日期 offset: 日期偏移天数 Returns: tuple: (紧凑日期字符串, 标准日期字符串) """ # 日期解析逻辑 from datetime import datetime, timedelta if isinstance(date, datetime): now = date else: now = datetime.now() if date: # 尝试多种日期格式解析 for fmt in ('%Y-%m-%d', '%Y%m%d', '%Y/%m/%d'): try: now = datetime.strptime(str(date), fmt) break except ValueError: continue else: raise ValueError(f"无法解析的日期格式: {date}") # 应用日期偏移 now = now - timedelta(days=offset) # 统一格式化输出 date_str = now.strftime("%Y-%m-%d") compact_date = date_str.replace("-", "") return compact_date, date_str def get_cur_time(date=''): if date == '': now = datetime.now() # 如果是字符串,尝试解析日期 elif isinstance(date, str): now = datetime.strptime(date, '%Y-%m-%d') else: now = date year = now.year month = now.month day = now.day if month < 10: month = "0" + str(month) if day < 10: day = "0" + str(day) cur_time = str(year) + str(month) + str(day) cur_time2 = str(year) + "-" + str(month) + "-" + str(day) return cur_time, cur_time2 def get_head_push_auth(): login_res = requests.post( url=login_push_url, json=login_push_data, timeout=(3, 5)) text = json.loads(login_res.text) if text["status"]: token = text["data"]["accessToken"] return token else: print("获取认证失败") return None def upload_data_to_system(token_push, date): datavalue = forecast_price() data = { "funcModule": "数据表信息列表", "funcOperation": "新增", "data": [ {"dataItemNo": "C01100047|FORECAST_PRICE", "dataDate": getNow(date)[0], "dataStatus": "add", "dataValue": datavalue } ] } print(data) headers = {"Authorization": token_push} res = requests.post(url=upload_url, headers=headers, json=data, timeout=(3, 5)) print(res.text) def forecast_price(): # df_test = pd.read_csv('定价模型数据收集0212.csv') df_test = pd.read_excel('纯苯数据项.xls', sheet_name='Sheet1') df_test.drop([0], inplace=True) # df_test['Date']=pd.to_datetime(df_test['Date'], format='%m/%d/%Y',infer_datetime_format=True) df_test['Date'] = pd.to_datetime( df_test['Date'], format=r'%Y-%m-%d', infer_datetime_format=True) df_test_1 = df_test df_test_1 = df_test_1.fillna(df_test.ffill()) df_test_1 = df_test_1.fillna(df_test_1.bfill()) # 选择用于模型训练的列名称 col_for_training = df_test_1.columns import joblib Best_model_DalyLGPrice = joblib.load("日度价格预测_最佳模型.pkl") # 最新的一天为最后一行的数据 df_test_1_Day = df_test_1.tail(1) # 移除不需要的列 df_test_1_Day.index = df_test_1_Day["Date"] df_test_1_Day = df_test_1_Day.drop(["Date"], axis=1) df_test_1_Day = df_test_1_Day.drop('Price', axis=1) df_test_1_Day = df_test_1_Day.dropna() for col in df_test_1_Day.columns: df_test_1_Day[col] = pd.to_numeric(df_test_1_Day[col], errors='coerce') # 预测今日价格,显示至小数点后两位 Ypredict_Today = Best_model_DalyLGPrice.predict(df_test_1_Day) df_test_1_Day['日度预测价格'] = Ypredict_Today print(df_test_1_Day['日度预测价格']) a = df_test_1_Day['日度预测价格'] a = a[0] a = float(a) a = round(a, 2) return a def optimize_Model(): from sklearn.model_selection import train_test_split from sklearn.impute import SimpleImputer from sklearn.preprocessing import OrdinalEncoder from sklearn.feature_selection import SelectFromModel from sklearn.metrics import mean_squared_error, r2_score import pandas as pd pd.set_option('display.max_rows', 40) pd.set_option('display.max_columns', 40) df_test = pd.read_excel('纯苯数据项.xls') df_test.drop([0], inplace=True) # df_test['Date']=pd.to_datetime(df_test['Date'], format='%m/%d/%Y',infer_datetime_format=True) df_test['Date'] = pd.to_datetime( df_test['Date'], format='%Y-%m-%d', infer_datetime_format=True) # 将缺失值补为前一个或者后一个数值 df_test_1 = df_test df_test_1 = df_test_1.fillna(df_test.ffill()) df_test_1 = df_test_1.fillna(df_test_1.bfill()) df_test_1["Date"] = pd.to_datetime(df_test_1["Date"]) df_test_1.index = df_test_1["Date"] df_test_1 = df_test_1.drop(["Date"], axis=1) df_test_1 = df_test_1.astype('float') import numpy as np import pandas as pd from pandas import Series, DataFrame import matplotlib.pyplot as plt import sklearn.datasets as datasets # 导入机器学习算法模型 from sklearn.linear_model import Lasso from xgboost import XGBRegressor import statsmodels.api as sm try: from keras.preprocessing.sequence import TimeseriesGenerator except: from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator import plotly.express as px import plotly.graph_objects as go import xgboost as xgb from xgboost import plot_importance, plot_tree from sklearn.metrics import mean_absolute_error from statsmodels.tools.eval_measures import mse, rmse from sklearn.model_selection import GridSearchCV from xgboost import XGBRegressor import warnings import pickle from sklearn.metrics import mean_squared_error # 切割训练数据和样本数据 from sklearn.model_selection import train_test_split # 用于模型评分 from sklearn.metrics import r2_score dataset1 = df_test_1.drop('Price', axis=1) # .astype(float) y = df_test_1['Price'] x = dataset1 train = x target = y # 切割数据样本集合测试集 X_train, x_test, y_train, y_true = train_test_split( train, target, test_size=0.2, random_state=0) # 模型缩写 Lasso = Lasso(random_state=0) XGBR = XGBRegressor(random_state=0) # 训练模型 Lasso.fit(X_train, y_train) XGBR.fit(X_train, y_train) # 模型拟合 y_pre_Lasso = Lasso.predict(x_test) y_pre_XGBR = XGBR.predict(x_test) # 计算Lasso、XGBR、RandomForestR、AdaBoostR、GradientBoostingR、BaggingRegressor各模型的R² Lasso_score = r2_score(y_true, y_pre_Lasso) XGBR_score = r2_score(y_true, y_pre_XGBR) # 计算Lasso、XGBR的MSE和RMSE Lasso_MSE = mean_squared_error(y_true, y_pre_Lasso) XGBR_MSE = mean_squared_error(y_true, y_pre_XGBR) Lasso_RMSE = np.sqrt(Lasso_MSE) XGBR_RMSE = np.sqrt(XGBR_MSE) # 将不同模型的不同误差值整合成一个表格 model_results = pd.DataFrame([['Lasso', Lasso_RMSE, Lasso_score], ['XgBoost', XGBR_RMSE, XGBR_score]], columns=['模型(Model)', '均方根误差(RMSE)', 'R^2 score']) # 将模型名称(Model)列设置为索引 model_results1 = model_results.set_index('模型(Model)') model_results1 # 定义plot_feature_importance函数,该函数用于计算特征重要性。此部分代码无需调整 def plot_feature_importance(importance, names, model_type): feature_importance = np.array(importance) feature_names = np.array(names) data = {'feature_names': feature_names, 'feature_importance': feature_importance} fi_df = pd.DataFrame(data) fi_df.sort_values(by=['feature_importance'], ascending=False, inplace=True) plt.figure(figsize=(10, 8)) sn.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names']) plt.title(model_type + " "+'FEATURE IMPORTANCE') plt.xlabel('FEATURE IMPORTANCE') plt.ylabel('FEATURE NAMES') from pylab import mpl mpl.rcParams['font.sans-serif'] = ['SimHei'] # Xgboost 模型参数优化-初步 # 参考: https://juejin.im/post/6844903661013827598 # 每次调参时,备选参数数值以同数量级的1、3、10设置即可(比如设置1、3、10,或0.1、0.3、1.0,或0.01,0.03,0.10即可) from xgboost import XGBRegressor from sklearn.model_selection import GridSearchCV estimator = XGBRegressor(random_state=0, nthread=4, seed=0 ) parameters = { 'max_depth': range(2, 11, 2), # 树的最大深度 'n_estimators': range(50, 101, 10), # 迭代次数 'learning_rate': [0.01, 0.03, 0.1, 0.3, 0.5, 1] } grid_search_XGB = GridSearchCV( estimator=estimator, param_grid=parameters, # n_jobs = 10, cv=3, verbose=True ) grid_search_XGB.fit(X_train, y_train) # 如果电脑在此步骤报错,可能是因为计算量太大,超过硬件可支持程度,可注释掉“n_jobs=10”一行 best_parameters = grid_search_XGB.best_estimator_.get_params() y_pred = grid_search_XGB.predict(x_test) op_XGBR_score = r2_score(y_true, y_pred) op_XGBR_MSE = mean_squared_error(y_true, y_pred) op_XGBR_RMSE = np.sqrt(op_XGBR_MSE) model_results2 = pd.DataFrame([['Optimized_Xgboost', op_XGBR_RMSE, op_XGBR_score]], columns=['模型(Model)', '均方根误差(RMSE)', 'R^2 score']) model_results2 = model_results2.set_index('模型(Model)') # results = model_results1.append(model_results2, ignore_index = False) results = pd.concat([model_results1, model_results2], ignore_index=True) import pickle Pkl_Filename = "日度价格预测_最佳模型.pkl" with open(Pkl_Filename, 'wb') as file: pickle.dump(grid_search_XGB, file) def read_xls_data(): global one_cols, two_cols # 打开 XLS 文件 workbook = xlrd.open_workbook(read_file_path_name) # 获取所有表格名称 # sheet_names = workbook.sheet_names() # 选择第一个表格 sheet = workbook.sheet_by_index(0) # 获取行数和列数 num_rows = sheet.nrows # num_cols = sheet.ncols # 遍历每一行,获取单元格数据 # for i in range(num_rows): # row_data = sheet.row_values(i) # one_cols.append(row_data) # two_cols.append(row_data[1]) row_data = sheet.row_values(1) print(f'获取到的数据项ID{row_data}') one_cols = row_data[1:] print(f'获取到的数据项ID{one_cols}') # 关闭 XLS 文件 # workbook.close() def start(date=None, token=None, token_push=None): read_xls_data() if date == None: date = getNow()[0] if token == None: token = get_head_auth() token_push = get_head_push_auth() datas = get_data_value(token, one_cols, date) if not datas: print("今天没有新数据") return # data_list = [two_cols, one_cols] append_rows = [getNow(date)[1]] # append_rows = [getNow()[1]] dataItemNo_dataValue = {} for data_value in datas: if "dataValue" not in data_value: print(data_value) dataItemNo_dataValue[data_value["dataItemNo"]] = "" else: dataItemNo_dataValue[data_value["dataItemNo"] ] = data_value["dataValue"] for value in one_cols: if value in dataItemNo_dataValue: append_rows.append(dataItemNo_dataValue[value]) else: append_rows.append("") save_xls(append_rows) # 获取当月的数据写入到指定文件,如果是补充数据,不需要执行 queryDataListItemNos(date=date) # 模型训练 optimize_Model() # 上传预测数据 upload_data_to_system(token_push, date) # data_list.append(three_cols) # write_xls(data_list) def start_1(date=None): read_xls_data() if date == None: date = getNow(offset=1)[0] token = get_head_auth() if not token: return datas = get_data_value(token, one_cols, date=date) # if not datas: # return # data_list = [two_cols, one_cols] append_rows = [getNow(offset=1)[1]] dataItemNo_dataValue = {} for data_value in datas: if "dataValue" not in data_value: print(data_value) dataItemNo_dataValue[data_value["dataItemNo"]] = "" else: dataItemNo_dataValue[data_value["dataItemNo"] ] = data_value["dataValue"] for value in one_cols: if value in dataItemNo_dataValue: append_rows.append(dataItemNo_dataValue[value]) else: append_rows.append("") save_xls_1(append_rows) # data_list.append(three_cols) # write_xls(data_list) def save_xls_1(append_rows): # 打开xls文件 workbook = xlrd.open_workbook('纯苯数据项.xls') # 获取所有sheet的个数 sheet_count = len(workbook.sheet_names()) # 获取所有sheet的名称 sheet_names = workbook.sheet_names() new_workbook = xlwt.Workbook() for i in range(sheet_count): # 获取当前sheet sheet = workbook.sheet_by_index(i) # 获取sheet的行数和列数 row_count = sheet.nrows - 1 col_count = sheet.ncols # 获取原有数据 data = [] for row in range(row_count): row_data = [] for col in range(col_count): row_data.append(sheet.cell_value(row, col)) data.append(row_data) # 创建xlwt的Workbook对象 # 创建sheet new_sheet = new_workbook.add_sheet(sheet_names[i]) # 将原有的数据写入新的sheet for row in range(row_count): for col in range(col_count): new_sheet.write(row, col, data[row][col]) if i == 0: # 在新的sheet中添加数据 for col in range(col_count): new_sheet.write(row_count, col, append_rows[col]) # 保存新的xls文件 new_workbook.save("纯苯数据项.xls") def check_data(dataItemNo): token = get_head_auth() if not token: return datas = get_data_value(token, dataItemNo) if not datas: return def save_xls(append_rows): # 打开xls文件 workbook = xlrd.open_workbook('纯苯数据项.xls') # 获取所有sheet的个数 sheet_count = len(workbook.sheet_names()) # 获取所有sheet的名称 sheet_names = workbook.sheet_names() new_workbook = xlwt.Workbook() for i in range(sheet_count): # 获取当前sheet sheet = workbook.sheet_by_index(i) # 获取sheet的行数和列数 row_count = sheet.nrows col_count = sheet.ncols # 获取原有数据 data = [] for row in range(row_count): row_data = [] for col in range(col_count): row_data.append(sheet.cell_value(row, col)) data.append(row_data) # 创建xlwt的Workbook对象 # 创建sheet new_sheet = new_workbook.add_sheet(sheet_names[i]) # 将原有的数据写入新的sheet for row in range(row_count): for col in range(col_count): new_sheet.write(row, col, data[row][col]) if i == 0: # 在新的sheet中添加数据 for col in range(col_count): new_sheet.write(row_count, col, append_rows[col]) # 保存新的xls文件 new_workbook.save("纯苯数据项.xls") def get_queryDataListItemNos_value(token, url, dataItemNoList, dateStart, dateEnd): search_data = { "funcModule": "数据项", "funcOperation": "查询", "data": { "dateStart": dateStart, "dateEnd": dateEnd, "dataItemNoList": dataItemNoList # 数据项编码,代表 brent最低价和最高价 } } headers = {"Authorization": token} search_res = requests.post( url=url, headers=headers, json=search_data, timeout=(3, 5)) search_value = json.loads(search_res.text)["data"] if search_value: return search_value else: return None def save_queryDataListItemNos_xls(data_df, dataItemNoList): from datetime import datetime, timedelta current_year_month = datetime.now().strftime('%Y-%m') grouped = data_df.groupby("dataDate") # 打开xls文件 workbook = xlrd.open_workbook('纯苯数据项.xls') # 获取所有sheet的个数 sheet_count = len(workbook.sheet_names()) # 获取所有sheet的名称 sheet_names = workbook.sheet_names() new_workbook = xlwt.Workbook() for i in range(sheet_count): # 获取当前sheet sheet = workbook.sheet_by_index(i) # 获取sheet的行数和列数 row_count = sheet.nrows col_count = sheet.ncols # 获取原有数据 data = [] for row in range(row_count): row_data = [] for col in range(col_count): row_data.append(sheet.cell_value(row, col)) data.append(row_data) # 创建xlwt的Workbook对象 # 创建sheet new_sheet = new_workbook.add_sheet(sheet_names[i]) current_year_month_row = 0 # 将原有的数据写入新的sheet for row in range(row_count): for col in range(col_count): col0 = data[row][0] # print("col0",col0[:7]) if col0[:7] == current_year_month: current_year_month_row += 1 break new_sheet.write(row, col, data[row][col]) # print("current_year_month_row",current_year_month_row) if i == 0: rowFlag = 0 # 查看每组数据 for date, group in grouped: new_sheet.write(row_count + rowFlag - current_year_month_row, 0, date) for j in range(len(dataItemNoList)): dataItemNo = dataItemNoList[j] if group[group["dataItemNo"] == dataItemNo]["dataValue"].values and (not str(group[group["dataItemNo"] == dataItemNo]["dataValue"].values[0]) == 'nan'): new_sheet.write(row_count + rowFlag - current_year_month_row, j + 1, group[group["dataItemNo"] == dataItemNo]["dataValue"].values[0]) rowFlag += 1 # 保存新的xls文件 new_workbook.save("纯苯数据项.xls") def queryDataListItemNos(date=None, token=None): from datetime import datetime, timedelta df = pd.read_excel('纯苯数据项.xls') dataItemNoList = df.iloc[0].tolist()[1:] if token is None: token = get_head_auth() if not token: print('token获取失败') return # 获取当前日期 if date is None: current_date = datetime.now() else: current_date = date # 获取当月1日 first_day_of_month = current_date.replace(day=1) # 格式化为 YYYYMMDD 格式 dateEnd = current_date.strftime('%Y%m%d') # dateEnd = date.strftime('%Y%m%d') dateStart = first_day_of_month.strftime('%Y%m%d') search_value = get_queryDataListItemNos_value( token, queryDataListItemNos_url, dataItemNoList, dateStart, dateEnd) data_df = pd.DataFrame(search_value) data_df["dataDate"] = pd.to_datetime(data_df["dataDate"]) data_df["dataDate"] = data_df["dataDate"].dt.strftime('%Y-%m-%d') save_queryDataListItemNos_xls(data_df, dataItemNoList) print('当月数据更新完成') if __name__ == "__main__": print('运行中ing') start() # 自定义日期执行预测 # start_date = datetime(2025, 7, 6) # end_date = datetime(2025, 7, 7) # token = get_head_auth() # token_push = get_head_push_auth() # while start_date < end_date: # print(start_date.strftime('%Y%m%d')) # start(start_date, token, token_push) # time.sleep(2) # # start_1(start_date) # start_date += timedelta(days=1)