# Listing metadata carried over from the source viewer: 633 lines, 18 KiB, Python.
import requests
|
||
import json
|
||
import xlrd
|
||
import xlwt
|
||
from datetime import datetime
|
||
import time
|
||
# Variable definitions: service endpoints, login payloads and workbook name.

# Endpoint used by get_head_auth() to obtain a token for queries.
login_url = "http://10.200.32.39/jingbo-api/api/server/login"
# Endpoint used by get_data_value() to query values by data item number.
search_url = "http://10.200.32.39/jingbo-api/api/warehouse/dwDataItem/queryByItemNos"

# Endpoint used by get_head_push_auth(); currently the same URL as login_url.
login_push_url = "http://10.200.32.39/jingbo-api/api/server/login"
# Endpoint used by upload_data_to_system() to push the forecast value.
upload_url = "http://10.200.32.39/jingbo-api/api/dw/dataValue/pushDataValueList"

# JSON body posted to login_url.
# NOTE(review): account credentials are embedded in source here — consider
# moving them to configuration; confirm with the service owner.
login_data = {
    "data": {
        "account": "api_dev",
        "password": "ZTEwYWRjMzk0OWJhNTlhYmJlNTZlMDU3ZjIwZjg4M2U=",
        "tenantHashCode": "8a4577dbd919675758d57999a1e891fe",
        "terminal": "API"
    },
    "funcModule": "API",
    "funcOperation": "获取token"
}

# JSON body posted to login_push_url; same content as login_data but a
# separate dict object.
login_push_data = {
    "data": {
        "account": "api_dev",
        "password": "ZTEwYWRjMzk0OWJhNTlhYmJlNTZlMDU3ZjIwZjg4M2U=",
        "tenantHashCode": "8a4577dbd919675758d57999a1e891fe",
        "terminal": "API"
    },
    "funcModule": "API",
    "funcOperation": "获取token"
}

# Workbook opened by read_xls_data().
read_file_path_name = "丙烯基础数据收集表.xls"
# Filled by read_xls_data() with the values of the workbook's second row.
one_cols = []
# Declared global in read_xls_data() but never assigned there.
two_cols = []
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
import matplotlib.pyplot as plt
|
||
import seaborn as sn
|
||
import random
|
||
import time
|
||
|
||
|
||
|
||
|
||
from plotly import __version__
|
||
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
|
||
|
||
from sklearn import preprocessing
|
||
|
||
from pandas import Series,DataFrame
|
||
|
||
import matplotlib.pyplot as plt
|
||
|
||
import sklearn.datasets as datasets
|
||
|
||
#导入机器学习算法模型
|
||
from sklearn.linear_model import Lasso
|
||
from xgboost import XGBRegressor
|
||
|
||
import datetime
|
||
import statsmodels.api as sm
|
||
from keras.preprocessing.sequence import TimeseriesGenerator
|
||
|
||
import plotly.express as px
|
||
import plotly.graph_objects as go
|
||
|
||
import xgboost as xgb
|
||
from xgboost import plot_importance, plot_tree
|
||
from sklearn.metrics import mean_absolute_error
|
||
from statsmodels.tools.eval_measures import mse,rmse
|
||
from sklearn.model_selection import GridSearchCV
|
||
from xgboost import XGBRegressor
|
||
import warnings
|
||
import pickle
|
||
|
||
from sklearn.metrics import mean_squared_error
|
||
|
||
#切割训练数据和样本数据
|
||
from sklearn.model_selection import train_test_split
|
||
|
||
#用于模型评分
|
||
from sklearn.metrics import r2_score
|
||
|
||
# Module-level LabelEncoder instance, created at import time.
le = preprocessing.LabelEncoder()

# print(__version__) # requires version >= 1.9.0

import cufflinks as cf
# Import-time side effect: calls cufflinks' go_offline().
cf.go_offline()

# Seed Python's built-in `random` module at import time.
random.seed(100)
|
||
|
||
|
||
|
||
# Data retrieval
|
||
|
||
def get_head_auth():
    """Log in to the query API and return its access token.

    POSTs ``login_data`` as JSON to ``login_url``.

    Returns:
        The ``accessToken`` found under the response's ``data`` object when
        the response's ``status`` field is truthy. Otherwise a failure
        message is printed and ``None`` is returned.
    """
    response = requests.post(url=login_url, json=login_data, timeout=(3, 5))
    payload = json.loads(response.text)

    # Guard clause: bail out early on a rejected login.
    if not payload["status"]:
        print("获取认证失败")
        return None

    return payload["data"]["accessToken"]
|
||
|
||
|
||
def get_data_value(token, dataItemNoList):
    """Query today's values for the given data item numbers.

    Args:
        token: Value sent in the ``Authorization`` request header.
        dataItemNoList: Data item numbers placed in the request body.

    Returns:
        The ``data`` field of the JSON response, returned unchanged even
        when it is empty (a notice is printed in that case).
    """
    request_body = {
        "data": {
            # Compact YYYYMMDD form of today's date.
            "date": get_cur_time()[0],
            "dataItemNoList": dataItemNoList,
        },
        "funcModule": "数据项",
        "funcOperation": "查询",
    }
    response = requests.post(
        url=search_url,
        headers={"Authorization": token},
        json=request_body,
        timeout=(3, 5),
    )
    result = json.loads(response.text)["data"]

    if not result:
        print("今天没有新数据")
    return result
|
||
|
||
|
||
# XLS file handling
|
||
|
||
def write_xls(data):
    """Write rows of cell values to a new ``<YYYYMMDD>.xls`` workbook.

    Args:
        data: Iterable of rows, each an iterable of cell values. Item ``j``
            of row ``i`` is written to cell ``(i, j)`` of a sheet named
            ``Sheet1``.
    """
    workbook = xlwt.Workbook()

    # xlwt creates sheets with add_sheet(); Workbook has no load() method,
    # so the previous call failed before anything was written.
    sheet = workbook.add_sheet('Sheet1')

    for row_index, row_data in enumerate(data):
        for col_index, cell_data in enumerate(row_data):
            sheet.write(row_index, col_index, cell_data)

    # The file is named after today's date in compact YYYYMMDD form.
    workbook.save(get_cur_time()[0] + '.xls')
|
||
|
||
|
||
def get_cur_time():
    """Return the current local date in two string forms.

    Returns:
        A ``(compact, dashed)`` tuple: ``YYYYMMDD`` and ``YYYY-MM-DD``,
        with month and day zero-padded to two digits.
    """
    # `datetime` is the module here: the later `import datetime` at module
    # level rebinds the name set by `from datetime import datetime`.
    today = datetime.datetime.now()

    year_text = str(today.year)
    month_text = str(today.month).zfill(2)
    day_text = str(today.day).zfill(2)

    compact = year_text + month_text + day_text
    dashed = "-".join((year_text, month_text, day_text))
    return compact, dashed
|
||
|
||
|
||
def get_head_push_auth():
    """Log in to the push API and return its access token.

    POSTs ``login_push_data`` as JSON to ``login_push_url``.

    Returns:
        The ``accessToken`` from the response's ``data`` object when the
        response's ``status`` field is truthy; otherwise ``None`` after
        printing a failure message.
    """
    reply = json.loads(
        requests.post(url=login_push_url, json=login_push_data, timeout=(3, 5)).text
    )

    if reply["status"]:
        return reply["data"]["accessToken"]

    print("获取认证失败")
    return None
|
||
|
||
|
||
|
||
def upload_data_to_system(token_push):
    """Push today's forecast price to the upload endpoint.

    Builds a single ``add`` record for data item
    ``C01100007|Forecast_Price|ACN`` dated today, with the value returned
    by ``forecast_price()``, POSTs it to ``upload_url`` and prints the
    response body.

    Args:
        token_push: Value sent in the ``Authorization`` request header.
    """
    record = {
        "dataItemNo": "C01100007|Forecast_Price|ACN",
        "dataDate": get_cur_time()[0],
        "dataStatus": "add",
        "dataValue": forecast_price(),
    }
    payload = {
        "funcModule": "数据表信息列表",
        "funcOperation": "新增",
        "data": [record],
    }
    response = requests.post(
        url=upload_url,
        headers={"Authorization": token_push},
        json=payload,
        timeout=(3, 5),
    )
    print(response.text)
|
||
|
||
|
||
# def upload_data_to_system(token):
|
||
# data = {
|
||
# "funcModule": "数据表信息列表",
|
||
# "funcOperation": "新增",
|
||
# "data": [
|
||
# {"dataItemNo": "C01100036|Forecast_ Price|ACN",
|
||
# "dataDate": '20230706',
|
||
# "dataStatus": "add",
|
||
# "dataValue": 3780.0
|
||
# }
|
||
|
||
# ]
|
||
# }
|
||
# headers = {"Authorization": token}
|
||
# res = requests.post(url=upload_url, headers=headers, json=data, timeout=(3, 5))
|
||
# print(res.text)
|
||
|
||
|
||
|
||
def forecast_price():
    """Predict the price for the most recent row of the data workbook.

    Reads ``丙烯基础数据收集表.xls``, fills gaps forward then backward, takes
    the last row, removes the ``Date`` and ``Price`` columns and feeds the
    remaining columns to the model stored in ``日度价格预测_丙烯最佳模型.pkl``.
    The prediction is also printed.

    Returns:
        The predicted price as a ``float`` rounded to two decimals.

    Raises:
        ValueError: If the latest row still contains missing values after
            filling, leaving nothing to predict on.
    """
    import joblib

    df_test = pd.read_excel('丙烯基础数据收集表.xls')
    # Presumably the row of data item numbers that read_xls_data() reads
    # from the sheet's second row — TODO confirm.
    df_test.drop([0], inplace=True)
    df_test['Date'] = pd.to_datetime(df_test['Date'], format='%m/%d/%Y', infer_datetime_format=True)

    # Fill gaps with the previous value, then any leading gaps with the next.
    filled = df_test.fillna(df_test.ffill())
    filled = filled.fillna(filled.bfill())

    # NOTE(review): this file is written with pickle.dump() in
    # optimize_Model(); unpickling executes code, so only load files this
    # pipeline produced itself.
    model = joblib.load("日度价格预测_丙烯最佳模型.pkl")

    # The latest day is the last row; copy so later edits do not touch
    # the filled frame.
    latest = filled.tail(1).copy()
    latest.index = latest["Date"]
    features = latest.drop(["Date"], axis=1).drop('Price', axis=1).dropna()
    if features.empty:
        raise ValueError("latest row has missing feature values; cannot forecast")

    # optimize_Model() casts the training frame to float; do the same here
    # so the model sees the dtypes it was trained on.
    features = features.astype('float')

    features['日度预测价格'] = model.predict(features)
    print(features['日度预测价格'])

    # The series is indexed by date, so take the first element by position
    # (.iloc) rather than relying on the deprecated integer-key fallback.
    return round(float(features['日度预测价格'].iloc[0]), 2)
|
||
def optimize_Model():
    """Retrain the price models on the data workbook and save the tuned one.

    Steps, all based on ``丙烯基础数据收集表.xls``:

    1. Load the sheet, drop row label 0, parse ``Date``, fill gaps forward
       then backward, index by date and cast everything to float.
    2. Hold out 20% of the rows (``random_state=0``) and fit a default
       Lasso and a default XGBoost regressor as baselines.
    3. Grid-search XGBoost over tree depth, estimator count and learning
       rate with 3-fold cross-validation.
    4. Pickle the fitted ``GridSearchCV`` object to
       ``日度价格预测_丙烯最佳模型.pkl``.

    RMSE / R^2 tables for the three models are built in ``results`` but are
    neither printed nor returned. The function returns ``None``.
    """
    import pickle

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from pylab import mpl
    from sklearn.linear_model import Lasso
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.model_selection import GridSearchCV, train_test_split
    from xgboost import XGBRegressor

    pd.set_option('display.max_rows', 40)
    pd.set_option('display.max_columns', 40)

    df_test = pd.read_excel('丙烯基础数据收集表.xls')
    df_test.drop([0], inplace=True)
    df_test['Date'] = pd.to_datetime(df_test['Date'], format='%m/%d/%Y', infer_datetime_format=True)

    # NOTE(review): the previous code computed per-column missing ratios and
    # a frame without the >40%-missing columns, then immediately replaced
    # that frame with the full one. The full frame is kept on purpose here:
    # forecast_price() feeds every column except Date/Price to the saved
    # model, so dropping columns only at training time would break it.
    df_test_1 = df_test.fillna(df_test.ffill())
    df_test_1 = df_test_1.fillna(df_test_1.bfill())
    df_test_1["Date"] = pd.to_datetime(df_test_1["Date"])
    df_test_1.index = df_test_1["Date"]
    df_test_1 = df_test_1.drop(["Date"], axis=1)
    df_test_1 = df_test_1.astype('float')

    # Features are every column except the target.
    x = df_test_1.drop('Price', axis=1)
    y = df_test_1['Price']

    # Split into training and held-out sets.
    X_train, x_test, y_train, y_true = train_test_split(x, y, test_size=0.2, random_state=0)

    # Baseline models. The instances get their own names so the Lasso class
    # is not shadowed.
    lasso_model = Lasso(random_state=0)
    XGBR = XGBRegressor(random_state=0)
    lasso_model.fit(X_train, y_train)
    XGBR.fit(X_train, y_train)

    y_pre_Lasso = lasso_model.predict(x_test)
    y_pre_XGBR = XGBR.predict(x_test)

    # R^2 and RMSE of the baselines on the held-out set.
    Lasso_score = r2_score(y_true, y_pre_Lasso)
    XGBR_score = r2_score(y_true, y_pre_XGBR)
    Lasso_RMSE = np.sqrt(mean_squared_error(y_true, y_pre_Lasso))
    XGBR_RMSE = np.sqrt(mean_squared_error(y_true, y_pre_XGBR))

    model_results = pd.DataFrame(
        [['Lasso', Lasso_RMSE, Lasso_score],
         ['XgBoost', XGBR_RMSE, XGBR_score]],
        columns=['模型(Model)', '均方根误差(RMSE)', 'R^2 score'])
    model_results1 = model_results.set_index('模型(Model)')

    # Helper that draws a bar chart of feature importances. It is defined
    # here but not called anywhere in this function.
    def plot_feature_importance(importance, names, model_type):
        feature_importance = np.array(importance)
        feature_names = np.array(names)

        data = {'feature_names': feature_names, 'feature_importance': feature_importance}
        fi_df = pd.DataFrame(data)
        fi_df.sort_values(by=['feature_importance'], ascending=False, inplace=True)

        plt.figure(figsize=(10, 8))
        sn.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])

        plt.title(model_type + " " + 'FEATURE IMPORTANCE')
        plt.xlabel('FEATURE IMPORTANCE')
        plt.ylabel('FEATURE NAMES')

    # Presumably set so that Chinese labels render in plots — TODO confirm.
    mpl.rcParams['font.sans-serif'] = ['SimHei']

    # XGBoost hyper-parameter search (first pass).
    # Reference: https://juejin.im/post/6844903661013827598
    # Candidate values follow a 1 / 3 / 10 pattern within one order of magnitude.
    estimator = XGBRegressor(random_state=0, nthread=4, seed=0)
    parameters = {
        'max_depth': range(2, 11, 2),         # maximum tree depth
        'n_estimators': range(50, 101, 10),   # number of boosting rounds
        'learning_rate': [0.01, 0.03, 0.1, 0.3, 0.5, 1],
    }
    grid_search_XGB = GridSearchCV(
        estimator=estimator,
        param_grid=parameters,
        # n_jobs is deliberately left at its default; it was disabled in
        # the original to avoid overloading the machine.
        cv=3,
        verbose=True,
    )
    grid_search_XGB.fit(X_train, y_train)

    y_pred = grid_search_XGB.predict(x_test)
    op_XGBR_score = r2_score(y_true, y_pred)
    op_XGBR_RMSE = np.sqrt(mean_squared_error(y_true, y_pred))

    model_results2 = pd.DataFrame(
        [['Optimized_Xgboost', op_XGBR_RMSE, op_XGBR_score]],
        columns=['模型(Model)', '均方根误差(RMSE)', 'R^2 score'])
    model_results2 = model_results2.set_index('模型(Model)')

    # DataFrame.append() was removed in pandas 2.0; concat keeps both
    # frames' indexes exactly as append(..., ignore_index=False) did.
    results = pd.concat([model_results1, model_results2])

    # Persist the whole fitted search object; forecast_price() loads it.
    Pkl_Filename = "日度价格预测_丙烯最佳模型.pkl"
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump(grid_search_XGB, file)
|
||
|
||
|
||
|
||
|
||
def read_xls_data():
    """Load the second row of the workbook's first sheet into ``one_cols``.

    Opens ``read_file_path_name`` with xlrd and rebinds the module-level
    ``one_cols`` to the list of values in row index 1 of sheet index 0.
    ``two_cols`` is declared global but left unchanged.
    """
    global one_cols, two_cols

    first_sheet = xlrd.open_workbook(read_file_path_name).sheet_by_index(0)
    one_cols = first_sheet.row_values(1)
|
||
|
||
|
||
|
||
|
||
def start():
    """Run the daily pipeline: fetch, append, retrain, upload.

    Reads the data item numbers from the workbook, logs in to both APIs,
    queries today's values, appends them as a new row to the workbook via
    ``save_xls()``, retrains the model and uploads the new forecast.
    Returns early, doing nothing further, if either login fails.
    """
    read_xls_data()

    token = get_head_auth()
    if not token:
        return
    token_push = get_head_push_auth()
    if not token_push:
        return

    # The first cell of the row is skipped; the rest are sent as item numbers.
    item_numbers = one_cols[1:]
    datas = get_data_value(token, item_numbers)

    # The new row starts with today's date in YYYY-MM-DD form.
    append_rows = [get_cur_time()[1]]

    # A day without data still gets a row of blanks, so tolerate an empty
    # or null payload instead of failing on iteration.
    value_by_item = {}
    for data_value in datas or []:
        if "dataValue" not in data_value:
            # Show the incomplete record and store a blank for it.
            print(data_value)
            value_by_item[data_value["dataItemNo"]] = ""
        else:
            value_by_item[data_value["dataItemNo"]] = data_value["dataValue"]

    # Keep the workbook's column order; items missing from the reply are blank.
    append_rows.extend(value_by_item.get(item, "") for item in item_numbers)

    save_xls(append_rows)
    optimize_Model()
    upload_data_to_system(token_push)
|
||
|
||
|
||
def start_1():
    """Fetch today's values and write them over the workbook's last row.

    Same fetch-and-build steps as ``start()``, but the row is stored with
    ``save_xls_1()`` and no retraining or upload is performed. Returns
    early if the login fails.
    """
    read_xls_data()

    token = get_head_auth()
    if not token:
        return

    # The first cell of the row is skipped; the rest are sent as item numbers.
    item_numbers = one_cols[1:]
    datas = get_data_value(token, item_numbers)

    # The row starts with today's date in YYYY-MM-DD form.
    append_rows = [get_cur_time()[1]]

    # Tolerate an empty or null payload so a row of blanks is still written.
    value_by_item = {}
    for data_value in datas or []:
        if "dataValue" not in data_value:
            # Show the incomplete record and store a blank for it.
            print(data_value)
            value_by_item[data_value["dataItemNo"]] = ""
        else:
            value_by_item[data_value["dataItemNo"]] = data_value["dataValue"]

    # Keep the workbook's column order; items missing from the reply are blank.
    append_rows.extend(value_by_item.get(item, "") for item in item_numbers)

    save_xls_1(append_rows)
|
||
|
||
def save_xls_1(append_rows):
    """Rewrite the data workbook, replacing the first sheet's last row.

    Every sheet of ``丙烯基础数据收集表.xls`` is copied value by value into a
    new xlwt workbook that is saved under the same file name. On the first
    sheet the last existing row is left out and ``append_rows`` is written
    in its place. All other sheets are copied in full.

    Args:
        append_rows: Cell values for the replacement row. It is indexed
            for every column of the first sheet, so it must be at least
            that long.
    """
    workbook = xlrd.open_workbook('丙烯基础数据收集表.xls')
    new_workbook = xlwt.Workbook()

    for i, sheet_name in enumerate(workbook.sheet_names()):
        sheet = workbook.sheet_by_index(i)
        col_count = sheet.ncols

        # Only the first sheet receives a replacement row, so only there is
        # the last row skipped. Skipping it on every sheet would silently
        # shorten the others by one row per call.
        if i == 0:
            row_count = max(sheet.nrows - 1, 0)
        else:
            row_count = sheet.nrows

        new_sheet = new_workbook.add_sheet(sheet_name)

        # Copy the existing cell values.
        for row in range(row_count):
            for col in range(col_count):
                new_sheet.write(row, col, sheet.cell_value(row, col))

        if i == 0:
            # Write the replacement row where the dropped row used to be.
            for col in range(col_count):
                new_sheet.write(row_count, col, append_rows[col])

    new_workbook.save("丙烯基础数据收集表.xls")
|
||
|
||
|
||
|
||
|
||
def check_data(dataItemNo):
    """Log in and query values for ``dataItemNo``, discarding the result.

    Args:
        dataItemNo: Passed through to ``get_data_value()`` as the item list.

    Returns:
        Always ``None``. The only visible effects are the messages printed
        by the helper functions.
    """
    token = get_head_auth()
    if token:
        get_data_value(token, dataItemNo)
|
||
|
||
|
||
def save_xls(append_rows):
    """Rewrite the data workbook with one extra row on its first sheet.

    Every sheet of ``丙烯基础数据收集表.xls`` is copied value by value into a
    new xlwt workbook. On the first sheet ``append_rows`` is written
    directly below the existing rows. The result is saved under the same
    file name.

    Args:
        append_rows: Cell values for the new row. It is indexed for every
            column of the first sheet.
    """
    source_book = xlrd.open_workbook('丙烯基础数据收集表.xls')
    target_book = xlwt.Workbook()

    for sheet_index, sheet_name in enumerate(source_book.sheet_names()):
        source = source_book.sheet_by_index(sheet_index)
        n_rows, n_cols = source.nrows, source.ncols

        # Snapshot the existing values before creating the target sheet.
        rows = [
            [source.cell_value(r, c) for c in range(n_cols)]
            for r in range(n_rows)
        ]

        target = target_book.add_sheet(sheet_name)
        for r, row_values in enumerate(rows):
            for c, value in enumerate(row_values):
                target.write(r, c, value)

        if sheet_index == 0:
            # The new row goes right after the last copied row.
            for c in range(n_cols):
                target.write(n_rows, c, append_rows[c])

    target_book.save("丙烯基础数据收集表.xls")
|
||
|
||
|
||
# Script entry point: run the full pipeline when executed directly.
if __name__ == "__main__":
    start()
|
||
|