Online data maintenance

Commit 34770b83a3 by workpc, 2025-07-09 17:52:29 +08:00 (parent 0ca7553951)
5 changed files with 1037 additions and 1194 deletions


@@ -1,3 +1,29 @@
+from statsmodels.tools.eval_measures import mse, rmse
+from pandas import Series, DataFrame
+import cufflinks as cf
+from sklearn.metrics import r2_score
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+import pickle
+import warnings
+from sklearn.model_selection import GridSearchCV
+from sklearn.metrics import mean_absolute_error
+from xgboost import plot_importance, plot_tree
+import xgboost as xgb
+import plotly.graph_objects as go
+import plotly.express as px
+import statsmodels.api as sm
+from xgboost import XGBRegressor
+from sklearn.linear_model import Lasso
+import sklearn.datasets as datasets
+from sklearn import preprocessing
+from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
+from plotly import __version__
+import random
+import seaborn as sn
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
 import requests
 import json
 import xlrd
@@ -38,69 +64,29 @@ read_file_path_name = "纯苯数据项.xls"
 one_cols = []
 two_cols = []
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plt
-import seaborn as sn
-import random
-import time
-from plotly import __version__
-from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
-from sklearn import preprocessing
-from pandas import Series,DataFrame
-import matplotlib.pyplot as plt
-import sklearn.datasets as datasets
 # Import machine-learning algorithm models
-from sklearn.linear_model import Lasso
-from xgboost import XGBRegressor
-import statsmodels.api as sm
 try:
     from keras.preprocessing.sequence import TimeseriesGenerator
 except:
     from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
-import plotly.express as px
-import plotly.graph_objects as go
-import xgboost as xgb
-from xgboost import plot_importance, plot_tree
-from sklearn.metrics import mean_absolute_error
-from statsmodels.tools.eval_measures import mse,rmse
-from sklearn.model_selection import GridSearchCV
-from xgboost import XGBRegressor
-import warnings
-import pickle
-from sklearn.metrics import mean_squared_error
 # Split the training data and sample data
-from sklearn.model_selection import train_test_split
 # Used for model scoring
-from sklearn.metrics import r2_score
 le = preprocessing.LabelEncoder()
 # print(__version__)  # requires version >= 1.9.0
-import cufflinks as cf
 cf.go_offline()
 random.seed(100)
 # Data fetching
 def get_head_auth():
@@ -124,7 +110,8 @@ def get_data_value(token, dataItemNoList,date):
         "funcOperation": "查询"
     }
     headers = {"Authorization": token}
-    search_res = requests.post(url=search_url, headers=headers, json=search_data, timeout=(3, 5))
+    search_res = requests.post(
+        url=search_url, headers=headers, json=search_data, timeout=(3, 5))
     search_value = json.loads(search_res.text)["data"]
     if search_value:
         return search_value
@@ -136,9 +123,6 @@ def get_data_value(token, dataItemNoList,date):
 # XLS file handling
 def write_xls(data, date):
     # Create a Workbook object
     workbook = xlwt.Workbook()
@@ -155,7 +139,6 @@ def write_xls(data,date):
     workbook.save(get_cur_time(date)[0] + '.xls')
 def getNow(date='', offset=0):
     """Generate two formatted strings for the given date
     Args:
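The body of getNow is outside the hunks shown here. Based only on its docstring and on how dates are used elsewhere in this file (get_cur_time(date)[0] + '.xls' and 'YYYYMMDD' strings for the query API), a plausible sketch is the following; the exact formats and fallback behaviour are assumptions, not the author's implementation:

    from datetime import datetime, timedelta

    def getNow(date='', offset=0):
        # Sketch only: assumes `date` is a datetime (or empty for "today") and that
        # the two formats are '%Y-%m-%d' and '%Y%m%d'.
        now = (date if date else datetime.now()) + timedelta(days=offset)
        return now.strftime('%Y-%m-%d'), now.strftime('%Y%m%d')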
@@ -215,7 +198,8 @@ def get_cur_time(date=''):
 def get_head_push_auth():
-    login_res = requests.post(url=login_push_url, json=login_push_data, timeout=(3, 5))
+    login_res = requests.post(
+        url=login_push_url, json=login_push_data, timeout=(3, 5))
     text = json.loads(login_res.text)
     if text["status"]:
         token = text["data"]["accessToken"]
@@ -225,7 +209,6 @@ def get_head_push_auth():
         return None
 def upload_data_to_system(token_push, date):
     datavalue = forecast_price()
     data = {
@@ -242,18 +225,18 @@ def upload_data_to_system(token_push,date):
     }
     print(data)
     headers = {"Authorization": token_push}
-    res = requests.post(url=upload_url, headers=headers, json=data, timeout=(3, 5))
+    res = requests.post(url=upload_url, headers=headers,
+                        json=data, timeout=(3, 5))
     print(res.text)
 def forecast_price():
     # df_test = pd.read_csv('定价模型数据收集0212.csv')
     df_test = pd.read_excel('纯苯数据项.xls', sheet_name='Sheet1')
     df_test.drop([0], inplace=True)
     # df_test['Date']=pd.to_datetime(df_test['Date'], format='%m/%d/%Y',infer_datetime_format=True)
-    df_test['Date']=pd.to_datetime(df_test['Date'], format=r'%Y-%m-%d',infer_datetime_format=True)
+    df_test['Date'] = pd.to_datetime(
+        df_test['Date'], format=r'%Y-%m-%d', infer_datetime_format=True)
     df_test_1 = df_test
     df_test_1 = df_test_1.fillna(df_test.ffill())
@@ -262,7 +245,6 @@ def forecast_price():
     # Select the column names used for model training
     col_for_training = df_test_1.columns
     import joblib
     Best_model_DalyLGPrice = joblib.load("日度价格预测_最佳模型.pkl")
     # The most recent day is the last row of data
@@ -286,6 +268,8 @@ def forecast_price():
     a = float(a)
     a = round(a, 2)
     return a
 def optimize_Model():
     from sklearn.model_selection import train_test_split
     from sklearn.impute import SimpleImputer
@@ -299,8 +283,8 @@ def optimize_Model():
     df_test = pd.read_excel('纯苯数据项.xls')
     df_test.drop([0], inplace=True)
     # df_test['Date']=pd.to_datetime(df_test['Date'], format='%m/%d/%Y',infer_datetime_format=True)
-    df_test['Date']=pd.to_datetime(df_test['Date'], format='%Y-%m-%d',infer_datetime_format=True)
+    df_test['Date'] = pd.to_datetime(
+        df_test['Date'], format='%Y-%m-%d', infer_datetime_format=True)
     # Fill missing values with the previous or next value
     df_test_1 = df_test
@@ -311,7 +295,6 @@ def optimize_Model():
     df_test_1 = df_test_1.drop(["Date"], axis=1)
     df_test_1 = df_test_1.astype('float')
     import numpy as np
     import pandas as pd
     from pandas import Series, DataFrame
@@ -360,7 +343,8 @@ def optimize_Model():
     target = y
     # Split the data into training and test sets
-    X_train,x_test,y_train,y_true = train_test_split(train,target,test_size=0.2,random_state=0)
+    X_train, x_test, y_train, y_true = train_test_split(
+        train, target, test_size=0.2, random_state=0)
     # Model shorthand
     Lasso = Lasso(random_state=0)
@@ -391,14 +375,17 @@ def optimize_Model():
     model_results1
     # Define plot_feature_importance, which computes feature importance; this part needs no changes
     def plot_feature_importance(importance, names, model_type):
         feature_importance = np.array(importance)
         feature_names = np.array(names)
-        data={'feature_names':feature_names,'feature_importance':feature_importance}
+        data = {'feature_names': feature_names,
+                'feature_importance': feature_importance}
         fi_df = pd.DataFrame(data)
-        fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)
+        fi_df.sort_values(by=['feature_importance'],
+                          ascending=False, inplace=True)
         plt.figure(figsize=(10, 8))
         sn.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
@@ -408,7 +395,7 @@ def optimize_Model():
         plt.ylabel('FEATURE NAMES')
     from pylab import mpl
     mpl.rcParams['font.sans-serif'] = ['SimHei']
-    ## XGBoost model parameter optimization - first pass
+    # XGBoost model parameter optimization - first pass
     # Reference: https://juejin.im/post/6844903661013827598
     # When tuning, candidate values can be set at 1/3/10 within the same order of magnitude (e.g. 1, 3, 10, or 0.1, 0.3, 1.0, or 0.01, 0.03, 0.10)
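The grid search itself falls outside the hunks shown here, so as an illustration of the 1/3/10-per-order-of-magnitude tuning described in the comment above, a minimal sketch might look like the following. The variable name grid_search_XGB comes from this file (it is pickled later); the specific parameter grid and scoring are assumptions, not the author's settings:

    from sklearn.model_selection import GridSearchCV
    from xgboost import XGBRegressor

    # Candidate values stepped roughly as 1/3/10 within each order of magnitude (illustrative grid)
    param_grid = {
        'learning_rate': [0.01, 0.03, 0.1],
        'n_estimators': [100, 300, 1000],
        'max_depth': [3, 5, 7],
    }
    grid_search_XGB = GridSearchCV(XGBRegressor(random_state=0), param_grid,
                                   scoring='neg_mean_squared_error', cv=5)
    # grid_search_XGB.fit(X_train, y_train)  # X_train/y_train as produced by train_test_split above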
@@ -450,8 +437,6 @@ def optimize_Model():
     # results = model_results1.append(model_results2, ignore_index = False)
     results = pd.concat([model_results1, model_results2], ignore_index=True)
     import pickle
     Pkl_Filename = "日度价格预测_最佳模型.pkl"
@@ -460,8 +445,6 @@ def optimize_Model():
         pickle.dump(grid_search_XGB, file)
 def read_xls_data():
     global one_cols, two_cols
     # Open the XLS file
@@ -492,8 +475,6 @@ def read_xls_data():
     # workbook.close()
 def start(date=None, token=None, token_push=None):
     read_xls_data()
     if date == None:
@@ -516,7 +497,8 @@ def start(date=None,token=None,token_push=None):
             print(data_value)
             dataItemNo_dataValue[data_value["dataItemNo"]] = ""
         else:
-            dataItemNo_dataValue[data_value["dataItemNo"]] = data_value["dataValue"]
+            dataItemNo_dataValue[data_value["dataItemNo"]
+                                 ] = data_value["dataValue"]
     for value in one_cols:
         if value in dataItemNo_dataValue:
@@ -526,7 +508,7 @@ def start(date=None,token=None,token_push=None):
     save_xls(append_rows)
     # Fetch the current month's data and write it to the target file; not needed when backfilling data
-    queryDataListItemNos()
+    queryDataListItemNos(date=date)
     # Model training
     optimize_Model()
     # Upload the predicted data
@@ -543,7 +525,6 @@ def start_1(date=None):
     if not token:
         return
     datas = get_data_value(token, one_cols, date=date)
     # if not datas:
     #     return
@@ -556,7 +537,8 @@ def start_1(date=None):
             print(data_value)
             dataItemNo_dataValue[data_value["dataItemNo"]] = ""
         else:
-            dataItemNo_dataValue[data_value["dataItemNo"]] = data_value["dataValue"]
+            dataItemNo_dataValue[data_value["dataItemNo"]
+                                 ] = data_value["dataValue"]
     for value in one_cols:
         if value in dataItemNo_dataValue:
@@ -565,10 +547,10 @@ def start_1(date=None):
             append_rows.append("")
     save_xls_1(append_rows)
     # data_list.append(three_cols)
     # write_xls(data_list)
 def save_xls_1(append_rows):
     # Open the xls file
@@ -613,8 +595,6 @@ def save_xls_1(append_rows):
     new_workbook.save("纯苯数据项.xls")
 def check_data(dataItemNo):
     token = get_head_auth()
     if not token:
@@ -669,8 +649,6 @@ def save_xls(append_rows):
     new_workbook.save("纯苯数据项.xls")
 def get_queryDataListItemNos_value(token, url, dataItemNoList, dateStart, dateEnd):
     search_data = {
@@ -684,7 +662,8 @@ def get_queryDataListItemNos_value(token, url, dataItemNoList, dateStart, dateEnd):
     }
     headers = {"Authorization": token}
-    search_res = requests.post(url=url, headers=headers, json=search_data, timeout=(3, 5))
+    search_res = requests.post(
+        url=url, headers=headers, json=search_data, timeout=(3, 5))
     search_value = json.loads(search_res.text)["data"]
     if search_value:
         return search_value
@@ -692,7 +671,6 @@ def get_queryDataListItemNos_value(token, url, dataItemNoList, dateStart, dateEnd):
         return None
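Judging from how save_queryDataListItemNos_xls consumes this value below (pd.DataFrame(search_value) with dataDate, dataItemNo and dataValue columns, grouped by date), the API appears to return a list of records shaped roughly like the sketch that follows; the concrete values are placeholders, not real data:

    # Hypothetical shape of `search_value` (placeholder values, inferred from the DataFrame code below)
    search_value = [
        {"dataDate": "20250701", "dataItemNo": "EXAMPLE_ITEM_NO_1", "dataValue": 6150.0},
        {"dataDate": "20250701", "dataItemNo": "EXAMPLE_ITEM_NO_2", "dataValue": 98.5},
    ]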
 def save_queryDataListItemNos_xls(data_df, dataItemNoList):
     from datetime import datetime, timedelta
     current_year_month = datetime.now().strftime('%Y-%m')
@@ -727,7 +705,6 @@ def save_queryDataListItemNos_xls(data_df,dataItemNoList):
         # Create the sheet
         new_sheet = new_workbook.add_sheet(sheet_names[i])
         current_year_month_row = 0
         # Write the existing data into the new sheet
         for row in range(row_count):
@@ -739,28 +716,27 @@ def save_queryDataListItemNos_xls(data_df,dataItemNoList):
                     break
                 new_sheet.write(row, col, data[row][col])
         # print("current_year_month_row", current_year_month_row)
         if i == 0:
             rowFlag = 0
             # Go through each group of data
             for date, group in grouped:
-                new_sheet.write(row_count + rowFlag - current_year_month_row, 0, date)
+                new_sheet.write(row_count + rowFlag -
+                                current_year_month_row, 0, date)
                 for j in range(len(dataItemNoList)):
                     dataItemNo = dataItemNoList[j]
                     if group[group["dataItemNo"] == dataItemNo]["dataValue"].values and (not str(group[group["dataItemNo"] == dataItemNo]["dataValue"].values[0]) == 'nan'):
-                        new_sheet.write(row_count + rowFlag - current_year_month_row, j + 1, group[group["dataItemNo"] == dataItemNo]["dataValue"].values[0])
+                        new_sheet.write(row_count + rowFlag - current_year_month_row, j + 1,
+                                        group[group["dataItemNo"] == dataItemNo]["dataValue"].values[0])
                 rowFlag += 1
     # Save the new xls file
     new_workbook.save("纯苯数据项.xls")
 def queryDataListItemNos(date=None, token=None):
     from datetime import datetime, timedelta
     df = pd.read_excel('纯苯数据项.xls')
@@ -779,8 +755,10 @@ def queryDataListItemNos(date=None,token=None):
     first_day_of_month = current_date.replace(day=1)
     # Format as YYYYMMDD
     dateEnd = current_date.strftime('%Y%m%d')
+    # dateEnd = date.strftime('%Y%m%d')
     dateStart = first_day_of_month.strftime('%Y%m%d')
-    search_value = get_queryDataListItemNos_value(token, queryDataListItemNos_url, dataItemNoList, dateStart, dateEnd)
+    search_value = get_queryDataListItemNos_value(
+        token, queryDataListItemNos_url, dataItemNoList, dateStart, dateEnd)
     data_df = pd.DataFrame(search_value)
     data_df["dataDate"] = pd.to_datetime(data_df["dataDate"])
     data_df["dataDate"] = data_df["dataDate"].dt.strftime('%Y-%m-%d')
@@ -791,4 +769,15 @@ def queryDataListItemNos(date=None,token=None):
 if __name__ == "__main__":
     print('运行中ing')
     start()
+    # Run the forecast for a custom date range
+    # start_date = datetime(2025, 7, 6)
+    # end_date = datetime(2025, 7, 7)
+    # token = get_head_auth()
+    # token_push = get_head_push_auth()
+    # while start_date < end_date:
+    #     print(start_date.strftime('%Y%m%d'))
+    #     start(start_date, token, token_push)
+    #     time.sleep(2)
+    #     # start_1(start_date)
+    #     start_date += timedelta(days=1)
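Note that if this commented-out backfill block is ever enabled, it needs datetime, timedelta and time: import time appears among the removed in-function imports above and none of the three are in the new top-of-file import block shown in the first hunk. A self-contained version would therefore look roughly like the sketch below (same helpers as in this file; whether the deployed script keeps those imports elsewhere is an assumption):

    # Backfill sketch: re-run the daily pipeline for each date in a range
    from datetime import datetime, timedelta
    import time

    start_date = datetime(2025, 7, 6)
    end_date = datetime(2025, 7, 7)
    token = get_head_auth()
    token_push = get_head_push_auth()
    while start_date < end_date:
        print(start_date.strftime('%Y%m%d'))
        start(start_date, token, token_push)  # fetch data, retrain, and upload for this date
        time.sleep(2)
        start_date += timedelta(days=1)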