线上数据维护

This commit is contained in:
workpc 2025-07-09 17:52:29 +08:00
parent 0ca7553951
commit 34770b83a3
5 changed files with 1037 additions and 1194 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,29 @@
from statsmodels.tools.eval_measures import mse, rmse
from pandas import Series, DataFrame
import cufflinks as cf
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pickle
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from xgboost import plot_importance, plot_tree
import xgboost as xgb
import plotly.graph_objects as go
import plotly.express as px
import statsmodels.api as sm
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso
import sklearn.datasets as datasets
from sklearn import preprocessing
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import __version__
import random
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import json
import xlrd
@ -38,69 +64,29 @@ read_file_path_name = "纯苯数据项.xls"
one_cols = []
two_cols = []
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import random
import time
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sklearn import preprocessing
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import sklearn.datasets as datasets
# 导入机器学习算法模型
from sklearn.linear_model import Lasso
from xgboost import XGBRegressor
import statsmodels.api as sm
try:
from keras.preprocessing.sequence import TimeseriesGenerator
except:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import plotly.express as px
import plotly.graph_objects as go
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse,rmse
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import warnings
import pickle
from sklearn.metrics import mean_squared_error
# 切割训练数据和样本数据
from sklearn.model_selection import train_test_split
# 用于模型评分
from sklearn.metrics import r2_score
le = preprocessing.LabelEncoder()
# print(__version__) # requires version >= 1.9.0
import cufflinks as cf
cf.go_offline()
random.seed(100)
# 数据获取
def get_head_auth():
@ -124,7 +110,8 @@ def get_data_value(token, dataItemNoList,date):
"funcOperation": "查询"
}
headers = {"Authorization": token}
search_res = requests.post(url=search_url, headers=headers, json=search_data, timeout=(3, 5))
search_res = requests.post(
url=search_url, headers=headers, json=search_data, timeout=(3, 5))
search_value = json.loads(search_res.text)["data"]
if search_value:
return search_value
@ -136,9 +123,6 @@ def get_data_value(token, dataItemNoList,date):
# xls文件处理
def write_xls(data, date):
# 创建一个Workbook对象
workbook = xlwt.Workbook()
@ -155,7 +139,6 @@ def write_xls(data,date):
workbook.save(get_cur_time(date)[0] + '.xls')
def getNow(date='', offset=0):
"""生成指定日期的两种格式字符串
Args:
@ -215,7 +198,8 @@ def get_cur_time(date=''):
def get_head_push_auth():
login_res = requests.post(url=login_push_url, json=login_push_data, timeout=(3, 5))
login_res = requests.post(
url=login_push_url, json=login_push_data, timeout=(3, 5))
text = json.loads(login_res.text)
if text["status"]:
token = text["data"]["accessToken"]
@ -225,7 +209,6 @@ def get_head_push_auth():
return None
def upload_data_to_system(token_push, date):
datavalue = forecast_price()
data = {
@ -242,18 +225,18 @@ def upload_data_to_system(token_push,date):
}
print(data)
headers = {"Authorization": token_push}
res = requests.post(url=upload_url, headers=headers, json=data, timeout=(3, 5))
res = requests.post(url=upload_url, headers=headers,
json=data, timeout=(3, 5))
print(res.text)
def forecast_price():
# df_test = pd.read_csv('定价模型数据收集0212.csv')
df_test = pd.read_excel('纯苯数据项.xls', sheet_name='Sheet1')
df_test.drop([0], inplace=True)
# df_test['Date']=pd.to_datetime(df_test['Date'], format='%m/%d/%Y',infer_datetime_format=True)
df_test['Date']=pd.to_datetime(df_test['Date'], format=r'%Y-%m-%d',infer_datetime_format=True)
df_test['Date'] = pd.to_datetime(
df_test['Date'], format=r'%Y-%m-%d', infer_datetime_format=True)
df_test_1 = df_test
df_test_1 = df_test_1.fillna(df_test.ffill())
@ -262,7 +245,6 @@ def forecast_price():
# 选择用于模型训练的列名称
col_for_training = df_test_1.columns
import joblib
Best_model_DalyLGPrice = joblib.load("日度价格预测_最佳模型.pkl")
# 最新的一天为最后一行的数据
@ -286,6 +268,8 @@ def forecast_price():
a = float(a)
a = round(a, 2)
return a
def optimize_Model():
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
@ -299,8 +283,8 @@ def optimize_Model():
df_test = pd.read_excel('纯苯数据项.xls')
df_test.drop([0], inplace=True)
# df_test['Date']=pd.to_datetime(df_test['Date'], format='%m/%d/%Y',infer_datetime_format=True)
df_test['Date']=pd.to_datetime(df_test['Date'], format='%Y-%m-%d',infer_datetime_format=True)
df_test['Date'] = pd.to_datetime(
df_test['Date'], format='%Y-%m-%d', infer_datetime_format=True)
# 将缺失值补为前一个或者后一个数值
df_test_1 = df_test
@ -311,7 +295,6 @@ def optimize_Model():
df_test_1 = df_test_1.drop(["Date"], axis=1)
df_test_1 = df_test_1.astype('float')
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
@ -360,7 +343,8 @@ def optimize_Model():
target = y
# 切割数据样本集合测试集
X_train,x_test,y_train,y_true = train_test_split(train,target,test_size=0.2,random_state=0)
X_train, x_test, y_train, y_true = train_test_split(
train, target, test_size=0.2, random_state=0)
# 模型缩写
Lasso = Lasso(random_state=0)
@ -391,14 +375,17 @@ def optimize_Model():
model_results1
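# Define plot_feature_importance, which sorts the feature importances and draws them as a bar plot. This part of the code does not need to be adjusted.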
def plot_feature_importance(importance, names, model_type):
feature_importance = np.array(importance)
feature_names = np.array(names)
data={'feature_names':feature_names,'feature_importance':feature_importance}
data = {'feature_names': feature_names,
'feature_importance': feature_importance}
fi_df = pd.DataFrame(data)
fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)
fi_df.sort_values(by=['feature_importance'],
ascending=False, inplace=True)
plt.figure(figsize=(10, 8))
sn.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
@ -408,7 +395,7 @@ def optimize_Model():
plt.ylabel('FEATURE NAMES')
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
## Xgboost 模型参数优化-初步
# Xgboost 模型参数优化-初步
# 参考: https://juejin.im/post/6844903661013827598
# When tuning, choose candidate parameter values at 1, 3 and 10 within the same order of magnitude (e.g. 1, 3, 10; or 0.1, 0.3, 1.0; or 0.01, 0.03, 0.10).
@ -450,8 +437,6 @@ def optimize_Model():
# results = model_results1.append(model_results2, ignore_index = False)
results = pd.concat([model_results1, model_results2], ignore_index=True)
import pickle
Pkl_Filename = "日度价格预测_最佳模型.pkl"
@ -460,8 +445,6 @@ def optimize_Model():
pickle.dump(grid_search_XGB, file)
def read_xls_data():
global one_cols, two_cols
# 打开 XLS 文件
@ -492,8 +475,6 @@ def read_xls_data():
# workbook.close()
def start(date=None, token=None, token_push=None):
read_xls_data()
if date == None:
@ -516,7 +497,8 @@ def start(date=None,token=None,token_push=None):
print(data_value)
dataItemNo_dataValue[data_value["dataItemNo"]] = ""
else:
dataItemNo_dataValue[data_value["dataItemNo"]] = data_value["dataValue"]
dataItemNo_dataValue[data_value["dataItemNo"]
] = data_value["dataValue"]
for value in one_cols:
if value in dataItemNo_dataValue:
@ -526,7 +508,7 @@ def start(date=None,token=None,token_push=None):
save_xls(append_rows)
# 获取当月的数据写入到指定文件,如果是补充数据,不需要执行
queryDataListItemNos()
queryDataListItemNos(date=date)
# 模型训练
optimize_Model()
# 上传预测数据
@ -543,7 +525,6 @@ def start_1(date=None):
if not token:
return
datas = get_data_value(token, one_cols, date=date)
# if not datas:
# return
@ -556,7 +537,8 @@ def start_1(date=None):
print(data_value)
dataItemNo_dataValue[data_value["dataItemNo"]] = ""
else:
dataItemNo_dataValue[data_value["dataItemNo"]] = data_value["dataValue"]
dataItemNo_dataValue[data_value["dataItemNo"]
] = data_value["dataValue"]
for value in one_cols:
if value in dataItemNo_dataValue:
@ -565,10 +547,10 @@ def start_1(date=None):
append_rows.append("")
save_xls_1(append_rows)
# data_list.append(three_cols)
# write_xls(data_list)
def save_xls_1(append_rows):
# 打开xls文件
@ -613,8 +595,6 @@ def save_xls_1(append_rows):
new_workbook.save("纯苯数据项.xls")
def check_data(dataItemNo):
token = get_head_auth()
if not token:
@ -669,8 +649,6 @@ def save_xls(append_rows):
new_workbook.save("纯苯数据项.xls")
def get_queryDataListItemNos_value(token, url, dataItemNoList, dateStart, dateEnd):
search_data = {
@ -684,7 +662,8 @@ def get_queryDataListItemNos_value(token, url, dataItemNoList, dateStart, dateEn
}
headers = {"Authorization": token}
search_res = requests.post(url=url, headers=headers, json=search_data, timeout=(3, 5))
search_res = requests.post(
url=url, headers=headers, json=search_data, timeout=(3, 5))
search_value = json.loads(search_res.text)["data"]
if search_value:
return search_value
@ -692,7 +671,6 @@ def get_queryDataListItemNos_value(token, url, dataItemNoList, dateStart, dateEn
return None
def save_queryDataListItemNos_xls(data_df, dataItemNoList):
from datetime import datetime, timedelta
current_year_month = datetime.now().strftime('%Y-%m')
@ -727,7 +705,6 @@ def save_queryDataListItemNos_xls(data_df,dataItemNoList):
# 创建sheet
new_sheet = new_workbook.add_sheet(sheet_names[i])
current_year_month_row = 0
# 将原有的数据写入新的sheet
for row in range(row_count):
@ -739,28 +716,27 @@ def save_queryDataListItemNos_xls(data_df,dataItemNoList):
break
new_sheet.write(row, col, data[row][col])
# print("current_year_month_row",current_year_month_row)
if i == 0:
rowFlag = 0
# 查看每组数据
for date, group in grouped:
new_sheet.write(row_count + rowFlag - current_year_month_row, 0, date)
new_sheet.write(row_count + rowFlag -
current_year_month_row, 0, date)
for j in range(len(dataItemNoList)):
dataItemNo = dataItemNoList[j]
if group[group["dataItemNo"] == dataItemNo]["dataValue"].values and (not str(group[group["dataItemNo"] == dataItemNo]["dataValue"].values[0]) == 'nan'):
new_sheet.write(row_count + rowFlag - current_year_month_row, j + 1, group[group["dataItemNo"] == dataItemNo]["dataValue"].values[0])
new_sheet.write(row_count + rowFlag - current_year_month_row, j + 1,
group[group["dataItemNo"] == dataItemNo]["dataValue"].values[0])
rowFlag += 1
# 保存新的xls文件
new_workbook.save("纯苯数据项.xls")
def queryDataListItemNos(date=None, token=None):
from datetime import datetime, timedelta
df = pd.read_excel('纯苯数据项.xls')
@ -779,8 +755,10 @@ def queryDataListItemNos(date=None,token=None):
first_day_of_month = current_date.replace(day=1)
# 格式化为 YYYYMMDD 格式
dateEnd = current_date.strftime('%Y%m%d')
# dateEnd = date.strftime('%Y%m%d')
dateStart = first_day_of_month.strftime('%Y%m%d')
search_value = get_queryDataListItemNos_value(token, queryDataListItemNos_url, dataItemNoList, dateStart, dateEnd)
search_value = get_queryDataListItemNos_value(
token, queryDataListItemNos_url, dataItemNoList, dateStart, dateEnd)
data_df = pd.DataFrame(search_value)
data_df["dataDate"] = pd.to_datetime(data_df["dataDate"])
data_df["dataDate"] = data_df["dataDate"].dt.strftime('%Y-%m-%d')
@ -791,4 +769,15 @@ def queryDataListItemNos(date=None,token=None):
# Script entry point: print a status message, then call start() with its default arguments.
if __name__ == "__main__":
print('运行中ing')
start()
# Disabled example: run the forecast for a custom date range, one day at a time.
# NOTE(review): the example below uses datetime, timedelta and time — confirm they are imported at module level before enabling it.
# start_date = datetime(2025, 7, 6)
# end_date = datetime(2025, 7, 7)
# token = get_head_auth()
# token_push = get_head_push_auth()
# while start_date < end_date:
# print(start_date.strftime('%Y%m%d'))
# start(start_date, token, token_push)
# time.sleep(2)
# # start_1(start_date)
# start_date += timedelta(days=1)