原油特征处理,删除手工数据
This commit is contained in:
parent
757bc5317a
commit
061d38a621
Binary file not shown.
Binary file not shown.
@ -20,8 +20,8 @@ plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
|
|||||||
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
|
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
|
||||||
|
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
# from config_jingbo import *
|
from config_jingbo import *
|
||||||
from config_juxiting import *
|
# from config_juxiting import *
|
||||||
from sklearn import metrics
|
from sklearn import metrics
|
||||||
from reportlab.pdfbase import pdfmetrics # 注册字体
|
from reportlab.pdfbase import pdfmetrics # 注册字体
|
||||||
from reportlab.pdfbase.ttfonts import TTFont # 字体类
|
from reportlab.pdfbase.ttfonts import TTFont # 字体类
|
||||||
@ -473,9 +473,8 @@ def datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y'
|
|||||||
values = values + [column, last_update_time,0]
|
values = values + [column, last_update_time,0]
|
||||||
# 计算特征数据值的时间差
|
# 计算特征数据值的时间差
|
||||||
try:
|
try:
|
||||||
time_diff = (df1[column].dropna().index.to_series().diff().mode()[0]).total_seconds() / 3600 / 24
|
|
||||||
|
|
||||||
# 计算预警日期
|
# 计算预警日期
|
||||||
|
time_diff = (df1[column].dropna().index.to_series().diff().mode()[0]).total_seconds() / 3600 / 24
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
early_warning_date = datetime.datetime.strptime(last_update_time, '%Y-%m-%d') + timedelta(days=time_diff)*2
|
early_warning_date = datetime.datetime.strptime(last_update_time, '%Y-%m-%d') + timedelta(days=time_diff)*2
|
||||||
early_warning_date = early_warning_date.strftime('%Y-%m-%d')
|
early_warning_date = early_warning_date.strftime('%Y-%m-%d')
|
||||||
@ -492,9 +491,9 @@ def datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y'
|
|||||||
|
|
||||||
|
|
||||||
logger.info(f'删除两月不更新特征前数据量:{df.shape}')
|
logger.info(f'删除两月不更新特征前数据量:{df.shape}')
|
||||||
# 去掉近最后数据对应的日期在两月以前的列,删除近2月的数据是常熟的列
|
# 去掉最后一条数据对应的日期在六个月(180天)以前的列,并删除近六个月(180天)内数据为常数的列
|
||||||
current_date = datetime.datetime.now()
|
current_date = datetime.datetime.now()
|
||||||
two_months_ago = current_date - timedelta(days=40)
|
two_months_ago = current_date - timedelta(days=180)
|
||||||
|
|
||||||
def check_column(col_name):
|
def check_column(col_name):
|
||||||
'''
|
'''
|
||||||
@ -507,8 +506,11 @@ def datachuli(df_zhibiaoshuju,df_zhibiaoliebiao,datecol='date',end_time='',y='y'
|
|||||||
df_check_column = df[['ds',col_name]]
|
df_check_column = df[['ds',col_name]]
|
||||||
df_check_column = df_check_column.dropna()
|
df_check_column = df_check_column.dropna()
|
||||||
if len(df_check_column) == 0:
|
if len(df_check_column) == 0:
|
||||||
|
print(f'空值列:{col_name}')
|
||||||
return True
|
return True
|
||||||
|
# 判断是不是常数列
|
||||||
if df_check_column[(df_check_column['ds']>= two_months_ago)].groupby(col_name).ngroups < 2:
|
if df_check_column[(df_check_column['ds']>= two_months_ago)].groupby(col_name).ngroups < 2:
|
||||||
|
print(f'180没有更新:{col_name}')
|
||||||
return True
|
return True
|
||||||
corresponding_date = df_check_column.iloc[-1]['ds']
|
corresponding_date = df_check_column.iloc[-1]['ds']
|
||||||
return corresponding_date < two_months_ago
|
return corresponding_date < two_months_ago
|
||||||
@ -1181,7 +1183,7 @@ class EtaReader():
|
|||||||
'''
|
'''
|
||||||
|
|
||||||
# 构建新的DataFrame df df1
|
# 构建新的DataFrame df df1
|
||||||
df = pd.DataFrame(columns=['指标分类', '指标名称', '指标id', '频度'])
|
df = pd.DataFrame(columns=['指标分类', '指标名称', '指标id', '频度','指标来源','来源id'])
|
||||||
df1 = pd.DataFrame(columns=['DataTime'])
|
df1 = pd.DataFrame(columns=['DataTime'])
|
||||||
|
|
||||||
|
|
||||||
@ -1225,16 +1227,27 @@ class EtaReader():
|
|||||||
EdbCode = i.get('EdbCode')
|
EdbCode = i.get('EdbCode')
|
||||||
EdbName = i.get('EdbName') # 指标名称,要保存到df2的指标名称列,df的指标名称列
|
EdbName = i.get('EdbName') # 指标名称,要保存到df2的指标名称列,df的指标名称列
|
||||||
Frequency = i.get('Frequency') # 频度,要保存到df的频度列
|
Frequency = i.get('Frequency') # 频度,要保存到df的频度列
|
||||||
|
SourceName = i.get('SourceName') # 来源名称,要保存到df的指标来源列
|
||||||
|
Source = i.get('Source') # 来源ID,要保存到df的来源id列
|
||||||
# 频度不是 日 或者 周的 跳过
|
# 频度不是 日 或者 周的 跳过
|
||||||
if Frequency not in ['日度','周度','日','周']:
|
if Frequency not in ['日度','周度','日','周']:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# 只保留手工数据中,名称带有 海运出口 海运进口
|
||||||
|
if Source == 9 and not ('海运出口' in EdbName or '海运进口' in EdbName):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 不要wind数据
|
||||||
|
if Source == 2:
|
||||||
|
continue
|
||||||
|
|
||||||
|
|
||||||
# 判断名称是否需要保存
|
# 判断名称是否需要保存
|
||||||
isSave = self.filter_yuanyou_data(ClassifyName,EdbName)
|
isSave = self.filter_yuanyou_data(ClassifyName,EdbName)
|
||||||
if isSave:
|
if isSave:
|
||||||
# 保存到df
|
# 保存到df
|
||||||
# 保存频度 指标名称 分类 指标id 到 df
|
# 保存频度 指标名称 分类 指标id 到 df
|
||||||
df2 = pd.DataFrame({'指标分类': ClassifyName, '指标名称': EdbName, '指标id': EdbCode, '频度': Frequency},index=[0])
|
df2 = pd.DataFrame({'指标分类': ClassifyName, '指标名称': EdbName, '指标id': EdbCode, '频度': Frequency,'指标来源':SourceName,'来源id':Source},index=[0])
|
||||||
|
|
||||||
# df = pd.merge(df, df2, how='outer')
|
# df = pd.merge(df, df2, how='outer')
|
||||||
df = pd.concat([df, df2])
|
df = pd.concat([df, df2])
|
||||||
@ -1255,7 +1268,7 @@ class EtaReader():
|
|||||||
itemname = item
|
itemname = item
|
||||||
|
|
||||||
df1 = self.edbcodegetdata(df1,item,itemname)
|
df1 = self.edbcodegetdata(df1,item,itemname)
|
||||||
df = pd.concat([df, pd.DataFrame({'指标分类': '其他', '指标名称': itemname, '指标id': item, '频度': '其他'},index=[0])])
|
df = pd.concat([df, pd.DataFrame({'指标分类': '其他', '指标名称': itemname, '指标id': item, '频度': '其他','指标来源':'其他','来源id':'其他'},index=[0])])
|
||||||
|
|
||||||
# 按时间排序
|
# 按时间排序
|
||||||
df1.sort_values('DataTime',inplace=True,ascending=False)
|
df1.sort_values('DataTime',inplace=True,ascending=False)
|
||||||
|
23378
logs/pricepredict.log.1
23378
logs/pricepredict.log.1
File diff suppressed because it is too large
Load Diff
12047
logs/pricepredict.log.2
12047
logs/pricepredict.log.2
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
11457
logs/pricepredict.log.4
11457
logs/pricepredict.log.4
File diff suppressed because one or more lines are too long
11888
logs/pricepredict.log.5
11888
logs/pricepredict.log.5
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user