1
比赛介绍
本次竞赛聚焦解决大规模生产系统中的硬盘故障预测问题,需解决数据噪声、正负样本不均衡等技术问题,同时也要保障预测算法长期稳定有效。
给定一段连续采集(天粒度)的 硬盘状态监控数据(Self-Monitoring, Analysis, and Reporting Technology; often written as SMART)以及故障标签数据,参赛者需要自己提出方案,按天粒度判断每块硬盘是否会在未来30日内发生故障。例如,可以将预测故障问题转化为传统的二分类问题,通过分类模型来判断哪些硬盘会坏;或者可以转化为排序问题,通过Learning to rank的方式判断硬盘的损坏严重程度等。
2
比赛baseline
# 因为题目数据量比较大,所以先对原始数据进行处理,方便以后调试
import numpy as np
import pandas as pd
import gc
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost as cbt
import math
from lightgbm.sklearn import LGBMClassifier
from collections import Counter
import time
from scipy.stats import kurtosis,iqr
from scipy import ptp
from tqdm import tqdm
import datetime
from sklearn.metrics import accuracy_score, roc_auc_score,log_loss,f1_score
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from datetime import datetime,timedelta
import warnings
import os
import joblib
pd.options.display.max_columns = None
pd.options.display.max_rows = None
test = pd.read_csv('./disk_sample_smart_log_test_a.csv/disk_sample_smart_log_test_a.csv')
tag = pd.read_csv('./disk_sample_fault_tag.csv/tag.csv')
test['dt'] = test['dt'].apply(lambda x:''.join(str(x)[0:4] +'-'+ str(x)[4:6] +'-'+ str(x)[6:]))
test['dt'] = pd.to_datetime(test['dt'])
tag['fault_time'] = pd.to_datetime(tag['fault_time'])
###tag表里面有的硬盘同一天发生几种故障
tag['tag'] = tag['tag'].astype(str)
tag = tag.groupby(['serial_number','fault_time','model'])['tag'].apply(lambda x :'|'.join(x)).reset_index()
test = test.sort_values(['serial_number','dt'])
test = test.drop_duplicates().reset_index(drop=True)
###去掉全为空值和nunique为1的特征
drop_list =[]
for i in tqdm([col for col in test.columns if col not in ['manufacturer','model']]):
if (test[i].nunique() == 1)&(test[i].isnull().sum() == 0):
drop_list.append(i)
df= pd.DataFrame()
df['fea'] = test.isnull().sum().index
df['isnull_sum'] = test.isnull().sum().values
fea_list = list(set(df.loc[df.isnull_sum != test.shape[0]]['fea']) - set(drop_list))
test = test[fea_list]
###去掉无用特征后给每个样本打上label,下次直接读取
##这里只处理了18年456月份的数据,其他同理,但是18年7月份的数据因为tag的时间也只到了七月份不好直接打label
def get_label(df):
df = df[fea_list]
df['dt'] = df['dt'].apply(lambda x:''.join(str(x)[0:4] +'-'+ str(x)[4:6] +'-'+ str(x)[6:]))
df['dt'] = pd.to_datetime(df['dt'])
df = df.merge(tag[['serial_number','model','fault_time']],how = 'left',on =['serial_number','model'])
df['diff_day'] = (df['fault_time'] - df['dt']).dt.days
df['label'] = 0
df.loc[(df['diff_day']>=0)&(df['diff_day']<=30),'label'] = 1
return df
train_2018_6 = pd.read_csv('./disk_sample_smart_log_2018_Q2/disk_sample_smart_log_201806.csv')
train_2018_6 = get_label(train_2018_6)
joblib.dump(train_2018_6, './disk_sample_smart_log_2018_Q2/train_2018_6.jl.z')
train_2018_5 = pd.read_csv('./disk_sample_smart_log_2018_Q2/disk_sample_smart_log_201805.csv')
train_2018_5 = get_label(train_2018_5)
joblib.dump(train_2018_5, './disk_sample_smart_log_2018_Q2/train_2018_5.jl.z')
train_2018_4 = pd.read_csv('./disk_sample_smart_log_2018_Q2/disk_sample_smart_log_201806.csv')
train_2018_4 = get_label(train_2018_6)
joblib.dump(train_2018_4, './disk_sample_smart_log_2018_Q2/train_2018_6.jl.z')
###这里只搞了三个月的数据其他一样处理
###提取出每个硬盘最早出现的时间日期,内存不够的话,只能一个一个读取
train_2018_7 = joblib.load('./disk_sample_smart_log_2018_Q2/train_2018_7.jl.z')
train_2018_6 = joblib.load('./disk_sample_smart_log_2018_Q2/train_2018_6.jl.z')
train_2018_5 = joblib.load('./disk_sample_smart_log_2018_Q2/train_2018_5.jl.z')
train_2018_4 = joblib.load('./disk_sample_smart_log_2018_Q2/train_2018_4.jl.z')
train_2018_3 = joblib.load('./disk_sample_smart_log_2018_Q1/train_2018_3.jl.z')
train_2018_2 = joblib.load('./disk_sample_smart_log_2018_Q1/train_2018_2.jl.z')
train_2018_1 = joblib.load('./disk_sample_smart_log_2018_Q1/train_2018_1.jl.z')
train_2017_7 = joblib.load('./disk_sample_smart_log_2017/train_2017_7.jl.z')
train_2017_8 = joblib.load('./disk_sample_smart_log_2017/train_2017_8.jl.z')
train_2017_9 = joblib.load('./disk_sample_smart_log_2017/train_2017_9.jl.z')
train_2017_10 = joblib.load('./disk_sample_smart_log_2017/train_2017_10.jl.z')
train_2017_11 = joblib.load('./disk_sample_smart_log_2017/train_2017_11.jl.z')
train_2017_12 = joblib.load('./disk_sample_smart_log_2017/train_2017_12.jl.z')
serial_2017_7 = train_2017_7[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial_2017_8 = train_2017_8[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial_2017_9 = train_2017_9[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial_2017_10 = train_2017_10[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial_2017_11 = train_2017_11[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial_2017_12 = train_2017_12[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial_2018_1 = train_2018_1[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial_2018_2 = train_2018_2[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial_2018_3 = train_2018_3[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial_2018_4 = train_2018_4[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial_2018_5 = train_2018_5[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial_2018_6 = train_2018_6[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial_2018_7 = train_2018_7[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial_2018_8 = test[['serial_number','dt']].sort_values('dt').drop_duplicates('serial_number')
serial = pd.concat((serial_2017_7,serial_2017_8,serial_2017_9,serial_2017_10,serial_2017_11,serial_2017_12),axis = 0)
serial = pd.concat((serial,serial_2018_1,serial_2018_2,serial_2018_3,serial_2018_4,serial_2018_5,serial_2018_6,serial_2018_7,serial_2018_8),axis = 0)
serial = serial.sort_values('dt').drop_duplicates('serial_number').reset_index(drop=True)
serial.columns = ['serial_number','dt_first']
serial.dt_first = pd.to_datetime(serial.dt_first)
serial.to_csv("serial.csv",index=False
本题是给定一段连续采集(天粒度)的硬盘状态监控数据(Self-Monitoring, Analysis, and Reporting Technology; often written as SMART)以及故障标签数据,参赛者需要自己提出方案,按天粒度判断每块硬盘是否会在未来30日内发生故障。例如,可以将预测故障问题转化为传统的二分类问题,通过分类模型来判断哪些硬盘会坏;因为七月份的数据不知道label,所以线下验证的时候也必须模拟线上隔一个月用四月份训练,六月份验证。
import numpy as np
import pandas as pd
import gc
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost as cbt
import math
from lightgbm.sklearn import LGBMClassifier
from collections import Counter
import time
from scipy.stats import kurtosis,iqr
from scipy import ptp
from tqdm import tqdm
import datetime
from sklearn.metrics import accuracy_score, roc_auc_score,log_loss,f1_score
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from datetime import datetime,timedelta
import warnings
import os
import joblib
pd.options.display.max_columns = None
pd.options.display.max_rows = None
test = pd.read_csv('./disk_sample_smart_log_test_a.csv/disk_sample_smart_log_test_a.csv')
tag = pd.read_csv('./disk_sample_fault_tag.csv/tag.csv')
test['dt'] = test['dt'].apply(lambda x:''.join(str(x)[0:4] +'-'+ str(x)[4:6] +'-'+ str(x)[6:]))
test['dt'] = pd.to_datetime(test['dt'])
tag['fault_time'] = pd.to_datetime(tag['fault_time'])
test = test.sort_values(['serial_number','dt'])
test = test.drop_duplicates().reset_index(drop=True)
sub = test[['manufacturer','model','serial_number','dt']]
serial = pd.read_csv('serial.csv')
serial.columns = ['serial_number','dt_first']
serial.dt_first = pd.to_datetime(serial.dt_first)
tag['tag'] = tag['tag'].astype(str)
tag = tag.groupby(['serial_number','fault_time','model'])['tag'].apply(lambda x :'|'.join(x)).reset_index()
tag.columns = ['serial_number','fault_time_1','model','tag']
tag.model = tag.model.map({1:2,2:1})
map_dict = dict(zip(tag['tag'].unique(), range(tag['tag'].nunique())))
tag['tag'] = tag['tag'].map(map_dict).fillna(-1).astype('int32')
###用到的特征
feature_name = [i for i in test.columns if i not in ['dt','manufacturer']] + ['days','days_1','days_2','tag']
###验证
train_x = train_2018_4
del train_2018_4
train_x = train_x.append(train_2018_3).reset_index(drop=True)
del train_2018_3
train_x = train_x.merge(serial,how = 'left',on = 'serial_number')
###硬盘的使用时常
train_x['days'] = (train_x['dt'] - train_x['dt_first']).dt.days
train_x = train_x.merge(tag,how = 'left',on = ['serial_number','model'])
###当前时间与另一个model故障的时间差,
train_x['days_1'] = (train_x['dt'] - train_x['fault_time_1']).dt.days
train_x.loc[train_x.days_1 <= 0,'tag'] = None
train_x.loc[train_x.days_1 <= 0,'days_1'] = None
###同一硬盘第一次使用到开始故障的时间
train_x['days_2'] = (train_x['fault_time_1'] - train_x['dt_first']).dt.days
train_x.loc[train_x.fault_time_1 >= train_x.dt,'days_2'] = None
train_x['serial_number'] = train_x['serial_number'].apply(lambda x:int(x.split('_')[1]))
train_y = train_x.label.values
train_x = train_x[feature_name]
gc.collect()
val_x = train_2018_6
del train_2018_6
val_x = val_x.merge(serial,how = 'left',on = 'serial_number')
val_x['days'] = (val_x['dt'] - val_x['dt_first']).dt.days
val_x = val_x.merge(tag,how = 'left',on = ['serial_number','model'])
val_x['days_1'] = (val_x['dt'] - val_x['fault_time_1']).dt.days
val_x.loc[val_x.days_1 <= 0,'tag'] = None
val_x.loc[val_x.days_1 <= 0,'days_1'] = None
val_x['days_2'] = (val_x['fault_time_1'] - val_x['dt_first']).dt.days
val_x.loc[val_x.fault_time_1 >= val_x.dt,'days_2'] = None
val_x['serial_number'] = val_x['serial_number'].apply(lambda x:int(x.split('_')[1]))
val_y = val_x.label.values
val_x = val_x[feature_name]
gc.collect()
gc.collect()
clf = LGBMClassifier(
learning_rate=0.001,
n_estimators=10000,
num_leaves=127,
subsample=0.8,
colsample_bytree=0.8,
random_state=2019,
is_unbalenced = 'True',
metric=None
)
print('************** training **************')
print(train_x.shape,val_x.shape)
clf.fit(
train_x, train_y,
eval_set=[(val_x, val_y)],
eval_metric='auc',
early_stopping_rounds=50,
verbose=100
)
##根据验证的轮数训练
train_df = train_2018_6.append(train_2018_5).reset_index(drop=True)
del train_2018_6
del train_2018_5
train_df = train_df.append(train_2018_4).reset_index(drop=True)
del train_2018_4
train_df = train_df.merge(serial,how = 'left',on = 'serial_number')
train_df['days'] = (train_df['dt'] - train_df['dt_first']).dt.days
train_df = train_df.merge(tag,how = 'left',on = ['serial_number','model'])
train_df['days_1'] = (train_df['dt'] - train_df['fault_time_1']).dt.days
train_df.loc[train_df.days_1 <= 0,'tag'] = None
train_df.loc[train_df.days_1 <= 0,'days_1'] = None
train_df['days_2'] = (train_df['fault_time_1'] - train_df['dt_first']).dt.days
train_df.loc[train_df.fault_time_1 >= train_df.dt,'days_2'] = None
train_df['serial_number'] = train_df['serial_number'].apply(lambda x:int(x.split('_')[1]))
labels = train_df.label.values
train_df = train_df[feature_name]
gc.collect()
test = test.merge(serial,how = 'left',on = 'serial_number')
test['days'] = (test['dt'] - test['dt_first']).dt.days
test = test.merge(tag,how = 'left',on = ['serial_number','model'])
test['days_1'] = (test['dt'] - test['fault_time_1']).dt.days
test.loc[test.days_1 <= 0,'tag'] = None
test.loc[test.days_1 <= 0,'days_1'] = None
test['days_2'] = (test['fault_time_1'] - test['dt_first']).dt.days
test.loc[test.fault_time_1 >= test.dt,'days_2'] = None
test['serial_number'] = test['serial_number'].apply(lambda x:int(x.split('_')[1]))
test_x = test[feature_name]
del test
gc.collect()
clf = LGBMClassifier(
learning_rate=0.001,
n_estimators=100,
num_leaves=127,
subsample=0.8,
colsample_bytree=0.8,
random_state=2019,
# is_unbalenced = 'True',
metric=None
)
print('************** training **************')
print(train_df.shape,test_x.shape)
clf.fit(
train_df, labels,
eval_set=[(train_df, labels)],
eval_metric='auc',
early_stopping_rounds=10,
verbose=10
)
sub['p'] = clf.predict_proba(test_x)[:,1]
sub['label'] = sub['p'].rank()
sub['label']= (sub['label']>=sub.shape[0] * 0.996).astype(int)
submit = sub.loc[sub.label == 1]
###这里我取故障率最大的一天进行提交,线上测了几个阈值,100个左右好像比较好。。。。
submit = submit.sort_values('p',ascending=False)
submit = submit.drop_duplicates(['serial_number','model'])
submit[['manufacturer','model','serial_number','dt']].to_csv("sub.csv",index=False,header = None)
submit.shape
一些其他思路 可以试试多分类,label分为1,2....30天,超过30,也可也试试回归做。
点击阅读原文,报名链接。