
LightGBM usage example

LightGBM

Train and evaluate a LightGBM regressor on the Iris dataset, then tune its hyperparameters with a grid search.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Load the data
iris = load_iris()

# Basic data preparation: hold out 20% as a test set
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)

# Train the model (LightGBM >= 4.0 passes early stopping via callbacks, not early_stopping_rounds)
gbm = lgb.LGBMRegressor(objective='regression', learning_rate=0.05, n_estimators=20)
gbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='l1',
        callbacks=[lgb.early_stopping(stopping_rounds=5)])
print(gbm.score(x_test, y_test))
print(mean_squared_error(y_test, gbm.predict(x_test)))

# Tune with a grid search
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {"learning_rate": [0.01, 0.1, 1], "n_estimators": [20, 40, 60, 80]}
gbm = GridSearchCV(estimator, param_grid=param_grid, cv=5)
gbm.fit(x_train, y_train)
# Print the best parameters
print(gbm.best_params_)
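# Note: instead of hard-coding the tuned learning rate in the retraining step
# below, the refit best model can be taken straight from the fitted
# GridSearchCV -- a minimal sketch using the standard best_estimator_ attribute:
best_gbm = gbm.best_estimator_
print(best_gbm.score(x_test, y_test))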

# Retrain with the best learning rate found above
gbm = lgb.LGBMRegressor(objective='regression', learning_rate=0.1, n_estimators=20)
gbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='l1',
        callbacks=[lgb.early_stopping(stopping_rounds=3)])
print(gbm.score(x_test, y_test))

PUBG case study: Random Forest + LightGBM

Machine-learning models (a random forest and LightGBM) are trained and evaluated on the PUBG finish-placement prediction data, with GridSearchCV used for hyperparameter tuning.
For preprocessing, the data goes through basic cleaning and feature engineering: missing-value handling, outlier removal, and categorical encoding.
For training, a random forest is fitted first as a baseline and for feature-importance analysis; a LightGBM model is then tuned, via GridSearchCV and validation curves, to find the best parameter combination.

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Load the data
train = pd.read_csv("path_to_data.csv")

# Basic data preparation
# Missing-value handling
# Check for missing values
print(np.any(train.isnull()))
# The "winPlacePerc" column turns out to contain missing values
# Locate the rows with missing values
print(train[train['winPlacePerc'].isnull()])
# Drop the rows with missing values
train.drop(train[train['winPlacePerc'].isnull()].index, inplace=True)
# Number of players who joined each match
count = train.groupby('matchId')['matchId'].transform('count')
train['playersJoined'] = count
# Total number of player records
print(count.count())
# Sort by players joined per match, ascending
print(train['playersJoined'].sort_values())
# Plot how many players each match starts with
plt.figure(figsize=(20, 10))
sns.countplot(x=train['playersJoined'])
plt.title('playersJoined')
plt.grid()
plt.show()
# Plot only matches with at least 75 players
plt.figure(figsize=(20, 10))
sns.countplot(x=train[train['playersJoined'] >= 75]['playersJoined'])
plt.title('playersJoined')
plt.grid()
plt.show()
# Normalize match-size-dependent features: scale by (100 - playersJoined)/100 + 1,
# e.g. a kill in a 90-player match is weighted 1.1x
train["killsNorm"] = train["kills"] * ((100 - train["playersJoined"]) / 100 + 1)
train["damageDealtNorm"] = train["damageDealt"] * ((100 - train["playersJoined"]) / 100 + 1)
train["maxPlaceNorm"] = train["maxPlace"] * ((100 - train["playersJoined"]) / 100 + 1)
train["matchDurationNorm"] = train["matchDuration"] * ((100 - train["playersJoined"]) / 100 + 1)
# Compare the raw and normalized feature values
to_show = ['Id', 'kills', 'killsNorm', 'damageDealt', 'damageDealtNorm', 'maxPlace', 'maxPlaceNorm', 'matchDuration', 'matchDurationNorm']
print(train[to_show][0:11])
# Combine related features
train["healsandboosts"] = train["heals"] + train["boosts"]

# Outlier handling
# Drop players who have kills but never moved
train["totalDistance"] = train["rideDistance"] + train["walkDistance"] + train["swimDistance"]
train["killwithoutMoving"] = (train["kills"] > 0) & (train["totalDistance"] == 0)
train.drop(train[train["killwithoutMoving"]].index, inplace=True)
# Drop anomalous road-kill counts
train.drop(train[train["roadKills"] > 10].index, inplace=True)
# Drop rows with more than 30 kills in a single match
train.drop(train[train["kills"] > 30].index, inplace=True)
# Drop rows with a suspicious headshot rate
# Create the headshot-rate feature
train['headshot_rate'] = train['headshotKills'] / train['kills']
train['headshot_rate'] = train['headshot_rate'].fillna(0)
train.drop(train[(train['headshot_rate'] == 1) & (train['kills'] > 9)].index, inplace=True)
# Drop longest-kill distances of 1 km or more
train.drop(train[train['longestKill'] >= 1000].index, inplace=True)
# Drop anomalous movement distances
# Walking distance
train.drop(train[train["walkDistance"] >= 10000].index, inplace=True)
# Riding distance
train.drop(train[train["rideDistance"] >= 20000].index, inplace=True)
# Swimming distance
train.drop(train[train["swimDistance"] >= 2000].index, inplace=True)
# Drop anomalous weapons-acquired counts
train.drop(train[train["weaponsAcquired"] >= 80].index, inplace=True)
# Drop anomalous heal-item usage
train.drop(train[train["heals"] >= 80].index, inplace=True)
# Categorical data handling
# One-hot encode the match type
train = pd.get_dummies(train, columns=["matchType"])
matchType_encoding = train.filter(regex="matchType")
# Encode groupId and matchId as integer category codes
train["groupId"] = train["groupId"].astype("category")
train["groupId_cat"] = train["groupId"].cat.codes
train["matchId"] = train["matchId"].astype("category")
train["matchId_cat"] = train["matchId"].cat.codes
train.drop(["groupId", "matchId"], axis=1, inplace=True)
# Subsample the data
# Randomly sample 1,000,000 rows for training
df_sample = train.sample(1000000)
# Separate features and target
df = df_sample.drop(["winPlacePerc", "Id"], axis=1)
y = df_sample["winPlacePerc"]
# Split into training and validation sets
x_train, x_valid, y_train, y_valid = train_test_split(df, y, test_size=0.2)

# Machine learning (model training and evaluation)
# First pass: train a random forest
m1 = RandomForestRegressor(n_estimators=40,
                           min_samples_leaf=3,
                           max_features='sqrt',
                           n_jobs=-1)
m1.fit(x_train, y_train)
# Report the results
y_pre = m1.predict(x_valid)
print(m1.score(x_valid, y_valid))
print(mean_absolute_error(y_valid, y_pre))

# Relate each feature's importance to the predicted placement
imp_df = pd.DataFrame({"cols": df.columns, "imp": m1.feature_importances_})
# Sort by importance, descending
imp_df = imp_df.sort_values("imp", ascending=False)
# Plot the 20 most important features
imp_df[:20].plot(x="cols", y="imp", figsize=(20, 8), kind="barh")
plt.show()
# Keep only the more important features
to_keep = imp_df[imp_df.imp > 0.005].cols
df_keep = df[to_keep]

# Re-split training and validation sets on the reduced feature set
x_train, x_valid, y_train, y_valid = train_test_split(df_keep, y, test_size=0.2)
# Retrain the random forest
m2 = RandomForestRegressor(n_estimators=40,
                           min_samples_leaf=3,
                           max_features='sqrt',
                           n_jobs=-1)
m2.fit(x_train, y_train)
# Report the results
y_pre = m2.predict(x_valid)
print(m2.score(x_valid, y_valid))
print(mean_absolute_error(y_valid, y_pre))

# Train with LightGBM
# Split training and validation sets
x_train, x_valid, y_train, y_valid = train_test_split(df, y, test_size=0.2)
# First attempt
gbm = lgb.LGBMRegressor(objective="regression",
                        num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
gbm.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], eval_metric="l1",
        callbacks=[lgb.early_stopping(stopping_rounds=5)])
# Report the results, predicting with the best early-stopped iteration
y_pre = gbm.predict(x_valid, num_iteration=gbm.best_iteration_)
print(mean_absolute_error(y_valid, y_pre))

# Second pass: tune with a grid search
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
    "learning_rate": [0.01, 0.1, 1],
    "n_estimators": [40, 60, 80, 100, 200, 300]
}
gbm = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1)
gbm.fit(x_train, y_train)
# Report the results
y_pre = gbm.predict(x_valid)
print(mean_absolute_error(y_valid, y_pre))
# Print the best parameters
print(gbm.best_params_)
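# To see how every combination in the grid performed rather than only the
# winner, the fitted GridSearchCV also exposes cv_results_; a minimal sketch
# of inspecting it (the column names are standard scikit-learn output):
cv_results = pd.DataFrame(gbm.cv_results_)
print(cv_results[["param_learning_rate", "param_n_estimators",
                  "mean_test_score", "rank_test_score"]])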

# Third pass: sweep one parameter at a time
# Find the best n_estimators
scores = []
n_estimators = [100, 300, 500, 800]
for nes in n_estimators:
    lgbm = lgb.LGBMRegressor(boosting_type="gbdt",
                             num_leaves=31,
                             max_depth=5,
                             learning_rate=0.1,
                             n_estimators=nes,
                             min_child_samples=20,
                             n_jobs=-1)
    lgbm.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], eval_metric="l1",
             callbacks=[lgb.early_stopping(stopping_rounds=5)])
    y_pre = lgbm.predict(x_valid)
    mae = mean_absolute_error(y_valid, y_pre)
    scores.append(mae)
    print("MAE for this run:\n", mae)

# Plot the results
plt.plot(n_estimators, scores, 'o-')
plt.ylabel("mae")
plt.xlabel("n_estimators")
plt.show()
print("best n_estimators: {}".format(n_estimators[np.argmin(scores)]))

# Find the best max_depth
scores = []
max_depth = [3, 5, 7, 9, 11]
for md in max_depth:
    lgbm = lgb.LGBMRegressor(boosting_type="gbdt",
                             num_leaves=31,
                             max_depth=md,
                             learning_rate=0.1,
                             n_estimators=500,
                             min_child_samples=20,
                             n_jobs=-1)
    lgbm.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], eval_metric="l1",
             callbacks=[lgb.early_stopping(stopping_rounds=5)])
    y_pre = lgbm.predict(x_valid)
    mae = mean_absolute_error(y_valid, y_pre)
    scores.append(mae)
    print("MAE for this run:\n", mae)

# Plot the results
plt.plot(max_depth, scores, 'o-')
plt.ylabel("mae")
plt.xlabel("max_depth")
plt.show()
print("best max_depth: {}".format(max_depth[np.argmin(scores)]))
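Putting the sweeps together, a final model can be refit on the tuned settings. A minimal sketch, assuming the curves above favor n_estimators=500 and max_depth=5 (the winning values depend on the run):

# Refit with the tuned hyperparameters (values assumed from the sweeps above)
final = lgb.LGBMRegressor(boosting_type="gbdt",
                          num_leaves=31,
                          max_depth=5,
                          learning_rate=0.1,
                          n_estimators=500,
                          min_child_samples=20,
                          n_jobs=-1)
final.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], eval_metric="l1",
          callbacks=[lgb.early_stopping(stopping_rounds=5)])
print(mean_absolute_error(y_valid, final.predict(x_valid)))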