Bootstrap

20200317_决策树预测贷款申请

使用决策树,预测贷款申请

import pandas as pd
#  忽略弹出的warnings
import warnings
warnings.filterwarnings('ignore')  
text=pd.read_excel('data/LoanStats_securev1_2019Q4.xlsx')
text.head()
idloan_amntfunded_amntfunded_amnt_invtermint_rateinstallmentgradesub_gradeemp_title...num_tl_90g_dpd_24mnum_tl_op_past_12mpct_tl_nvr_dlqpercent_bc_gt_75pub_rec_bankruptciestax_lienstot_hi_cred_limtotal_bal_ex_morttotal_bc_limittotal_il_high_credit_limit
016402747320000200002000036 months0.1240668.12BB4NaN...02100.050.0106080042566520040000.0
116398441316500165001650060 months0.1033353.27BB1NaN...00100.00.000223390409134050039890.0
216419322575007500750036 months0.1240250.55BB4Rn...0754.516.7001384681021224770090768.0
316294873619000190001897536 months0.0646581.99AA1Tech Ops Analyst...00100.040.000184034284613840035000.0
416416168610000100001000036 months0.2055374.45DD2Planner...02100.016.70063937316151624600172818.0

5 rows × 114 columns

目标变量

text['loan_status'].value_counts()
Current               122625
Fully Paid              3539
In Grace Period         1079
Late (31-120 days)       509
Late (16-30 days)        304
Charged Off               80
n                          1
Name: loan_status, dtype: int64
#0为已经完成的
def function(x):
    if 'Current' in x:
        return 0
    elif 'Fully Paid' in x:
        return 0
    else:
        return 1
text['loan_status']=text.apply(lambda x:function(x['loan_status']),axis=1)
text['loan_status'].value_counts()
0    126164
1      1973
Name: loan_status, dtype: int64
pos_trainDf = text[text['loan_status'] == 1]
neg_trainDf = text[text['loan_status'] == 0].sample(n=4000, random_state=2018)
text = pd.concat([pos_trainDf, neg_trainDf], axis=0).sample(frac=1.0,random_state=2018)
text.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5973 entries, 18821 to 92872
Columns: 114 entries, id to total_il_high_credit_limit
dtypes: datetime64[ns](1), float64(36), int64(50), object(27)
memory usage: 5.2+ MB

缺失值查看

check_null = text.isnull().sum(axis=0).sort_values(ascending=False)/float(len(text)) #查看缺失值比例
print(check_null[check_null >0.2]) # 查看缺失比例大于20%的属性。
desc                              0.999833
mths_since_last_record            0.899046
verification_status_joint         0.880629
annual_inc_joint                  0.864055
dti_joint                         0.864055
mths_since_recent_bc_dlq          0.794408
mths_since_last_major_derog       0.769965
mths_since_recent_revol_delinq    0.703164
mths_since_last_delinq            0.548468
dtype: float64
thresh_count = len(text)*0.4 # 设定阀值
data = text.dropna(thresh=thresh_count, axis=1 ) #若某一列数据缺失的数量超过阀值就会被删除
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5973 entries, 18821 to 92872
Columns: 106 entries, id to total_il_high_credit_limit
dtypes: datetime64[ns](1), float64(30), int64(50), object(25)
memory usage: 4.9+ MB

删除无意义的列

sub_grade:与Grade的信息重复

emp_title :缺失值较多,同时不能反映借款人收入或资产的真实情况

zip_code:地址邮编,邮编显示不全,没有意义

addr_state:申请地址所属州,不能反映借款人的偿债能力

last_credit_pull_d :LendingClub平台最近一个提供贷款的时间,没有意义

policy_code : 变量信息全为1

pymnt_plan 基本是n

title: title与purpose的信息重复,同时title的分类信息更加离散

next_pymnt_d : 下一个付款时间,没有意义

policy_code : 没有意义

collection_recovery_fee: 全为0,没有意义

earliest_cr_line : 记录的是借款人发生第一笔借款的时间

issue_d : 贷款发行时间,这里提前向模型泄露了信息

last_pymnt_d、collection_recovery_fee、last_pymnt_amnt: 预测贷款违约模型是贷款前的风险控制手段,这些贷后信息都会影响我们训练模型的效果,在此将这些信息删除

drop_list = ['sub_grade', 'emp_title',  'title', 'zip_code', 'addr_state', 
             'mths_since_last_delinq' ,'initial_list_status','title','issue_d','last_pymnt_d','last_pymnt_amnt',
             'next_pymnt_d','last_credit_pull_d','policy_code','collection_recovery_fee', 'earliest_cr_line']
data.drop(drop_list, axis=1, inplace = True)
data.head()
idloan_amntfunded_amntfunded_amnt_invtermint_rateinstallmentgradeemp_lengthhome_ownership...num_tl_90g_dpd_24mnum_tl_op_past_12mpct_tl_nvr_dlqpercent_bc_gt_75pub_rec_bankruptciestax_lienstot_hi_cred_limtotal_bal_ex_morttotal_bc_limittotal_il_high_credit_limit
1882116342589845004500450036 months0.1612158.48CNaNRENT...02100.028.6004470010872328000.0
6123416190836620000200002000060 months0.2305564.39DNaNOWN...00100.033.30054349195721040022349.0
11978115990142710000100001000060 months0.1862257.32D6 yearsOWN...03100.00.0006907748184960049477.0
4920116229259121000210002100060 months0.1430491.91C< 1 yearRENT...00100.00.000109894666623380067194.0
5372716215420840000400004000060 months0.0819814.70A10+ yearsRENT...00100.050.0002073701609859800061725.0

5 rows × 91 columns

分类变量

objectColumns = data.select_dtypes(include=["object"]).columns
data[objectColumns].isnull().sum().sort_values(ascending=False)
emp_length             572
application_type         1
url                      1
total_acc                0
delinq_2yrs              0
purpose                  0
pymnt_plan               0
verification_status      0
annual_inc               0
home_ownership           0
grade                    0
term                     0
dtype: int64
# data['int_rate'] = data['int_rate'].str.rstrip('%').astype('float')
# data['revol_util'] = data['revol_util'].str.rstrip('%').astype('float')
# data['annual_inc'] = data['annual_inc'].str.replace(",","").astype('float')
import numpy as np
objectColumns = data.select_dtypes(include=["object"]).columns # 筛选数据类型为object的数据
data[objectColumns] = data[objectColumns].fillna("Unknown") #以分类“Unknown”填充缺失值
import missingno as msno
import matplotlib as mpl
mpl.rcParams['font.sans-serif']=[u'simHei']
mpl.rcParams['axes.unicode_minus']=False
%matplotlib inline
msno.bar(data[objectColumns]) #可视化
<matplotlib.axes._subplots.AxesSubplot at 0x2cacc08aa20>

在这里插入图片描述

mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0
    },
    "grade":{
        "A": 1,
        "B": 2,
        "C": 3,
        "D": 4,
        "E": 5,
        "F": 6,
        "G": 7
    }
}
data = data.replace(mapping_dict) #变量映射

数值类型缺失值

data.select_dtypes(include=[np.number]).isnull().sum().sort_values(ascending=False)
il_util                  883
mths_since_recent_inq    655
mo_sin_old_il_acct       203
mths_since_rcnt_il       203
bc_util                  109
                        ... 
total_cu_tl                0
inq_fi                     0
total_rev_hi_lim           0
total_bc_limit             0
id                         0
Length: 80, dtype: int64
numColumns = data.select_dtypes(include=[np.number]).columns
msno.matrix(data[numColumns]) #缺失值可视化
<matplotlib.axes._subplots.AxesSubplot at 0x2caecfe1160>

在这里插入图片描述

data.select_dtypes(include=[np.number])
idloan_amntfunded_amntfunded_amnt_invint_rateinstallmentgradeloan_statusdtifico_range_low...num_tl_90g_dpd_24mnum_tl_op_past_12mpct_tl_nvr_dlqpercent_bc_gt_75pub_rec_bankruptciestax_lienstot_hi_cred_limtotal_bal_ex_morttotal_bc_limittotal_il_high_credit_limit
188211634258984500450045000.1612158.483116.13705...02100.028.6004470010872328000.0
612341619083662000020000200000.2305564.394034.14735...00100.033.30054349195721040022349.0
1197811599014271000010000100000.1862257.324027.84680...03100.00.0006907748184960049477.0
492011622925912100021000210000.1430491.913121.82740...00100.00.000109894666623380067194.0
537271621542084000040000400000.0819814.701027.52700...00100.050.0002073701609859800061725.0
..................................................................
865471607199573000030000300000.0819611.03105.68740...02100.040.00036154846148945000.0
697341614014371600016000160000.1430549.183113.73660...0090.966.700213001502278006000.0
309471629680641600160016000.110252.402017.32715...01100.050.00063659418082720030259.0
290391630646081000010000100000.1240334.062022.91680...0266.70.00023002436479290060846.0
928721608381772300023000230000.1774580.81310.00800...00100.00.0008525506000.0

5973 rows × 80 columns

data.isnull().sum().sum()
mean_cols=data.mean()
data= data.fillna(mean_cols)

目标变量

y=data['loan_status']
x=data.drop(['loan_status'],axis=1)
#使用pandas库将类别变量编码
x =pd.get_dummies(x)
n_sample = y.shape[0]
n_pos_sample = y[y == 0].shape[0]
n_neg_sample = y[y == 1].shape[0]
print('样本个数:{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                   n_pos_sample / n_sample,
                                                   n_neg_sample / n_sample))
print('特征维数:', x.shape[1])
样本个数:5973; 正样本占66.97%; 负样本占33.03%
特征维数: 7167

特征工程

#数据进行分割(训练数据和测试数据)
from sklearn.model_selection  import train_test_split#测试集和训练集
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, train_size=0.8, random_state=14)
x_train, x_test, y_train, y_test = x_train1, x_test1, y_train1, y_test1
print ("训练数据集样本数目:%d, 测试数据集样本数目:%d" % (x_train.shape[0], x_test.shape[0]))
y_train = y_train.astype(np.int)
y_test = y_test.astype(np.int)
训练数据集样本数目:4778, 测试数据集样本数目:1195
#参数优化
from sklearn.pipeline import Pipeline #管道
from sklearn.model_selection import GridSearchCV #网格搜索交叉验证,用于选择最优的参数
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
pipes =Pipeline([
            ('mms', MinMaxScaler()), ## 归一化操作
            ('pca', PCA()), ## 降纬
            ('RandomForestClassifier', RandomForestClassifier(criterion='gini'))
        ])
# 参数
#
# estimators = [1,50,100,500]
# depth = [1,2,3,7,15]
parameters = [
    {
    "pca__n_components": [1,2,3,4],
    "RandomForestClassifier__n_estimators":[1,50,100,500],
    "RandomForestClassifier__max_depth":[1,2,3,7,15]
    }
]
#获取数据
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1
gscv = GridSearchCV(pipes, param_grid=parameters)
gscv.fit(x_train2, y_train2)
print ("score值:",gscv.best_score_,"最优参数列表:", gscv.best_params_)
score值: 0.6720405704396591 最优参数列表: {'RandomForestClassifier__max_depth': 7, 'RandomForestClassifier__n_estimators': 500, 'pca__n_components': 4}
#标准化
ss = MinMaxScaler()#分类模型,经常使用的是minmaxscaler归一化,回归模型经常用standardscaler
x_train = ss.fit_transform(x_train, y_train)
x_test = ss.transform(x_test)
x_train.shape
(4778, 7167)
#降维
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
x_train.shape
print(pca.explained_variance_ratio_)
[0.08187674 0.05705152 0.05380546 0.04683824]
#随机森林模型
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=2000, criterion='gini', max_depth=7, random_state=0)
forest.fit(x_train, y_train)#max_depth一般不宜设置过大,把每个模型作为一个弱分类器
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=7, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)
#模型效果评估
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
score = forest.score(x_test, y_test)
print ("准确率:%.2f%%" % (score * 100))
#模型预测
y_score = forest.predict(x_test)# prodict_proba输出概率
准确率:66.78%
# Compute ROC curve and ROC area for each class
import matplotlib.pyplot as plt
fpr,tpr,threshold = roc_curve(y_test, y_score) ###计算真正率和假正率
roc_auc = auc(fpr,tpr) ###计算auc的值
print('auc:%.2f'%(roc_auc))
auc:0.51
plt.figure()
lw = 2
plt.figure(figsize=(10,10))
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) ###假正率为横坐标,真正率为纵坐标做曲线
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
<Figure size 432x288 with 0 Axes>

在这里插入图片描述

决策树

#参数优化
from sklearn.tree import DecisionTreeClassifier
pipe = Pipeline([
            ('mms', MinMaxScaler()),
            ('pca', PCA()),
            ('decision', DecisionTreeClassifier(random_state=0))
        ])

# 参数
parameters = {
    "pca__n_components": [0.5,0.99],#设置为浮点数代表主成分方差所占最小比例的阈值
    "decision__criterion": ["gini", "entropy"],
    "decision__max_depth": [1,2,3,4,5,6,7,8,9,10]
}
#数据
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1
#模型构建:通过网格交叉验证,寻找最优参数列表, param_grid可选参数列表,cv:进行几折交叉验证
gscv = GridSearchCV(pipe, param_grid=parameters,cv=3)
#模型训练
gscv.fit(x_train2, y_train2)
#算法的最优解
print("最优参数列表:", gscv.best_params_)
print("score值:",gscv.best_score_)
最优参数列表: {'decision__criterion': 'gini', 'decision__max_depth': 4, 'pca__n_components': 0.99}
score值: 0.6917121178186392
#降维
from sklearn.decomposition import PCA
pca = PCA(n_components= 0.99)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
x_train.shape
print(pca.explained_variance_ratio_)
[0.34176263 0.23813938 0.22458996 0.19550803]
tree = DecisionTreeClassifier(criterion='gini', max_depth=4)
tree.fit(x_train, y_train) # fit模型训练
# 模型相关的指标输出
# print("训练集上的准确率:%.3f" % tree.score(x_train, y_train))
y_hat = tree.predict(x_test) # 获取预测值
print("准确率:%.3f" % (np.mean(y_hat == y_test)))
准确率:0.671
# Compute ROC curve and ROC area for each class
import matplotlib.pyplot as plt
fpr,tpr,threshold = roc_curve(y_test, y_hat) ###计算真正率和假正率
roc_auc = auc(fpr,tpr) ###计算auc的值
print('auc:%.2f'%(roc_auc))
auc:0.51
plt.figure()
lw = 2
plt.figure(figsize=(10,10))
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) ###假正率为横坐标,真正率为纵坐标做曲线
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
<Figure size 432x288 with 0 Axes>

在这里插入图片描述

;