GBDT+LR记录
9.5代码训练GBDT模型
1.数据集
2.代码ana_train
# -*- coding: utf-8 -*-
"""
==================================================
File Name: ana_train.data
Description : 树模型不需要对连续特征进行离散化,只需要对离散特征进行0/1(one-hot)编码;本文件是GBDT特征的预处理阶段
==================================================
"""
import sys
import numpy as np
import pandas as pd
def get_input(input_train_file, input_test_file):
    """Load the train and test CSV files as cleaned DataFrames.

    Both files are parsed the same way: comma separated, first row used as
    the header, cells equal to "?" treated as missing, and only the columns
    at positions 0-14 except position 2 are read.  Any row that still
    contains a missing value is dropped.

    Args:
        input_train_file: Location of the training CSV, handed to
            ``pd.read_csv``.
        input_test_file: Location of the test CSV, handed to ``pd.read_csv``.

    Returns:
        Tuple ``(train_data_df, test_data_df)``.
    """
    # Columns that are parsed as 32-bit integers.
    int_columns = (
        "age",
        "education-num",
        "capital-gain",
        "capital-loss",
        "hour-per-week",
    )
    column_types = {name: np.int32 for name in int_columns}
    # Positional column indices 0..14 with index 2 left out.
    kept_positions = [pos for pos in range(15) if pos != 2]

    def load_clean(csv_path):
        # Read one file and discard every row that has a missing value.
        frame = pd.read_csv(
            csv_path,
            sep=',',
            header=0,
            dtype=column_types,
            na_values="?",
            usecols=kept_positions,
        )
        return frame.dropna(axis=0, how="any")

    train_data_df = load_clean(input_train_file)
    test_data_df = load_clean(input_test_file)
    return train_data_df, test_data_df
def label_trans(x):
    """Map a raw income label to the string "0" or "1".

    Surrounding whitespace is ignored, and so is a trailing period: the test
    split of the UCI Adult data writes its labels as "<=50K." / ">50K.", and
    an exact comparison would silently turn every such positive label into
    "0".

    Args:
        x: Raw label text, e.g. " >50K" or ">50K.".

    Returns:
        "1" when the label is ">50K"; "0" for "<=50K" and, as a fallback,
        for any unrecognised label.
    """
    label = x.strip().rstrip(".")
    if label == ">50K":
        return "1"
    # "<=50K" and every unrecognised value share the negative class.
    return "0"
def process_label_feature(label_feature_str, df_in):
    """Re-encode the label column of ``df_in`` in place.

    Args:
        label_feature_str: Name of the label column.
        df_in: DataFrame containing that column; it is modified in place.

    Returns:
        The same ``df_in`` object, with every value of the label column
        replaced by the result of ``label_trans``.
    """
    encoded_labels = df_in.loc[:, label_feature_str].apply(label_trans)
    df_in.loc[:, label_feature_str] = encoded_labels
    # The label column stays in the frame: an earlier drop of 'wage-class'
    # at this point was commented out by a previous author.
    return df_in
def dict_trans(dict_in):
    """Turn a ``{key: count}`` mapping into a ``{key: rank}`` mapping.

    Keys are ranked by their count in descending order, starting at 0.
    Keys with equal counts keep their relative order from ``dict_in``.

    Args:
        dict_in: Mapping from key to a sortable value (e.g. a frequency).

    Returns:
        New dict mapping each key to its 0-based rank.
    """
    ranked_items = sorted(dict_in.items(), key=lambda pair: pair[1], reverse=True)
    return {key: rank for rank, (key, _count) in enumerate(ranked_items)}
def dis_to_feature(x, feature_dict):
    """One-hot encode ``x`` as a comma-separated string of 0/1 flags.

    Args:
        x: Category value to encode.
        feature_dict: Mapping from category value to its slot index.

    Returns:
        String with ``len(feature_dict)`` comma-joined flags.  The slot
        ``feature_dict[x]`` is "1" and the rest are "0"; a value missing
        from ``feature_dict`` yields all zeros, e.g. "0,0,0".
    """
    flags = ["0"] * len(feature_dict)
    if x in feature_dict:
        # Known category: switch on its slot.
        flags[feature_dict[x]] = "1"
    return ",".join(flags)
def process_dis_feature(label_feature_str, df_train, df_test):
    """One-hot encode one discrete column of both frames in place.

    The value -> slot mapping is built from the training frame only: its
    value counts are ranked by ``dict_trans``.  That one mapping is then
    applied to the column in both frames through ``dis_to_feature``.

    Args:
        label_feature_str: Name of the discrete column to encode.
        df_train: Training DataFrame; its column is overwritten.
        df_test: Test DataFrame; its column is overwritten with the
            mapping learned from ``df_train``.

    Returns:
        Number of entries in the mapping, i.e. the width of the encoding.
    """
    train_counts = df_train.loc[:, label_feature_str].value_counts().to_dict()
    slot_of = dict_trans(train_counts)

    # Same mapping for both frames, training frame first.
    for frame in (df_train, df_test):
        encoded = frame.loc[:, label_feature_str].apply(dis_to_feature, args=(slot_of,))
        frame.loc[:, label_feature_str] = encoded

    return len(slot_of)