Bootstrap

GBDT+LR记录 - 9.5 代码训练GBDT模型

GBDT+LR记录

9.5代码训练GBDT模型

1.数据集
在这里插入图片描述
2.代码ana_train

# -*- coding: utf-8 -*-

"""
==================================================
   File Name:     ana_train.data
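  Description :  Preprocessing stage for GBDT features. Tree models do not need continuous features to be discretized; only discrete features need 0/1 (one-hot) encoding.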
==================================================
"""

import sys
import numpy as np
import pandas as pd


def get_input(input_train_file, input_test_file):
    """Load the training and test CSV files into cleaned DataFrames.

    Both files are read with the first row as the header, ``"?"`` treated as
    a missing value, and only columns 0-14 except position 2 kept. Rows that
    contain any missing value are dropped.

    Args:
        input_train_file: Path or file-like object of the training CSV.
        input_test_file: Path or file-like object of the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    # NOTE(review): the key is spelled "hour-per-week"; confirm it matches
    # the header actually present in the CSV files.
    int_columns = ("age", "education-num", "capital-gain", "capital-loss", "hour-per-week")
    column_types = {name: np.int32 for name in int_columns}
    kept_positions = [pos for pos in range(15) if pos != 2]

    def _load(csv_file):
        # Read one file with the shared options, then drop incomplete rows.
        frame = pd.read_csv(
            csv_file,
            sep=",",
            header=0,
            dtype=column_types,
            na_values="?",
            usecols=kept_positions,
        )
        return frame.dropna(axis=0, how="any")

    train_frame = _load(input_train_file)
    test_frame = _load(input_test_file)
    return train_frame, test_frame


def label_trans(x):
    """Map a raw income label to a binary class string.

    Args:
        x: Raw label text such as ``"<=50K"`` or ``" >50K."``. Surrounding
            whitespace and a trailing period are ignored.

    Returns:
        ``"1"`` when the label is ``">50K"``; ``"0"`` for ``"<=50K"`` and for
        any unrecognized label.
    """
    # The test split of the UCI Adult data writes labels with a trailing
    # period (">50K."). Without normalizing it, every positive test sample
    # would silently fall through to "0".
    normalized = x.strip().rstrip(".").rstrip()
    return "1" if normalized == ">50K" else "0"


def process_label_feature(label_feature_str, df_in):
    """Convert the label column of ``df_in`` to "0"/"1" strings.

    Args:
        label_feature_str: Name of the label column.
        df_in: DataFrame holding that column; the column is overwritten.

    Returns:
        The same ``df_in`` object, with every label passed through
        ``label_trans``.
    """
    raw_labels = df_in.loc[:, label_feature_str]
    df_in.loc[:, label_feature_str] = raw_labels.apply(label_trans)
    return df_in





def dict_trans(dict_in):
    """Rank the keys of ``dict_in`` by descending value.

    Args:
        dict_in: Mapping from key to a sortable value (for example a count).

    Returns:
        A dict mapping each key to its 0-based rank, where the key with the
        largest value gets rank 0. Keys with equal values keep their
        original relative order.
    """
    ranked = sorted(dict_in.items(), key=lambda pair: pair[1], reverse=True)
    return {key: rank for rank, (key, _) in enumerate(ranked)}


def dis_to_feature(x, feature_dict):
    """One-hot encode ``x`` as a comma-separated string of 0/1 flags.

    Args:
        x: Category value to encode.
        feature_dict: Mapping from category value to its position in the
            encoding.

    Returns:
        A string with ``len(feature_dict)`` comma-separated flags. The flag
        at ``feature_dict[x]`` is ``1``; if ``x`` is not a key of
        ``feature_dict`` every flag is ``0``.
    """
    flags = ["0"] * len(feature_dict)
    if x in feature_dict:
        flags[feature_dict[x]] = "1"
    return ",".join(flags)

def process_dis_feature(label_feature_str, df_train, df_test):
    """One-hot encode a discrete column of both DataFrames.

    The category-to-position mapping is built from the training column
    only, ranked by descending frequency, and then applied to the same
    column of both frames. Each cell is replaced by the comma-separated
    0/1 string produced by ``dis_to_feature``.

    Args:
        label_feature_str: Name of the discrete column to encode.
        df_train: Training DataFrame; the column is overwritten.
        df_test: Test DataFrame; the column is overwritten.

    Returns:
        The number of distinct categories found in the training column,
        which is the width of the encoding.
    """
    category_counts = df_train.loc[:, label_feature_str].value_counts().to_dict()
    feature_dict = dict_trans(category_counts)
    for frame in (df_train, df_test):
        encoded = frame.loc[:, label_feature_str].apply(dis_to_feature, args=(feature_dict,))
        frame.loc[:, label_feature_str] = encoded
    return len(feature_dict)

;