Bootstrap

机器学习笔记-房价预测实例

机器学习笔记

摘自 Kaggle 初学者教程
所用数据集为一个房价预测的数据集

import pandas as pd

# Path to the Kaggle house-prices training CSV, relative to the working directory.
main_file_path = 'dataset/kaggle_house_prices/train.csv'
data = pd.read_csv(main_file_path)
# Optional step (left disabled): drop every row that contains a missing value.
# data=data.dropna(axis=0)
# Show the first 4 rows of the dataset.
print(data.head(4))
# Quick exploration: print summary statistics (count, mean, std, min,
# quartiles, max) for the dataset.
print(data.describe())
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   

  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2   2008        WD         Normal     208500  
1      5   2007        WD         Normal     181500  
2      9   2008        WD         Normal     223500  
3      2   2006        WD        Abnorml     140000  

[4 rows x 81 columns]
                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726   
std       1.112799    30.202904     20.645407   181.066207   456.098091   
min       1.000000  1872.000000   1950.000000     0.000000     0.000000   
25%       5.000000  1954.000000   1967.000000     0.000000     0.000000   
50%       5.000000  1973.000000   1994.000000     0.000000   383.500000   
75%       6.000000  2000.000000   2004.000000   166.000000   712.250000   
max       9.000000  2010.000000   2010.000000  1600.000000  5644.000000   

           ...         WoodDeckSF  OpenPorchSF  EnclosedPorch    3SsnPorch  \
count      ...        1460.000000  1460.000000    1460.000000  1460.000000   
mean       ...          94.244521    46.660274      21.954110     3.409589   
std        ...         125.338794    66.256028      61.119149    29.317331   
min        ...           0.000000     0.000000       0.000000     0.000000   
25%        ...           0.000000     0.000000       0.000000     0.000000   
50%        ...           0.000000    25.000000       0.000000     0.000000   
75%        ...         168.000000    68.000000       0.000000     0.000000   
max        ...         857.000000   547.000000     552.000000   508.000000   

       ScreenPorch     PoolArea       MiscVal       MoSold       YrSold  \
count  1460.000000  1460.000000   1460.000000  1460.000000  1460.000000   
mean     15.060959     2.758904     43.489041     6.321918  2007.815753   
std      55.757415    40.177307    496.123024     2.703626     1.328095   
min       0.000000     0.000000      0.000000     1.000000  2006.000000   
25%       0.000000     0.000000      0.000000     5.000000  2007.000000   
50%       0.000000     0.000000      0.000000     6.000000  2008.000000   
75%       0.000000     0.000000      0.000000     8.000000  2009.000000   
max     480.000000   738.000000  15500.000000    12.000000  2010.000000   

           SalePrice  
count    1460.000000  
mean   180921.195890  
std     79442.502883  
min     34900.000000  
25%    129975.000000  
50%    163000.000000  
75%    214000.000000  
max    755000.000000  

[8 rows x 38 columns]
# Show all column names in the dataset.
print(data.columns)
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice'],
      dtype='object')
# Pull out the column we want to predict: the sale price.
salePrice=data.SalePrice
# Peek at the first few target values.
print(salePrice.head())
0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64
# Select a subset of the dataset's columns to use as features.
columns=['LotArea','YearBuilt','1stFlrSF','2ndFlrSF','FullBath','BedroomAbvGr','TotRmsAbvGrd']

# X holds the selected feature columns; y is the prediction target (SalePrice).
X=data[columns]
y=data.SalePrice

from sklearn.tree import DecisionTreeRegressor

# Define the model: a simple decision tree with default hyperparameters.
decision_model=DecisionTreeRegressor()

# Fit the model on the full dataset (no hold-out split at this point).
decision_model.fit(X,y)
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
# Predict for the first 5 rows and compare the predictions with the true values.
# NOTE: these rows were part of the data the model was fitted on, so this is an
# in-sample check, not an estimate of accuracy on unseen houses.
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(decision_model.predict(X.head()))
print("The real y is:")
print(y.head())
Making predictions for the following 5 houses:
   LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  \
0     8450       2003       856       854         2             3   
1     9600       1976      1262         0         2             3   
2    11250       2001       920       866         2             3   
3     9550       1915       961       756         1             3   
4    14260       2000      1145      1053         2             4   

   TotRmsAbvGrd  
0             8  
1             6  
2             6  
3             7  
4             9  
The predictions are
[ 208500.  181500.  223500.  140000.  250000.]
The real y is:
0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64
#使用MAE度量模型的误差
from sklearn.metrics import mean_absolute_error
# The decision tree's max_leaf_nodes parameter affects model performance;
# this helper makes it easy to explore that parameter.
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``train_X``/``train_y``, used to predict ``val_X``, and scored against
    ``val_y`` with ``mean_absolute_error``.

    Args:
        max_leaf_nodes: Value forwarded to the regressor's ``max_leaf_nodes``.
        train_X: Training features.
        val_X: Validation features.
        train_y: Training targets.
        val_y: Validation targets.

    Returns:
        The mean absolute error of the validation predictions.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))
# In practice, the original dataset should be split into a training set and a test set:
# the training set is used to fit the model, the test set to measure its performance.
from sklearn.model_selection import train_test_split
# split data into training and validation data, for both predictors and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y,random_state = 0)
# Evaluate model performance for several values of max_leaf_nodes.
for max_leaf_nodes in [10,100,200,300]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # %d truncates the float MAE to an integer for display.
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))
Max leaf nodes: 10           Mean Absolute Error:  30616
Max leaf nodes: 100          Mean Absolute Error:  28653
Max leaf nodes: 200          Mean Absolute Error:  30281
Max leaf nodes: 300          Mean Absolute Error:  32188
# Next, try a somewhat more complex model: a random forest.
# It performs well even with its default parameter settings.
from sklearn.ensemble import RandomForestRegressor

# NOTE: no random_state is passed here, so the score may differ between runs.
forest_model=RandomForestRegressor()
forest_model.fit(train_X,train_y)
preds=forest_model.predict(val_X)
# Mean absolute error of the forest on the validation split.
print(mean_absolute_error(val_y,preds))
24104.5582648
# Next, train on the training file that ships with the dataset; the separate
# test file is loaded further below and used for prediction.
# Load train.csv (the same path that was read at the start).
train=pd.read_csv('dataset/kaggle_house_prices/train.csv')

#pull data into target (y) and predictors (X)
# NOTE: this rebinds train_y (and train_X below), replacing the values produced
# by the earlier train/validation split.
train_y=train.SalePrice
# Select a subset of columns to use as predictors.
predictor_cols=['LotArea','OverallQual','YearBuilt','1stFlrSF','2ndFlrSF','FullBath','BedroomAbvGr','TotRmsAbvGrd']

#Create training predictors data
train_X=train[predictor_cols]

my_model=RandomForestRegressor()
# Fit the random forest model on the whole training file.
my_model.fit(train_X,train_y)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
# Read the test data.
test=pd.read_csv('dataset/kaggle_house_prices/test.csv')
# Treat the test data in the same way as the training data: select exactly the
# same predictor columns that were used for training.
test_X=test[predictor_cols]
# Use the model to make predictions on the test set.
predicted_prices=my_model.predict(test_X)
# We will look at the predicted prices to ensure we have something sensible.
print(predicted_prices)
[ 117100.  139600.  158100. ...,  160405.  119220.  235250.]
# Build a new DataFrame holding the results: each test house's Id paired with
# its predicted SalePrice.
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_prices})

# to_csv does not create missing parent directories, so make sure 'output/'
# exists before writing (a no-op when it is already there).
import os
os.makedirs('output', exist_ok=True)

# Save the predictions to a new CSV file; index=False leaves out the
# DataFrame's index so the file contains only the Id and SalePrice columns.
my_submission.to_csv('output/submission.csv', index=False)
;