机器学习笔记
摘自 Kaggle 初学者教程
所用数据集为一个房价预测的数据集
import pandas as pd
# Location of the training CSV for the house-price prediction dataset.
main_file_path = 'dataset/kaggle_house_prices/train.csv'
data = pd.read_csv(main_file_path)
# Drop missing values from the dataset (currently disabled).
# data=data.dropna(axis=0)
# Show the first few rows of the dataset.
print(data.head(4))
# Do a simple exploration of the dataset by printing summary statistics.
print(data.describe())
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \
0 1 60 RL 65.0 8450 Pave NaN Reg
1 2 20 RL 80.0 9600 Pave NaN Reg
2 3 60 RL 68.0 11250 Pave NaN IR1
3 4 70 RL 60.0 9550 Pave NaN IR1
LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal \
0 Lvl AllPub ... 0 NaN NaN NaN 0
1 Lvl AllPub ... 0 NaN NaN NaN 0
2 Lvl AllPub ... 0 NaN NaN NaN 0
3 Lvl AllPub ... 0 NaN NaN NaN 0
MoSold YrSold SaleType SaleCondition SalePrice
0 2 2008 WD Normal 208500
1 5 2007 WD Normal 181500
2 9 2008 WD Normal 223500
3 2 2006 WD Abnorml 140000
[4 rows x 81 columns]
Id MSSubClass LotFrontage LotArea OverallQual \
count 1460.000000 1460.000000 1201.000000 1460.000000 1460.000000
mean 730.500000 56.897260 70.049958 10516.828082 6.099315
std 421.610009 42.300571 24.284752 9981.264932 1.382997
min 1.000000 20.000000 21.000000 1300.000000 1.000000
25% 365.750000 20.000000 59.000000 7553.500000 5.000000
50% 730.500000 50.000000 69.000000 9478.500000 6.000000
75% 1095.250000 70.000000 80.000000 11601.500000 7.000000
max 1460.000000 190.000000 313.000000 215245.000000 10.000000
OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 \
count 1460.000000 1460.000000 1460.000000 1452.000000 1460.000000
mean 5.575342 1971.267808 1984.865753 103.685262 443.639726
std 1.112799 30.202904 20.645407 181.066207 456.098091
min 1.000000 1872.000000 1950.000000 0.000000 0.000000
25% 5.000000 1954.000000 1967.000000 0.000000 0.000000
50% 5.000000 1973.000000 1994.000000 0.000000 383.500000
75% 6.000000 2000.000000 2004.000000 166.000000 712.250000
max 9.000000 2010.000000 2010.000000 1600.000000 5644.000000
... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \
count ... 1460.000000 1460.000000 1460.000000 1460.000000
mean ... 94.244521 46.660274 21.954110 3.409589
std ... 125.338794 66.256028 61.119149 29.317331
min ... 0.000000 0.000000 0.000000 0.000000
25% ... 0.000000 0.000000 0.000000 0.000000
50% ... 0.000000 25.000000 0.000000 0.000000
75% ... 168.000000 68.000000 0.000000 0.000000
max ... 857.000000 547.000000 552.000000 508.000000
ScreenPorch PoolArea MiscVal MoSold YrSold \
count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000
mean 15.060959 2.758904 43.489041 6.321918 2007.815753
std 55.757415 40.177307 496.123024 2.703626 1.328095
min 0.000000 0.000000 0.000000 1.000000 2006.000000
25% 0.000000 0.000000 0.000000 5.000000 2007.000000
50% 0.000000 0.000000 0.000000 6.000000 2008.000000
75% 0.000000 0.000000 0.000000 8.000000 2009.000000
max 480.000000 738.000000 15500.000000 12.000000 2010.000000
SalePrice
count 1460.000000
mean 180921.195890
std 79442.502883
min 34900.000000
25% 129975.000000
50% 163000.000000
75% 214000.000000
max 755000.000000
[8 rows x 38 columns]
# Show all column names in the dataset.
print(data.columns)
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition', 'SalePrice'],
dtype='object')
# Pull out the sale-price column, which is the value to be predicted.
salePrice=data.SalePrice
# Preview the first rows of the target column.
print(salePrice.head())
0 208500
1 181500
2 223500
3 140000
4 250000
Name: SalePrice, dtype: int64
# Keep only a hand-picked subset of the dataset's columns as features.
columns = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = data[columns]
# The prediction target is the sale-price column.
y = data['SalePrice']
from sklearn.tree import DecisionTreeRegressor
# Define model: a simple decision tree regressor, constructed with no arguments.
decision_model=DecisionTreeRegressor()
# Fit model: train on the selected feature columns X and the target y.
decision_model.fit(X,y)
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
max_leaf_nodes=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best')
# Predict on the first 5 rows and compare the predictions with the real values.
# These rows come from the same X the model was fitted on, so this is a sanity
# check rather than a measure of how well the model generalises.
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(decision_model.predict(X.head()))
print("The real y is:")
print(y.head())
Making predictions for the following 5 houses:
LotArea YearBuilt 1stFlrSF 2ndFlrSF FullBath BedroomAbvGr \
0 8450 2003 856 854 2 3
1 9600 1976 1262 0 2 3
2 11250 2001 920 866 2 3
3 9550 1915 961 756 1 3
4 14260 2000 1145 1053 2 4
TotRmsAbvGrd
0 8
1 6
2 6
3 7
4 9
The predictions are
[ 208500. 181500. 223500. 140000. 250000.]
The real y is:
0 208500
1 181500
2 223500
3 140000
4 250000
Name: SalePrice, dtype: int64
#使用MAE度量模型的误差
from sklearn.metrics import mean_absolute_error
# The decision tree's max_leaf_nodes parameter affects model performance;
# this helper is used to explore that parameter.
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error for one tree size.

    A ``DecisionTreeRegressor`` limited to ``max_leaf_nodes`` leaves is fitted
    on the training data and scored on the validation data.

    Args:
        max_leaf_nodes: Value passed to the regressor's ``max_leaf_nodes``.
        train_X: Feature rows used to fit the model.
        val_X: Feature rows the fitted model predicts on.
        train_y: Target values matching ``train_X``.
        val_y: True target values matching ``val_X``.

    Returns:
        The result of ``mean_absolute_error(val_y, predictions)``.
    """
    # random_state is pinned so repeated calls with the same data agree.
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    return mean_absolute_error(val_y, model.predict(val_X))
# In practice, the original dataset should be divided into a training set and
# a test set: the training set is used to fit the model, the test set is used
# to measure how the model performs.
from sklearn.model_selection import train_test_split
# Split data into training and validation data, for both predictors and target.
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y,random_state = 0)
# Measure model performance for several values of the max_leaf_nodes parameter.
for max_leaf_nodes in [10, 100, 200, 300]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # %d prints the MAE as a whole number.
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" %(max_leaf_nodes, my_mae))
Max leaf nodes: 10 Mean Absolute Error: 30616
Max leaf nodes: 100 Mean Absolute Error: 28653
Max leaf nodes: 200 Mean Absolute Error: 30281
Max leaf nodes: 300 Mean Absolute Error: 32188
# Next, try a somewhat more complex model: a random forest.
# This model performs well even with its default parameter settings.
from sklearn.ensemble import RandomForestRegressor
forest_model=RandomForestRegressor()
# Fit on the training split and score on the validation split using MAE.
forest_model.fit(train_X,train_y)
preds=forest_model.predict(val_X)
print(mean_absolute_error(val_y,preds))
24104.5582648
# From here on, use the dataset's own training file and test file directly to
# train and evaluate the model. Start by loading the training file.
train = pd.read_csv('dataset/kaggle_house_prices/train.csv')

# Hand-picked subset of columns used as predictors.
predictor_cols = [
    'LotArea',
    'OverallQual',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# Pull the data into predictors (X) and target (y).
train_X = train[predictor_cols]
train_y = train['SalePrice']

# Train a random forest model.
my_model = RandomForestRegressor()
my_model.fit(train_X, train_y)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=None, verbose=0, warm_start=False)
# Read the test data.
test=pd.read_csv('dataset/kaggle_house_prices/test.csv')
# Treat the test data in the same way as the training data. In this case, pull
# the same columns: they must match the columns selected for training.
test_X=test[predictor_cols]
# Use the model to make predictions on the test set.
predicted_prices=my_model.predict(test_X)
# Look at the predicted prices to ensure we have something sensible.
print(predicted_prices)
[ 117100. 139600. 158100. ..., 160405. 119220. 235250.]
# Build a new DataFrame holding the prediction results: the house Id and the
# predicted sale price.
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_prices})
# Save the predictions to a new CSV file. to_csv does not create missing
# parent directories, so make sure the output folder exists first.
import os
os.makedirs('output', exist_ok=True)
my_submission.to_csv('output/submission.csv', index=False)