# KaggleのHouse Prices CompetitionをXGBoostで解く

(2019-07-09)

KaggleのTitanicのチュートリアルをXGBoostで解く - sambaiz-net

``````import pandas as pd
``````

## 欠損値の処理

KaggleのHome Prices CompetitionのKernelからデータの探り方を学ぶ - sambaiz-net

``````import numpy as np

def fillna(df):
# PoolQC: Pool quality
print(np.unique(df['PoolQC'].values.tolist())) # ['Ex' 'Fa' 'Gd' 'nan']
df["PoolQC"] = df["PoolQC"].fillna("None")

# MiscFeature: Miscellaneous feature not covered in other categories
print(np.unique(df['MiscFeature'].values.tolist())) # ['Gar2' 'Othr' 'Shed' 'TenC' 'nan']
df["MiscFeature"] = df["MiscFeature"].fillna("None")

# Alley: Type of alley access
print(np.unique(df['Alley'].values.tolist())) # ['Grvl' 'Pave' 'nan']
df["Alley"] = df["Alley"].fillna("None")

# Fence: Fence quality
print(np.unique(df['Fence'].values.tolist())) # ['GdPrv' 'GdWo' 'MnPrv' 'MnWw' 'nan']
df["Fence"] = df["Fence"].fillna("None")

# FireplaceQu: Fireplace quality
print(np.unique(df['FireplaceQu'].values.tolist())) # ['Ex' 'Fa' 'Gd' 'Po' 'TA' 'nan']
df["FireplaceQu"] = df["FireplaceQu"].fillna("None")

# LotFrontage: Linear feet of street connected to property
print(df["LotFrontage"].describe())
'''
mean       70.049958
std        24.284752
min        21.000000
25%        59.000000
50%        69.000000
75%        80.000000
max       313.000000
'''
df["LotFrontage"] = df["LotFrontage"].median()

# GarageCond: Garage condition
print(np.unique(df["GarageCond"].values.tolist())) # ['Ex' 'Fa' 'Gd' 'Po' 'TA' 'nan']
df["GarageCond"] = df["GarageCond"].fillna("None")

# GarageType: Garage location
print(np.unique(df["GarageType"].values.tolist())) # ['2Types' 'Attchd' 'Basment' 'BuiltIn' 'CarPort' 'Detchd' 'nan']
df["GarageType"] = df["GarageType"].fillna("None")

# GarageYrBlt: Year garage was built
print(np.unique(df["GarageYrBlt"].values.tolist())) # [1900. 1906. ... 2008. 2009.  2010.  nan]
df["GarageYrBlt"] = df["GarageYrBlt"].fillna(0)

# GarageFinish: Interior finish of the garage
print(np.unique(df["GarageFinish"].values.tolist())) # ['Fin' 'RFn' 'Unf' 'nan']
df["GarageFinish"] = df["GarageFinish"].fillna("None")

# GarageQual: Garage quality
print(np.unique(df["GarageQual"].values.tolist())) # ['Ex' 'Fa' 'Gd' 'Po' 'TA' 'nan']
df["GarageQual"] = df["GarageQual"].fillna("None")

# BsmtExposure: Walkout or garden level basement walls
print(np.unique(df["BsmtExposure"].values.tolist())) # ['Av' 'Gd' 'Mn' 'No' 'nan']
df["BsmtExposure"] = df["BsmtExposure"].fillna("None")

# BsmtFinType2: Quality of second finished area (if present)
print(np.unique(df["BsmtFinType2"].values.tolist())) # ['ALQ' 'BLQ' 'GLQ' 'LwQ' 'Rec' 'Unf' 'nan']
df["BsmtFinType2"] = df["BsmtFinType2"].fillna("None")

# BsmtFinType1: Quality of basement finished area
print(np.unique(df["BsmtFinType1"].values.tolist())) # ['ALQ' 'BLQ' 'GLQ' 'LwQ' 'Rec' 'Unf' 'nan']
df["BsmtFinType1"] = df["BsmtFinType1"].fillna("None")

# BsmtCond: General condition of the basement
print(np.unique(df["BsmtCond"].values.tolist())) # ['Fa' 'Gd' 'Po' 'TA' 'nan']
df["BsmtCond"] = df["BsmtCond"].fillna("None")

# BsmtQual: Height of the basement
print(np.unique(df["BsmtQual"].values.tolist())) # ['Ex' 'Fa' 'Gd' 'TA' 'nan']
df["BsmtQual"] = df["BsmtQual"].fillna("None")

# MasVnrArea: Masonry veneer area in square feet
print(df["MasVnrArea"].describe())
'''
count     1460.0
unique     328.0
top          0.0
freq       861.0
'''
df["MasVnrArea"] = df["MasVnrArea"].fillna(0)

# MasVnrType: Masonry veneer type
print(np.unique(df["MasVnrType"].values.tolist())) # ['BrkCmn' 'BrkFace' 'None' 'Stone' 'nan']
print(df["MasVnrType"].describe())
'''
count     1452
unique       4
top       None
freq       864
'''
df["MasVnrType"] = df["MasVnrType"].fillna("None")

# Electrical: Electrical system
print(np.unique(df["Electrical"].values.tolist())) # ['FuseA' 'FuseF' 'FuseP' 'Mix' 'SBrkr' 'nan']
df["Electrical"] = df["Electrical"].fillna("None")
return df
``````

## ハイパーパラメータの最適化

ベイズ最適化でハイパーパラメータを決める。 他にもいろいろなパラメータがあるが、やみくもに増やしても提出した後のスコアが良くならなかった。

ベイズ最適化でランダムフォレストとXGBoostの良いハイパーパラメータを探す - sambaiz-net

``````! pip install bayesian-optimization

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization

num_boost_round=500

def optimize_params(df):
def train(
learning_rate,
colsample_bytree,
sub_sample):
train_x = df.drop('SalePrice', axis=1)
train_y = df.SalePrice
(train_x, test_x ,train_y, test_y) = train_test_split(train_x, train_y, test_size = 0.3)
dtrain = xgb.DMatrix(train_x, label=train_y)
param = {
'learning_rate': learning_rate,
'colsample_bytree': colsample_bytree,
'sub_sample': sub_sample}
bst = xgb.train(param, dtrain, num_boost_round)
preds = bst.predict(xgb.DMatrix(test_x))
# Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price
return -np.sqrt(mean_squared_error(
np.log(np.clip(np.nan_to_num(test_y), 1e-6, None)),
np.log(np.clip(np.nan_to_num(preds), 1e-6, None))
))
bo = BayesianOptimization(
train,
{'learning_rate': (0.01, 0.5), # default=0.3
'colsample_bytree': (0.1, 1.0),  # default=1
'sub_sample': (0.1, 1.0), # default=1
})
bo.maximize(n_iter=50, alpha=1e-5)
return bo.max['params']

params = optimize_params(df_train_filled)
``````

``````print(params)
# => {'colsample_bytree': 0.485528388652188, 'learning_rate': 0.010027583064377935, 'sub_sample': 0.10000002653848854}
``````

## 実行

`pd.get_dummies()`でカテゴリカル変数をone hot vectorに変換し、学習して予測結果を出力する。

カテゴリカル変数をLabel/OneHotEncoderやget_dummiesで変換する - sambaiz-net

``````df_train_filled = fillna(df_train)
df_train_filled = pd.get_dummies(df_train_filled)
params = optimize_params(df_train_filled)
bst = xgb.train(
params,
xgb.DMatrix(df_train_filled.drop('SalePrice', axis=1).as_matrix(), label=df_train_filled.SalePrice),
num_boost_round)

df_test_filled = fillna(df_test)
df_test_filled = pd.get_dummies(df_test_filled)
for miss_key in [key for key in df_train_filled.drop('SalePrice', axis=1).keys() if key not in df_test]:
df_test_filled[miss_key] = 0
preds = bst.predict(xgb.DMatrix(df_test_filled.as_matrix()))
submit_data =  pd.Series(preds, name='SalePrice', index=df_test['Id'])
スコアは `0.16164` だった。学習時のスコアは `0.11` ほどだったのにかなり下がってしまった。