KaggleのHouse Prices CompetitionをXGBoostで解く

(2019-07-09)

以前TitanicをやったXGBoostでHouse Prices Competitionに挑戦する。

KaggleのTitanicのチュートリアルをXGBoostで解く - sambaiz-net

import pandas as pd

# Load the Kaggle House Prices training and test sets.
DATA_DIR = 'house-prices'
df_train = pd.read_csv(DATA_DIR + '/train.csv')
df_test = pd.read_csv(DATA_DIR + '/test.csv')

欠損値の処理

以前確認したように欠損値が含まれるので一つずつ見ていって埋めていく。

KaggleのHome Prices CompetitionのKernelからデータの探り方を学ぶ - sambaiz-net

import numpy as np

def fillna(df):
  """Fill the missing values of the House Prices dataframe in place.

  For most categorical columns a NaN means the house simply does not have
  the feature (no pool, no alley, no garage, ...), so a dedicated "None"
  category is used rather than dropping rows or imputing a mode.
  Numeric absence columns get 0; LotFrontage gets the column median.

  Returns the (mutated) dataframe.
  """
  # Categorical columns where NaN == "feature absent".
  # (The original one-column-at-a-time version also printed the unique
  # values of each column; that one-time exploration output is dropped.)
  none_cols = [
      'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
      'GarageCond', 'GarageType', 'GarageFinish', 'GarageQual',
      'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond',
      'BsmtQual', 'MasVnrType', 'Electrical',
  ]
  for col in none_cols:
    df[col] = df[col].fillna('None')

  # LotFrontage: fill ONLY the missing entries with the column median.
  # BUG FIX: the original assigned the median to the whole column
  # (df["LotFrontage"] = df["LotFrontage"].median()), wiping out every
  # observed value.
  df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())

  # Numeric "feature absent" columns: no garage -> no build year,
  # no masonry veneer -> zero veneer area.
  df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)
  df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
  return df

ハイパーパラメータの最適化

ベイズ最適化でハイパーパラメータを決める。 他にもいろいろなパラメータがあるが、やみくもに増やしても提出した後のスコアが良くならなかった。

ベイズ最適化でランダムフォレストとXGBoostの良いハイパーパラメータを探す - sambaiz-net

! pip install bayesian-optimization

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization

# Number of boosting rounds, shared by the search and the final model.
num_boost_round = 500

def optimize_params(df):
  """Search XGBoost hyperparameters with Bayesian optimization.

  df: training dataframe that still contains the SalePrice target.
  Returns the best parameter dict found (bo.max['params']), whose keys
  can be passed straight to xgb.train.
  """
  def train(
      learning_rate,
      colsample_bytree,
      subsample):
    # Fresh random 70/30 hold-out split on every evaluation.
    train_x = df.drop('SalePrice', axis=1)
    train_y = df.SalePrice
    (train_x, test_x, train_y, test_y) = train_test_split(train_x, train_y, test_size=0.3)
    dtrain = xgb.DMatrix(train_x, label=train_y)
    # BUG FIX: XGBoost spells this parameter "subsample"; the original
    # 'sub_sample' key was silently ignored, so that search dimension
    # had no effect on training at all.
    param = {
        'learning_rate': learning_rate,
        'colsample_bytree': colsample_bytree,
        'subsample': subsample}
    bst = xgb.train(param, dtrain, num_boost_round)
    preds = bst.predict(xgb.DMatrix(test_x))
    # Competition metric: RMSE between log(prediction) and log(price).
    # Negated because BayesianOptimization maximizes its objective;
    # clip guards log() against zero/negative predictions.
    return -np.sqrt(mean_squared_error(
        np.log(np.clip(np.nan_to_num(test_y), 1e-6, None)),
        np.log(np.clip(np.nan_to_num(preds), 1e-6, None))
    ))
  bo = BayesianOptimization(
    train,
    {'learning_rate': (0.01, 0.5),  # default=0.3
     'colsample_bytree': (0.1, 1.0),  # default=1
     'subsample': (0.1, 1.0),  # default=1
    })
  bo.maximize(n_iter=50, alpha=1e-5)
  return bo.max['params']

params = optimize_params(df_train_filled)

結果こんな感じ。

print(params)
# => {'colsample_bytree': 0.485528388652188, 'learning_rate': 0.010027583064377935, 'sub_sample': 0.10000002653848854}

実行

pd.get_dummies()でカテゴリカル変数をone hot vectorに変換し、学習して予測結果を出力する。

カテゴリカル変数をLabel/OneHotEncoderやget_dummiesで変換する - sambaiz-net

# Fill missing values, one-hot encode the categoricals, then train on the
# full training set with the tuned hyperparameters.
df_train_filled = fillna(df_train)
df_train_filled = pd.get_dummies(df_train_filled)
params = optimize_params(df_train_filled)
feature_cols = df_train_filled.drop('SalePrice', axis=1).columns
bst = xgb.train(
    params,
    # .as_matrix() was removed in pandas 1.0 — .to_numpy() is the replacement.
    xgb.DMatrix(df_train_filled[feature_cols].to_numpy(), label=df_train_filled.SalePrice),
    num_boost_round)

df_test_filled = fillna(df_test)
df_test_filled = pd.get_dummies(df_test_filled)
# Align the test features with the training features: add any dummy column
# the test set lacks (filled with 0), drop test-only dummies, and put the
# columns in the SAME ORDER — a DMatrix built from a bare numpy array
# matches features by position only. The original compared column names
# against the raw (un-dummied) df_test and never fixed the ordering.
df_test_filled = df_test_filled.reindex(columns=feature_cols, fill_value=0)
preds = bst.predict(xgb.DMatrix(df_test_filled.to_numpy()))
submit_data = pd.Series(preds, name='SalePrice', index=df_test['Id'])
submit_data.to_csv('submit.csv', header=True)

スコアは 0.16164 だった。学習時のスコアは 0.11 ほどだったのに、かなり悪化してしまった（過学習の疑いがある）。