TensorFlow2のKeras APIでTitanicのモデルを作る

2020-08-08 python tensorflow machinelearning

データセット

$ pip install tensorflow-datasets

tfds.load()して tf.data.Datasetを作る。 tf.data はCPUやGPUがなるべくアイドル状態にならないようにする効率的な入力パイプラインを構築するAPI。

TensorFlowのtf.data API - sambaiz-net

import tensorflow as tf
import tensorflow_datasets as tfds

# Build the training split of the TFDS 'titanic' dataset as a tf.data.Dataset
# (input files shuffled) and show the structure of a single element.
ds_train = tfds.load(
    name='titanic',
    split='train',
    shuffle_files=True,
)
print(ds_train.element_spec)
'''
{'features': {'age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'boat': TensorSpec(shape=(), dtype=tf.string, name=None), 'body': TensorSpec(shape=(), dtype=tf.int32, name=None), 'cabin': TensorSpec(shape=(), dtype=tf.string, name=None), 'embarked': TensorSpec(shape=(), dtype=tf.int64, name=None), 'fare': TensorSpec(shape=(), dtype=tf.float32, name=None), 'home.dest': TensorSpec(shape=(), dtype=tf.string, name=None), 'name': TensorSpec(shape=(), dtype=tf.string, name=None), 'parch': TensorSpec(shape=(), dtype=tf.int32, name=None), 'pclass': TensorSpec(shape=(), dtype=tf.int64, name=None), 'sex': TensorSpec(shape=(), dtype=tf.int64, name=None), 'sibsp': TensorSpec(shape=(), dtype=tf.int32, name=None), 'ticket': TensorSpec(shape=(), dtype=tf.string, name=None)}, 'survived': TensorSpec(shape=(), dtype=tf.int64, name=None)}
'''

1000個分のバッファでshuffle()し、10個入りのbatch()にまとめ、2バッチ分をprefetch()しておく。そこからtake(1)で1バッチだけ取り出して中身を表示してみる。

# Shuffle through a 1000-element buffer, group into batches of 10 and keep
# 2 batches prefetched.
ds_train = ds_train.shuffle(1000).batch(10).prefetch(2)

# Print three feature columns of the first batch. Output recorded from one run:
#   age:  tf.Tensor([22.    0.75 -1.   28.   43.   -1.   29.   33.   39.   17.  ], shape=(10,), dtype=float32)
#   sex:  tf.Tensor([0 1 0 1 1 1 0 1 1 1], shape=(10,), dtype=int64)
#   fare: tf.Tensor([ 7.25   19.2583  7.75   12.65   55.4417  7.8792 21.     15.85   55.9   12.    ], shape=(10,), dtype=float32)
for data in ds_train.take(1):
    for column in ('age', 'sex', 'fare'):
        print(data['features'][column])

ageに-1の欠損値があるようなので平均値に書き換える。

def fillmiss(feature: str, value):
    """Build a ``Dataset.map`` function that replaces -1 placeholders in a feature.

    Args:
        feature: Key inside ``x['features']`` whose values are patched.
        value: Replacement for entries equal to -1; cast to the feature's dtype.

    Returns:
        A function that takes a dataset element ``x`` (a dict holding a
        ``'features'`` dict) and returns a copy with the feature patched.
    """
    def _fillmiss(x):
        column = x['features'][feature]
        # tf.where selects element-wise, so this handles a scalar element and
        # a batched one alike; a Python `if` on a batched tensor has no single
        # truth value to branch on.
        patched = tf.where(
            tf.equal(column, tf.cast(-1.0, column.dtype)),
            tf.cast(value, column.dtype),
            column,
        )
        # Return fresh dicts rather than mutating the element passed in.
        features = dict(x['features'])
        features[feature] = patched
        return {**x, 'features': features}

    return _fillmiss

def _accumulate_known_age(state, x):
    """Add this element's known (non -1) ages and their count to ``state``.

    ``state`` is a ``(sum_of_ages, count)`` pair. Missing ages are masked out
    instead of filtered, so the element may be a scalar or a batch.
    """
    age = x['features']['age']
    known = tf.not_equal(age, -1.0)
    total = state[0] + tf.reduce_sum(tf.where(known, age, tf.zeros_like(age)))
    count = state[1] + tf.reduce_sum(tf.cast(known, tf.int64))
    return total, count


# One pass over the dataset yields both the sum and the count of known ages,
# instead of materialising the dataset with list() and then reducing it again.
age_total, known_count = ds_train.reduce(
    (tf.constant(0.0), tf.constant(0, dtype=tf.int64)),
    _accumulate_known_age,
)
ds_size = int(known_count)  # number of passengers whose age is known
avg_age = age_total / ds_size
ds_train = ds_train.map(fillmiss('age', avg_age))

モデル

tf.keras.Sequentialで適当にレイヤーを積んだ tf.keras.Modelを作り、 Optimizerと損失関数を渡してcompile()する。

def model(feature_columns, dropout: float = 0.2):
    """Build and compile a small feed-forward classifier with two outputs.

    Args:
        feature_columns: ``tf.feature_column`` definitions given to the
            ``DenseFeatures`` input layer.
        dropout: Rate for the dropout layer between the two hidden layers.
            The original snippet referenced an undefined ``dropout`` name;
            the 0.2 default is a placeholder choice -- TODO confirm the
            intended rate.

    Returns:
        A ``tf.keras.Sequential`` model compiled with the Adam optimizer,
        binary cross-entropy loss and an accuracy metric.
    """
    net = tf.keras.Sequential([
        tf.keras.layers.DenseFeatures(feature_columns),
        tf.keras.layers.Dense(128, name="dense1", activation=tf.nn.relu),
        tf.keras.layers.Dropout(dropout, name="dropout"),
        tf.keras.layers.Dense(128, name="dense2", activation=tf.nn.relu),
        # Two sigmoid units: the loss below scores each output independently.
        tf.keras.layers.Dense(2, name="output", activation=tf.nn.sigmoid),
    ])

    net.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return net

用いるカラムを選んでDenseFeaturesに渡す。 ageとfareはそのまま数値として扱うが、sexは[0, 1]のカテゴリーなので、その値をカテゴリーIDとして indicator_column()でone-hot vectorとして扱われるようにする。 fit()でパラメータを更新し、evaluate()で損失とmetricsを取得でき、predict()で推論できる。また、summary()でレイヤーやそのパラメータ数を表示できる。

# Columns fed to the model: age and fare as plain numbers, sex (values 0/1)
# as a 2-bucket identity category expanded to a one-hot indicator.
feature_columns = [
    tf.feature_column.numeric_column('age'),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_identity('sex', 2)),
    tf.feature_column.numeric_column('fare'),
]
m = model(feature_columns)

# For each of up to 100 dataset elements: fit on it, print the layer summary,
# then report loss/accuracy and the predicted class for that same element.
for features in ds_train.take(100):
    inputs = features['features']
    labels = tf.one_hot(features['survived'], 2)

    m.fit(inputs, labels)
    m.summary()

    loss, accuracy = m.evaluate(inputs, labels, verbose=0)
    print(f'loss={loss}, accuracy={accuracy}')

    predicted = m.predict(inputs, verbose=0)
    print(tf.argmax(predicted, axis=1))

...
4/4 [==============================] - 0s 3ms/step - loss: 0.8360 - accuracy: 0.6500
loss=0.5920160412788391, accuracy=0.7099999785423279
...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_features (DenseFeature multiple                  0         
_________________________________________________________________
dense1 (Dense)               multiple                  640       
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
dense2 (Dense)               multiple                  16512     
_________________________________________________________________
output (Dense)               multiple                  258       
=================================================================
Total params: 17,410
Trainable params: 17,410
Non-trainable params: 0
_________________________________________________________________
...
tf.Tensor(
[0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1], shape=(100,), dtype=int64)
1/1 [==============================] - 0s 647us/step - loss: 0.6114 - accuracy: 0.6667
loss=0.41255539655685425, accuracy=0.7777777910232544
tf.Tensor([0 0 0 0 0 0 0 0 1], shape=(9,), dtype=int64)