DeepDreaming with TensorFlowをやる(2)

2016-09-10 tensorflow

前回の続き。

Multiscale image generation

様々なスケールで勾配上昇させる。小さなスケールで上昇させたものをより大きなスケールでさらに上昇させていく。ただ、壁紙のようなサイズを生成するような場合にそれを行うと、GPUのメモリを食いつぶしてしまう。これを避けるために、画像を小さなタイルに分割し、それぞれ独立に勾配を計算する。また、毎回画像をランダムにシフトしていくことで、タイルに見えることを避け、画像全体の品質を向上させる。

def tffunc(*argtypes):
    '''Build a decorator that turns a TF-graph building function into a
    regular callable.

    One placeholder is created per entry of ``argtypes``.  The decorated
    function is invoked once, right away, with those placeholders; the
    callable that replaces it feeds its positional arguments into the
    placeholders (in order) and evaluates the resulting tensor.  An optional
    ``session`` keyword is forwarded to ``eval``; any other keyword is
    ignored.  See the "resize" function below for an example.
    '''
    inputs = [tf.placeholder(dtype) for dtype in argtypes]

    def decorate(graph_fn):
        # Build the graph once, at decoration time.
        result = graph_fn(*inputs)

        def run(*args, **kw):
            feed = dict(zip(inputs, args))
            return result.eval(feed, session=kw.get('session'))

        return run

    return decorate

# TF-backed helper that resizes a single image
@tffunc(np.float32, np.int32)
def resize(img, size):
    '''Bilinearly resize one image to ``size``.

    The body runs once on placeholders (float32 image, int32 size); after
    decoration ``resize`` is called with concrete values.  A leading batch
    axis is added for ``tf.image.resize_bilinear`` and stripped again from
    the result.
    '''
    batched = tf.expand_dims(img, 0)
    resized = tf.image.resize_bilinear(batched, size)
    return resized[0, :, :, :]


def calc_grad_tiled(img, t_grad, tile_size=512):
    '''Evaluate tensor t_grad over the image one tile at a time.

    The image is rolled by a random offset (below tile_size) along both
    axes before tiling and the gradient is rolled back afterwards, so tile
    boundaries land in different places on every call.

    A trailing strip that is at most half a tile wide is not evaluated on
    this call; its gradient stays zero.

    Returns an array shaped like img holding the assembled gradient.
    '''
    rows, cols = img.shape[:2]
    shift_x, shift_y = np.random.randint(tile_size, size=2)
    shifted = np.roll(np.roll(img, shift_x, 1), shift_y, 0)
    grad = np.zeros_like(img)

    # Upper bounds for the tile origins; always allow at least one tile.
    y_stop = max(rows - tile_size // 2, tile_size)
    x_stop = max(cols - tile_size // 2, tile_size)

    for top in range(0, y_stop, tile_size):
        for left in range(0, x_stop, tile_size):
            tile = (slice(top, top + tile_size),
                    slice(left, left + tile_size))
            grad[tile] = sess.run(t_grad, {t_input: shifted[tile]})

    # Undo the random shift so the gradient lines up with img again.
    return np.roll(np.roll(grad, -shift_x, 1), -shift_y, 0)

tf.image.resize_bilinearは双線形補間によってリサイズする。

numpy.rollは、配列の要素を第二引数shiftの分だけ、第三引数axisで指定した軸に沿って巡回シフト(ローリング)する。 axisを指定しない場合は、配列を一次元に平坦化してシフトしたうえで、元の形状に戻される。

# 3x3 nested list used to demonstrate np.roll
hoge = [[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]]

# No axis: the data is flattened, shifted by one element, and reshaped back
print(np.roll(hoge, 1))
# [[8 0 1]
#  [2 3 4]
#  [5 6 7]]

# axis=0: every row moves down by one; the last row wraps around to the top
print(np.roll(hoge, 1, axis=0))
# [[6 7 8]
#  [0 1 2]
#  [3 4 5]]

# axis=1: every column moves right by one; the last column wraps to the front
print(np.roll(hoge, 1, axis=1))
# [[2 0 1]
#  [5 3 4]
#  [8 6 7]]

つまり、calc_grad_tiledでは、ランダムにローリングして、タイルに分割して勾配を求め、ローリングした分を戻して返している。これと、画像サイズをoctave_scale倍にしていく以外は前回やったのと基本的に同じだ。

def render_multiscale(t_obj, img0=img_noise, iter_n=10, step=1.0, octave_n=3, octave_scale=1.4):
    '''Run gradient ascent on t_obj over several progressively larger scales.

    Starting from a copy of img0, each octave performs iter_n ascent steps
    using tiled gradients normalized by their standard deviation.  From the
    second octave on, the image is first enlarged by octave_scale.  After
    every octave the output is cleared and the current image is displayed.
    Nothing is returned.
    '''
    # The objective is the mean of t_obj; differentiate it w.r.t. the input.
    t_score = tf.reduce_mean(t_obj)
    t_grad = tf.gradients(t_score, t_input)[0]

    canvas = img0.copy()
    for octave_idx in range(octave_n):
        if octave_idx > 0:
            target_hw = np.float32(canvas.shape[:2]) * octave_scale
            canvas = resize(canvas, np.int32(target_hw))
        for _ in range(iter_n):
            grad = calc_grad_tiled(canvas, t_grad)
            # Dividing by the std lets one step size serve different
            # layers and networks.
            grad /= grad.std() + 1e-8
            canvas += grad * step
            print('.', end=' ')
        clear_output()
        showarray(visstd(canvas))

# Objective: a single channel sliced from T(layer) (presumably the layer's
# output tensor; T, layer and channel are defined outside this section).
render_multiscale(T(layer)[:,:,:,channel])

Laplacian Pyramid Gradient Normalization

結果の画像には、高い周波数(ピクセルの変化の度合が高い)成分が多く含まれている。これを改善するための一つの方法として、毎回画像をぼかして高周波数を抑え、画像を滑らかにするものがある。ただ、この方法は良い画像にするためにより多くの繰り返しが必要になってしまう。逆に、低周波数のほうを強める方法もあり、ラプラシアンピラミッドを使って勾配を周波数帯ごとに正規化することで実現できる。

ラプラシアンピラミッドというのは、ガウシアンピラミッドにおける、ある解像度の画像と、その一つレベルの高い(縦横それぞれ1/2、画素数では1/2 * 1/2 = 1/4の)画像をアップサンプルしたものとの差分だ。

画像ピラミッド — OpenCV-Python Tutorials 1 documentation

# 1-D weights [1, 4, 6, 4, 1]; their outer product with themselves gives the
# 5x5 kernel shown below.
k = np.float32([1,4,6,4,1])
k = np.outer(k, k)
# [[  1.   4.   6.   4.   1.]
#  [  4.  16.  24.  16.   4.]
#  [  6.  24.  36.  24.   6.]
#  [  4.  16.  24.  16.   4.]
#  [  1.   4.   6.   4.   1.]]

# Normalize the kernel so it sums to 1, then broadcast the (5, 5, 1, 1) view
# against a 3x3 identity.  The result has shape (5, 5, 3, 3) and is zero
# wherever the last two indices differ.
k5x5 = k[:,:,None,None]/k.sum()*np.eye(3, dtype=np.float32)

# Length of the first axis (kernel rows)
print(len(k5x5))
# 5

# First kernel row: one 3x3 diagonal matrix per kernel column
print(k5x5[0])
# [[[ 0.00390625  0.          0.        ]
#  [ 0.          0.00390625  0.        ]
#  [ 0.          0.          0.00390625]]

# [[ 0.015625    0.          0.        ]
#  [ 0.          0.015625    0.        ]
#  [ 0.          0.          0.015625  ]]

# [[ 0.0234375   0.          0.        ]
#  [ 0.          0.0234375   0.        ]
#  [ 0.          0.          0.0234375 ]]

# [[ 0.015625    0.          0.        ]
#  [ 0.          0.015625    0.        ]
#  [ 0.          0.          0.015625  ]]

# [[ 0.00390625  0.          0.        ]
#  [ 0.          0.00390625  0.        ]
#  [ 0.          0.          0.00390625]]]

numpy.outerは2つのベクトルの直積(outer product)を求めるもので、 numpy.eyeは対角成分が1で、それ以外は0の2次元行列を返す。 numpy.eyeの引数k(上のコードの変数kとは別物)を指定すると、1が並ぶ対角線の位置をずらせるが、指定していない場合はNxNの単位行列が返ることになる。このフィルターをストライド2で畳み込むことで、ガウシアンピラミッドの1レベル高い(縦横がそれぞれ1/2の)画像に変換できる。 tf.nn.conv2d_transposeは畳み込みの逆処理のようなもので、これでアップサンプルした画像と元画像の差分を取って、ラプラシアンピラミッドを生成している。

def lap_split(img):
    '''Separate the image into a low- and a high-frequency component.

    The low band is img filtered with k5x5 at stride 2.  The high band is
    what remains after the low band is brought back to img's shape with a
    transposed convolution and subtracted from img.

    Returns the pair (lo, hi).
    '''
    stride = [1, 2, 2, 1]
    with tf.name_scope('split'):
        low = tf.nn.conv2d(img, k5x5, stride, 'SAME')
        # The kernel is scaled by 4 for the stride-2 upsampling step.
        restored = tf.nn.conv2d_transpose(low, k5x5 * 4, tf.shape(img), stride)
        high = img - restored
    return low, high

def lap_split_n(img, n):
    '''Build a Laplacian pyramid by splitting the image n times.

    Returns a list of n + 1 levels: the final low-frequency residual first,
    followed by the high-frequency bands in the reverse of the order in
    which they were split off.
    '''
    highs = []
    low = img
    for _ in range(n):
        low, high = lap_split(low)
        highs.append(high)
    return [low] + highs[::-1]

def lap_merge(levels):
    '''Collapse a Laplacian pyramid back into a single image.

    levels[0] is the starting image.  For each following level the running
    result is upsampled to that level's shape with a transposed convolution
    and the level is added on top.
    '''
    merged, details = levels[0], levels[1:]
    for detail in details:
        with tf.name_scope('merge'):
            upsampled = tf.nn.conv2d_transpose(
                merged, k5x5 * 4, tf.shape(detail), [1, 2, 2, 1])
            merged = upsampled + detail
    return merged

def normalize_std(img, eps=1e-10):
    '''Scale the image by the inverse of its root-mean-square value.

    The divisor is sqrt(mean(img ** 2)) with no mean subtraction, so it
    matches the standard deviation only when img has zero mean.  It is
    clamped from below by eps so the division stays well-defined.
    '''
    with tf.name_scope('normalize'):
        rms = tf.sqrt(tf.reduce_mean(tf.square(img)))
        divisor = tf.maximum(rms, eps)
        return img / divisor

def lap_normalize(img, scale_n=4):
    '''Apply Laplacian pyramid normalization to a single image.

    A batch axis is added, the image is split into scale_n + 1 pyramid
    levels, each level is passed through normalize_std, and the levels are
    merged back.  The batch axis is removed from the returned tensor.
    '''
    batched = tf.expand_dims(img, 0)
    pyramid = lap_split_n(batched, scale_n)
    normalized = [normalize_std(level) for level in pyramid]
    return lap_merge(normalized)[0, :, :, :]

lap_normalizeで画像からラプラシアンピラミッドを生成し、それぞれで正規化してからマージして元の画像に戻す処理をしている。

def render_lapnorm(t_obj, img0=img_noise, visfunc=visstd,
                   iter_n=10, step=1.0, octave_n=3, octave_scale=1.4, lap_n=4):
    '''Multiscale gradient ascent with Laplacian-pyramid gradient normalization.

    Works like render_multiscale, except that each tiled gradient is passed
    through lap_normalize (with lap_n pyramid splits) before being applied,
    and the image shown after every octave is first run through visfunc.
    Operates on a copy of img0 and returns nothing.
    '''
    # The objective is the mean of t_obj; differentiate it w.r.t. the input.
    t_score = tf.reduce_mean(t_obj)
    t_grad = tf.gradients(t_score, t_input)[0]
    # Wrap the normalization graph so it can be called on plain arrays.
    normalize = tffunc(np.float32)(partial(lap_normalize, scale_n=lap_n))

    canvas = img0.copy()
    for octave_idx in range(octave_n):
        if octave_idx > 0:
            target_hw = np.float32(canvas.shape[:2]) * octave_scale
            canvas = resize(canvas, np.int32(target_hw))
        for _ in range(iter_n):
            grad = normalize(calc_grad_tiled(canvas, t_grad))
            canvas += grad * step
            print('.', end=' ')
        clear_output()
        showarray(visfunc(canvas))

# Same objective as before: a single channel sliced from T(layer) (presumably
# the layer's output tensor; T, layer and channel are defined outside this
# section).
render_lapnorm(T(layer)[:,:,:,channel])

DeepDream

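で、これがDeepDreamのアルゴリズム。まずresizeで画像を縮小しながら、各オクターブの高周波成分(縮小前の画像と、縮小してから拡大し直した画像との差分)を取り出しておく。これはラプラシアンピラミッドと同じ考え方だ。そして、一番小さい画像から順に勾配上昇を行い、次のオクターブへ拡大する際に、取っておいたそのレベルの高周波成分を足していっている。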

def render_deepdream(t_obj, img0=img_noise,
                     iter_n=10, step=1.5, octave_n=4, octave_scale=1.4):
    '''Run the DeepDream gradient ascent on an image, octave by octave.

    The image is first decomposed into octave_n - 1 detail layers by
    repeatedly shrinking it by octave_scale and keeping the difference
    between each image and its shrunk-then-enlarged version.  Gradient
    ascent then starts on the smallest image; before each later octave the
    image is enlarged and the matching detail layer is added back.

    Parameters:
        t_obj: tensor whose mean is maximized.
        img0: starting image array; it is copied and never modified.
        iter_n: number of ascent steps per octave.
        step: step size, applied to the gradient divided by its mean
            absolute value.
        octave_n: number of octaves (scales).
        octave_scale: size ratio between consecutive octaves.

    The output is cleared and the current image (divided by 255) is shown
    after each octave.  Nothing is returned.
    '''
    t_score = tf.reduce_mean(t_obj) # defining the optimization objective
    t_grad = tf.gradients(t_score, t_input)[0] # gradient of the objective w.r.t. the input

    # Work on a private copy.  With octave_n == 1 the split loop below never
    # rebinds img, so the in-place ascent would otherwise overwrite the
    # caller's array (by default the shared img_noise).
    img = img0.copy()

    # split the image into a number of octaves
    octaves = []
    for _ in range(octave_n-1):
        hw = img.shape[:2]
        lo = resize(img, np.int32(np.float32(hw)/octave_scale))
        hi = img-resize(lo, hw)
        img = lo
        octaves.append(hi)

    # generate details octave by octave
    for octave in range(octave_n):
        if octave>0:
            # Detail layers were stored largest first, so index from the end.
            hi = octaves[-octave]
            img = resize(img, hi.shape[:2])+hi
        for _ in range(iter_n):
            g = calc_grad_tiled(img, t_grad)
            # Normalize by the mean absolute gradient; the epsilon guards
            # against division by zero.
            img += g*(step / (np.abs(g).mean()+1e-7))
            print('.',end = ' ')
        clear_output()
        showarray(img/255.0)