def build_model(self):
    sc = predictron_arg_scope()

    with tf.variable_scope('state'):
        with slim.arg_scope(sc):
            state = slim.conv2d(self.inputs, 32, [3, 3], scope='conv1')
            state = layers.batch_norm(state, activation_fn=tf.nn.relu, scope='conv1/preact')
            state = slim.conv2d(state, 32, [3, 3], scope='conv2')
            state = layers.batch_norm(state, activation_fn=tf.nn.relu, scope='conv2/preact')
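    # make_template wraps iter_func so that every call below reuses one set
    # of variables: the same predictron core is applied at every depth.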
    iter_template = tf.make_template('iter', self.iter_func, unique_name_='iter')

    rewards_arr = []
    gammas_arr = []
    lambdas_arr = []
    values_arr = []

    for k in range(self.max_depth):
        state, reward, gamma, lambda_, value = iter_template(state)
        rewards_arr.append(reward)
        gammas_arr.append(gamma)
        lambdas_arr.append(lambda_)
        values_arr.append(value)
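    # One extra application of the shared core: only the value head v_K of
    # the final abstract state is kept.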
    _, _, _, _, value = iter_template(state)
    # K + 1 elements
    values_arr.append(value)
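    # Pad the time axis so that indexing matches the paper: prepend r_0 = 0
    # and gamma_0 = 1, so the 0-step preturn reduces to v_0.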
    bs = tf.shape(self.inputs)[0]
    # [batch_size, K * maze_size]
    self.rewards = tf.pack(rewards_arr, axis=1)
    # [batch_size, K, maze_size]
    self.rewards = tf.reshape(self.rewards, [bs, self.max_depth, self.maze_size])
    # [batch_size, K + 1, maze_size]
    self.rewards = tf.concat_v2(values=[tf.zeros(shape=[bs, 1, self.maze_size], dtype=tf.float32), self.rewards],
                                axis=1, name='rewards')

    # [batch_size, K * maze_size]
    self.gammas = tf.pack(gammas_arr, axis=1)
    # [batch_size, K, maze_size]
    self.gammas = tf.reshape(self.gammas, [bs, self.max_depth, self.maze_size])
    # [batch_size, K + 1, maze_size]
    self.gammas = tf.concat_v2(values=[tf.ones(shape=[bs, 1, self.maze_size], dtype=tf.float32), self.gammas],
                               axis=1, name='gammas')

    # [batch_size, K * maze_size]
    self.lambdas = tf.pack(lambdas_arr, axis=1)
    # [batch_size, K, maze_size]
    self.lambdas = tf.reshape(self.lambdas, [-1, self.max_depth, self.maze_size])

    # [batch_size, (K + 1) * maze_size]
    self.values = tf.pack(values_arr, axis=1)
    # [batch_size, K + 1, maze_size]
    self.values = tf.reshape(self.values, [-1, (self.max_depth + 1), self.maze_size])

    self.build_preturns()
    self.build_lambda_preturns()
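
# For reference, a minimal NumPy sketch (hypothetical, not part of the graph
# above) of the quantities that build_preturns and build_lambda_preturns
# assemble from these tensors, following Eqns (2) and (4) of the Predictron
# paper. rewards, gammas and values are indexed [K + 1, maze_size] to match
# the padded tensors built in build_model; lambdas is [K, maze_size].
import numpy as np

def preturns_sketch(rewards, gammas, values, K):
    # Eq. (2): g^k = r_1 + gamma_1 * (r_2 + ... + gamma_{k-1} * (r_k + gamma_k * v_k)),
    # with g^0 = v_0.
    g = []
    for k in range(K + 1):
        g_k = values[k]
        for kk in range(k, 0, -1):  # unroll from depth k back down to step 1
            g_k = rewards[kk] + gammas[kk] * g_k
        g.append(g_k)
    return np.stack(g)  # [K + 1, maze_size]

def lambda_preturn_sketch(rewards, gammas, lambdas, values, K):
    # Eq. (4), accumulated backwards from g^{lambda,K} = v_K:
    # g^{lambda,k} = (1 - lambda_k) * v_k
    #              + lambda_k * (r_{k+1} + gamma_{k+1} * g^{lambda,k+1})
    g_lam = values[K]
    for k in range(K - 1, -1, -1):
        g_lam = (1 - lambdas[k]) * values[k] \
                + lambdas[k] * (rewards[k + 1] + gammas[k + 1] * g_lam)
    return g_lam  # [maze_size]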