def __init__(self, name, inputs, conv_outputs, reward_scaling, config):
    """Build the value and policy heads on top of ``conv_outputs``.

    All variables and ops are created inside ``tf.variable_scope(name)``.
    A 256-unit ReLU dense layer feeds two heads: a single-unit value head
    and a ``config.num_actions``-unit policy head.

    Args:
        name: Name of the variable scope wrapping every layer created here.
        inputs: Object exposing an ``alive`` attribute that is multiplied
            into the unnormalized value. Presumably a mask that zeroes the
            value of finished episodes -- confirm against the caller.
        conv_outputs: Tensor fed to the shared hidden dense layer.
        reward_scaling: Object providing ``unnormalize_output``, applied to
            the raw value-head output.
        config: Object providing ``num_actions``, the width of the policy
            head.

    Attributes set:
        value: Unnormalized value multiplied by ``inputs.alive``, with
            axis 1 squeezed out.
        policy: Softmax over the action logits.
        log_policy: Log-softmax over the action logits.
        greedy_action: One action per row drawn from the policy.
    """
    with tf.variable_scope(name):
        # Shared trunk used by both heads.
        hidden_layer = tf.layers.dense(
            inputs=conv_outputs,
            units=256,
            activation=tf.nn.relu,
            name='hidden')

        # Value head: one linear unit, mapped back through reward_scaling
        # and multiplied by inputs.alive.
        raw_value = tf.layers.dense(inputs=hidden_layer, units=1)
        alive = inputs.alive
        unnormalized_value = reward_scaling.unnormalize_output(raw_value)
        self.value = tf.squeeze(
            alive * unnormalized_value, axis=1, name='value')

        # Policy head: linear logits, exposed as probabilities and
        # log-probabilities.
        action_logits = tf.layers.dense(
            inputs=hidden_layer, units=config.num_actions, name='actions')
        self.policy = tf.nn.softmax(action_logits, name='policy')
        self.log_policy = tf.nn.log_softmax(action_logits, name='log_policy')

        # NOTE: despite the attribute name, the action is *sampled* from the
        # policy with tf.multinomial rather than chosen by argmax. The
        # squeeze drops the num_samples axis of the multinomial output.
        sampled_action = tf.multinomial(self.log_policy, num_samples=1)
        self.greedy_action = tf.squeeze(
            sampled_action, axis=1, name='greedy_action')
评论列表
文章目录