def build_graph(self, graph):
    np.random.seed(self.random_seed)
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        # Dims: batch_size x num_steps x state_size
        self.inputs = tf.placeholder(tf.float32, shape=[None, None, self.policy_params['nb_inputs']], name='inputs')
        input_shape = tf.shape(self.inputs)
        dynamic_batch_size, dynamic_num_steps = input_shape[0], input_shape[1]

        policy_scope = tf.VariableScope(reuse=False, name='Policy')
        with tf.variable_scope(policy_scope):
            # Flatten batch and time so the policy sees one state per row,
            # then restore the [batch, time, ...] layout afterwards
            policy_inputs = tf.reshape(self.inputs, [-1, self.policy_params['nb_inputs']])
            probs, actions = capacities.policy(self.policy_params, policy_inputs)
            self.probs = tf.reshape(probs, [dynamic_batch_size, dynamic_num_steps, self.policy_params['nb_outputs']])
            self.actions = tf.reshape(actions, [dynamic_batch_size, dynamic_num_steps, 1])
        self.action_t = self.actions[0, 0, 0]
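        # Note: `self.action_t` is the scalar action for a single-state feed
        # (batch_size == num_steps == 1), which is presumably how the agent
        # queries the policy while interacting with the environment.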
        with tf.variable_scope('Training'):
            self.rewards = tf.placeholder(tf.float32, shape=[None, None, 1], name="reward")
            self.mask_plh = tf.placeholder(tf.float32, shape=[None, None, 1], name="mask_plh")

            # Mean reward over the batch serves as a constant baseline,
            # reducing the variance of the policy gradient without biasing it
            baseline = tf.reduce_mean(self.rewards)

            batch_size, num_steps = tf.shape(self.actions)[0], tf.shape(self.actions)[1]
            # Build [batch, time, action] index triples so that gather_nd can
            # pick the probability of the action actually taken at each step
            line_indices = tf.matmul(  # Row indices: [[0, 0, ...], [1, 1, ...], ...]
                tf.reshape(tf.range(0, batch_size), [-1, 1]),
                tf.ones([1, num_steps], dtype=tf.int32)
            )
            column_indices = tf.matmul(  # Column indices: [[0, 1, ...], [0, 1, ...], ...]
                tf.ones([batch_size, 1], dtype=tf.int32),
                tf.reshape(tf.range(0, num_steps), [1, -1])
            )
            depth_indices = tf.cast(tf.squeeze(self.actions, 2), tf.int32)
            stacked_actions = tf.stack(
                [line_indices, column_indices, depth_indices], 2
            )
            log_probs = tf.expand_dims(tf.log(tf.gather_nd(self.probs, stacked_actions)), 2)
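            # Example: with batch_size=2 and num_steps=3, stacked_actions[i, t]
            # is [i, t, a] where a = actions[i, t, 0], so gather_nd returns
            # probs[i, t, a] for every (episode, step) pair at once.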
            # REINFORCE loss: sum the masked per-step terms over time (axis 1),
            # then average over the batch, i.e.
            # L = -1/B * sum_i sum_t mask[i,t] * log pi(a[i,t] | s[i,t]) * (r[i,t] - baseline)
            self.loss = tf.reduce_mean(-tf.reduce_sum((log_probs * (self.rewards - baseline)) * self.mask_plh, 1))

            adam = tf.train.AdamOptimizer(self.lr)
            self.global_step = tf.Variable(0, trainable=False, name="global_step", collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
            self.train_op = adam.minimize(self.loss, global_step=self.global_step)
        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('av_score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        # Episode counter and its increment op, from the capacities helper
        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

        # Playing part: separate summary for evaluation scores
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
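For context, a minimal training-step sketch (not from the original code) might look like the following. It assumes `agent` is an instance of the class defining `build_graph` above, with `policy_params`, `random_seed`, and `lr` already set, and uses dummy rollout data purely to illustrate the placeholder shapes:

import numpy as np
import tensorflow as tf

graph = tf.Graph()
agent.build_graph(graph)

# The init op must be created inside the same graph the session runs
with graph.as_default():
    init_op = tf.global_variables_initializer()

with tf.Session(graph=graph) as sess:
    sess.run(init_op)

    # Dummy batch: 4 episodes padded to 10 steps each
    batch_size, num_steps = 4, 10
    observations = np.random.randn(batch_size, num_steps, agent.policy_params['nb_inputs'])
    rewards = np.random.randn(batch_size, num_steps, 1)
    mask = np.ones((batch_size, num_steps, 1), dtype=np.float32)  # zero out padded steps in real use

    _, loss = sess.run([agent.train_op, agent.loss], feed_dict={
        agent.inputs: observations,
        agent.rewards: rewards,
        agent.mask_plh: mask,
    })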