def init_ops_for_training(self, target_critic):
# update critic using bellman equation; Q(s1, a) = reward + discount * Q(s2, A(s2))
# left hand side of bellman is just q_value, but let's be explicit about it...
bellman_lhs = self.q_value
# right hand side is ...
# = reward + discounted q value from target actor & critic, in the non-terminal case
# = reward, in the terminal case
self.reward = tf.placeholder(shape=[None, 1], dtype=tf.float32, name="critic_reward")
self.terminal_mask = tf.placeholder(shape=[None, 1], dtype=tf.float32,
name="critic_terminal_mask")
self.input_state_2 = target_critic.input_state
bellman_rhs = self.reward + (self.terminal_mask * opts.discount * target_critic.q_value)
# note: since we are NOT training target networks we stop gradients flowing to them
bellman_rhs = tf.stop_gradient(bellman_rhs)
# the value we are trying to minimise is the difference between these two: the
# temporal difference. we use a squared loss for optimisation and, as for the actor,
# we wrap the optimiser in a namespace so it's not picked up by target network
# variable handling.
self.temporal_difference = bellman_lhs - bellman_rhs
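# the loss is the mean of this squared temporal difference over the batch (i.e. MSE)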
self.temporal_difference_loss = tf.reduce_mean(tf.pow(self.temporal_difference, 2))
# self.temporal_difference_loss = tf.Print(self.temporal_difference_loss, [self.temporal_difference_loss], 'temporal_difference_loss')
with tf.variable_scope("optimiser"):
# calc gradients
optimiser = tf.train.GradientDescentOptimizer(opts.critic_learning_rate)
gradients = optimiser.compute_gradients(self.temporal_difference_loss)
# potentially clip and wrap with debugging tf.Print
gradients = util.clip_and_debug_gradients(gradients, opts)
# apply
self.train_op = optimiser.apply_gradients(gradients)
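# A minimal usage sketch (not from the original code): run the training op on one
# replay buffer batch. The critic's own state/action placeholders (assumed here to
# be input_state and input_action) and the batch field names are assumptions, not
# part of the method above.
def train_critic_on_batch(sess, critic, batch):
  # one gradient descent step on the mean squared temporal difference
  loss, _ = sess.run([critic.temporal_difference_loss, critic.train_op],
                     feed_dict={critic.input_state: batch.state_1,
                                critic.input_action: batch.action,
                                critic.reward: batch.reward,
                                critic.terminal_mask: batch.terminal_mask,
                                critic.input_state_2: batch.state_2})
  return loss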