def init_ops_for_training(self, critic):
    # the actor's gradients are the gradients of its output w.r.t. its vars, seeded
    # with the initial gradients provided by the critic. this requires that the critic
    # was built with input_action = actor.output_action (which is natural anyway).
    # we wrap the optimiser in its own variable scope since we don't want its
    # variables included in the copy to the target networks.
    # note that we negate the gradients from the critic since we are trying to
    # maximise the q values (not minimise them like a loss).
    with tf.variable_scope("optimiser"):
        # tf.negative was tf.neg in pre-1.0 tensorflow
        gradients = tf.gradients(self.output_action,
                                 self.trainable_model_vars(),
                                 tf.negative(critic.q_gradients_wrt_actions()))
        gradients = list(zip(gradients, self.trainable_model_vars()))
        # potentially clip gradients and wrap them with debugging ops;
        # opts is the module-level config (e.g. parsed command line flags)
        gradients = util.clip_and_debug_gradients(gradients, opts)
        # apply the (negated) gradients with plain sgd
        optimiser = tf.train.GradientDescentOptimizer(opts.actor_learning_rate)
        self.train_op = optimiser.apply_gradients(gradients)
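
For reference, the critic method consumed above, q_gradients_wrt_actions(), can be as small as a single tf.gradients call. Seeding the actor's tf.gradients with dQ/da chain-rules the critic's gradient through the actor, which is exactly the deterministic policy gradient. A minimal sketch, assuming the critic exposes self.q_value and self.input_action tensors (both names are illustrative):

def q_gradients_wrt_actions(self):
    # dQ/da, evaluated at a = actor.output_action; these become the initial
    # gradients that seed the actor's tf.gradients call above
    return tf.gradients(self.q_value, self.input_action)[0]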
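
util.clip_and_debug_gradients is project-specific; the name suggests norm clipping plus optional debug printing. A hypothetical sketch of such a helper (the gradient_clip and print_gradients option names are assumptions, not the project's actual flags):

def clip_and_debug_gradients(gradients, opts):
    # hypothetical sketch; gradients is a list of (gradient, variable) pairs
    if opts.gradient_clip is not None:
        # clip each gradient by its own norm when configured
        gradients = [(tf.clip_by_norm(g, opts.gradient_clip), v)
                     for g, v in gradients if g is not None]
    if opts.print_gradients:
        # wrap each gradient so its norm is printed at run time
        gradients = [(tf.Print(g, [tf.norm(g)], "grad norm " + v.name), v)
                     for g, v in gradients]
    return gradients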