def _create_optimizer(self, args):
    # Negative log-likelihood of the true (demonstrated) actions under a
    # Gaussian policy with mean self.a_mean and log std self.a_logstd
    std_a = tf.exp(self.a_logstd)
    # Normalization constant: 0.5 * D * log(2*pi)
    pl_1 = 0.5 * tf.to_float(args.action_dim) * np.log(2. * np.pi)
    # Log standard deviation term
    pl_2 = tf.to_float(args.action_dim) * tf.reduce_sum(tf.log(std_a))
    # Squared, scaled error term, summed over action dimensions and
    # averaged over the batch
    pl_3 = 0.5 * tf.reduce_mean(tf.reduce_sum(
        tf.square((self.targets - self.a_mean) / std_a), 1))
    policy_loss = pl_1 + pl_2 + pl_3

    # The overall loss is just the policy loss
    self.cost = policy_loss
    self.summary_policy = tf.scalar_summary(
        "Policy loss", tf.reduce_mean(policy_loss))

    # Clip gradients by global norm and apply the Adam update
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, tvars), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.train = optimizer.apply_gradients(zip(grads, tvars))
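For reference, the three terms above are the usual decomposition of the negative log-likelihood of an action a under a diagonal Gaussian with mean mu (self.a_mean) and standard deviation sigma (exp(self.a_logstd)), over a D-dimensional action space (args.action_dim):

$$
-\log p(a \mid s) \;=\; \underbrace{\tfrac{D}{2}\log 2\pi}_{pl\_1} \;+\; \underbrace{\sum_{i=1}^{D} \log \sigma_i}_{pl\_2} \;+\; \underbrace{\tfrac{1}{2}\sum_{i=1}^{D} \left(\frac{a_i - \mu_i}{\sigma_i}\right)^2}_{pl\_3}
$$

Note that pl_2 in the code scales the summed log std by action_dim, which matches the middle term exactly when self.a_logstd is a single value shared across all action dimensions; pl_1 and pl_3 map onto the first and last terms directly, with pl_3 additionally averaged over the batch.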