def _create_train(self):
    with tf.variable_scope(self.scope):
        # Placeholders for the sampled actions, bootstrapped value targets
        # and advantage estimates collected by the worker.
        self.actions = tf.placeholder(
            shape=[None, self.action_size], dtype=tf.float32,
            name='actions')
        self.target_v = tf.placeholder(
            shape=[None], dtype=tf.float32, name='target_v')
        self.advantages = tf.placeholder(
            shape=[None], dtype=tf.float32, name='advantages')

        # Policy loss: log-probability of the taken actions weighted by the
        # advantages, plus an entropy bonus that encourages exploration.
        log_prob = self.normal_dist.log_prob(self.actions)
        exp_v = tf.transpose(
            tf.multiply(tf.transpose(log_prob), self.advantages))
        entropy = self.normal_dist.entropy()
        exp_v = 0.01 * entropy + exp_v
        self.policy_loss = tf.reduce_sum(-exp_v)

        # Value loss: squared error between the bootstrapped return
        # and the critic's value estimate.
        self.value_loss = 0.5 * tf.reduce_sum(
            tf.square(self.target_v - tf.reshape(self.value, [-1])))
        self.loss = 0.5 * self.value_loss + self.policy_loss

        # Compute gradients with respect to the local (worker) variables,
        # clip them by global norm, and apply them to the global network.
        local_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
        self.gradients = tf.gradients(self.loss, local_vars)
        self.var_norms = tf.global_norm(local_vars)
        grads, self.grad_norms = tf.clip_by_global_norm(
            self.gradients, 40.0)
        global_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
        self.apply_grads = self.trainer.apply_gradients(
            zip(grads, global_vars))
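
For context, a worker would run `apply_grads` (typically together with the loss and norm tensors) in a `tf.Session`, feeding the placeholders defined above with a rollout batch. The sketch below is only an illustration: the names `sess`, `local_net`, `batch_states`, `batch_actions`, `batch_target_v`, `batch_advantages`, and the `inputs` state placeholder are assumptions and do not appear in the code above.

# A minimal usage sketch, assuming a session, a local network instance,
# and rollout arrays that are not part of the original listing.
feed_dict = {
    local_net.inputs: batch_states,        # assumed state placeholder
    local_net.actions: batch_actions,
    local_net.target_v: batch_target_v,
    local_net.advantages: batch_advantages,
}
value_loss, policy_loss, grad_norms, _ = sess.run(
    [local_net.value_loss, local_net.policy_loss,
     local_net.grad_norms, local_net.apply_grads],
    feed_dict=feed_dict)

Because the gradients are taken with respect to the worker's local variables but applied to the variables under the 'global' scope, each such call pushes the worker's update to the shared network, which is the core of the A3C training scheme used here.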