def _create_train(self):
    with tf.variable_scope(self.scope):
        # Placeholders for the sampled actions, bootstrapped value targets
        # and advantage estimates collected by the worker.
        self.actions = tf.placeholder(
            shape=[None, self.action_size], dtype=tf.float32,
            name='actions')
        self.target_v = tf.placeholder(
            shape=[None], dtype=tf.float32, name='target_v')
        self.advantages = tf.placeholder(
            shape=[None], dtype=tf.float32, name='advantages')

        # Policy loss: log-probability of the taken actions weighted by the
        # advantages, plus an entropy bonus that encourages exploration.
        log_prob = self.normal_dist.log_prob(self.actions)
        exp_v = tf.transpose(
            tf.multiply(tf.transpose(log_prob), self.advantages))
        entropy = self.normal_dist.entropy()
        exp_v = 0.01 * entropy + exp_v
        self.policy_loss = tf.reduce_sum(-exp_v)

        # Value loss: squared error between the bootstrapped return
        # and the critic's value estimate.
        self.value_loss = 0.5 * tf.reduce_sum(
            tf.square(self.target_v - tf.reshape(self.value, [-1])))
        self.loss = 0.5 * self.value_loss + self.policy_loss

        # Compute gradients with respect to the local (worker) variables,
        # clip them by global norm, and apply them to the global network.
        local_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
        self.gradients = tf.gradients(self.loss, local_vars)
        self.var_norms = tf.global_norm(local_vars)
        grads, self.grad_norms = tf.clip_by_global_norm(
            self.gradients, 40.0)
        global_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
        self.apply_grads = self.trainer.apply_gradients(
            zip(grads, global_vars))
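
For context, a worker would run `apply_grads` (typically together with the loss and norm tensors) in a `tf.Session`, feeding the placeholders defined above with a rollout batch. The sketch below is only an illustration: the names `sess`, `local_net`, `batch_states`, `batch_actions`, `batch_target_v`, `batch_advantages`, and the `inputs` state placeholder are assumptions and do not appear in the code above.

# A minimal usage sketch, assuming a session, a local network instance,
# and rollout arrays that are not part of the original listing.
feed_dict = {
    local_net.inputs: batch_states,        # assumed state placeholder
    local_net.actions: batch_actions,
    local_net.target_v: batch_target_v,
    local_net.advantages: batch_advantages,
}
value_loss, policy_loss, grad_norms, _ = sess.run(
    [local_net.value_loss, local_net.policy_loss,
     local_net.grad_norms, local_net.apply_grads],
    feed_dict=feed_dict)

Because the gradients are taken with respect to the worker's local variables but applied to the variables under the 'global' scope, each such call pushes the worker's update to the shared network, which is the core of the A3C training scheme used here.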