def build_graph(self, actor, critic, cfg):
    # Placeholders for the sampled actions, advantages and discounted returns.
    self.ph_action = graph.Placeholder(np.float32, shape=(None, actor.action_size), name="ph_action")
    self.ph_advantage = graph.Placeholder(np.float32, shape=(None,), name="ph_adv")
    self.ph_discounted_reward = graph.Placeholder(np.float32, shape=(None,), name="ph_edr")

    # The actor outputs the mean and spread of a Gaussian policy; the formulas
    # below treat `sigma2` as the standard deviation.
    mu, sigma2 = actor.node
    sigma2 += tf.constant(1e-8)  # keep the log and the division numerically safe

    # Differential entropy of a Gaussian: log(sigma) + 0.5 * log(2 * pi * e)
    log_std_dev = tf.log(sigma2)
    self.entropy = tf.reduce_mean(log_std_dev + tf.constant(0.5 * np.log(2. * np.pi * np.e), tf.float32))

    # Per-dimension Gaussian log-probability of the taken action:
    # -(a - mu)^2 / (2 * sigma^2) - 0.5 * log(2 * pi) - log(sigma)
    l2_dist = tf.square(self.ph_action.node - mu)
    sqr_std_dev = tf.constant(2.) * tf.square(sigma2) + tf.constant(1e-6)
    log_prob = -l2_dist / sqr_std_dev - tf.constant(.5) * tf.log(tf.constant(2 * np.pi)) - log_std_dev

    # Policy-gradient loss with an entropy bonus weighted by entropy_beta.
    self.policy_loss = -(tf.reduce_mean(tf.reduce_sum(log_prob, axis=1) * self.ph_advantage.node)
                         + cfg.entropy_beta * self.entropy)

    # The critic (value) loss is weighted by the critic_scale parameter,
    # which effectively scales the critic's learning rate.
    self.value_loss = cfg.critic_scale * tf.reduce_mean(tf.square(self.ph_discounted_reward.node - critic.node))
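For reference, the per-dimension log-probability and entropy built in the graph above can be reproduced outside TensorFlow with plain NumPy and SciPy. This is only an illustrative check; the values of `mu`, `sigma` and `action` below are made up.

```python
# Stand-alone check of the Gaussian formulas used in the graph above.
import numpy as np
from scipy.stats import norm

mu, sigma, action = 0.3, 0.8, 0.5  # hypothetical values for illustration

# Per-dimension Gaussian log-probability, as computed in the graph:
log_prob = (-(action - mu) ** 2 / (2. * sigma ** 2)
            - 0.5 * np.log(2. * np.pi) - np.log(sigma))

# Differential entropy: log(sigma) + 0.5 * log(2 * pi * e)
entropy = np.log(sigma) + 0.5 * np.log(2. * np.pi * np.e)

# Both match SciPy's closed-form results.
assert np.isclose(log_prob, norm(loc=mu, scale=sigma).logpdf(action))
assert np.isclose(entropy, norm(loc=mu, scale=sigma).entropy())
```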