def build_graph(self, actor, critic, cfg):
    self.ph_action = graph.Placeholder(np.float32, shape=(None, actor.action_size), name="ph_action")
    self.ph_advantage = graph.Placeholder(np.float32, shape=(None,), name="ph_adv")
    self.ph_discounted_reward = graph.Placeholder(np.float32, shape=(None,), name="ph_edr")

    mu, sigma2 = actor.node
    sigma2 += tf.constant(1e-8)  # keep the variance strictly positive to avoid log(0)

    # policy entropy
    self.entropy = -tf.reduce_mean(0.5 * (tf.log(2. * np.pi * sigma2) + 1.))

    # policy loss (calculation)
    b_size = tf.to_float(tf.size(self.ph_action.node) / actor.action_size)  # batch size
    log_pi = tf.log(sigma2)                        # log-variance per action dimension
    x_prec = tf.exp(-log_pi)                       # precision, 1 / sigma^2
    x_diff = tf.subtract(self.ph_action.node, mu)
    x_power = tf.square(x_diff) * x_prec * -0.5
    gaussian_nll = (tf.reduce_sum(log_pi, axis=1)
                    + b_size * tf.log(2. * np.pi)) / 2. - tf.reduce_sum(x_power, axis=1)
    self.policy_loss = -(tf.reduce_mean(gaussian_nll * self.ph_advantage.node) + cfg.entropy_beta * self.entropy)

    # value loss
    # (the critic's effective learning rate is scaled by the critic_scale parameter)
    self.value_loss = cfg.critic_scale * tf.reduce_mean(tf.square(self.ph_discounted_reward.node - critic.node))
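For reference, here is a minimal NumPy sketch that reproduces the same arithmetic outside the TensorFlow 1.x graph (tf.log, tf.to_float). The batch values and the entropy_beta / critic_scale numbers below are made up purely for illustration; graph.Placeholder and the actor/critic nodes from the class above are not needed here.

import numpy as np

# hypothetical batch: 3 samples, 2-dimensional continuous action space
mu = np.array([[0.1, -0.2], [0.0, 0.3], [0.5, 0.5]], dtype=np.float32)
sigma2 = np.array([[0.4, 0.9], [0.5, 0.5], [1.0, 0.2]], dtype=np.float32) + 1e-8
action = np.array([[0.0, 0.0], [0.1, 0.4], [0.6, 0.3]], dtype=np.float32)
advantage = np.array([1.0, -0.5, 2.0], dtype=np.float32)

b_size = action.size / action.shape[1]           # batch size, as in the graph above
log_pi = np.log(sigma2)                          # log-variance per dimension
x_prec = np.exp(-log_pi)                         # precision, 1 / sigma^2
x_power = -0.5 * np.square(action - mu) * x_prec
gaussian_nll = (log_pi.sum(axis=1) + b_size * np.log(2. * np.pi)) / 2. - x_power.sum(axis=1)

entropy = -np.mean(0.5 * (np.log(2. * np.pi * sigma2) + 1.))
entropy_beta = 0.01                              # assumed value for cfg.entropy_beta
policy_loss = -(np.mean(gaussian_nll * advantage) + entropy_beta * entropy)

value = np.array([0.2, 0.1, 0.4], dtype=np.float32)               # critic output (assumed)
discounted_reward = np.array([0.5, -0.1, 0.9], dtype=np.float32)  # assumed returns
critic_scale = 0.5                               # assumed value for cfg.critic_scale
value_loss = critic_scale * np.mean(np.square(discounted_reward - value))

print(policy_loss, value_loss)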