loss.py source code

python

Project: relaax · Author: deeplearninc
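# NOTE: excerpted from the project's loss.py; it assumes numpy (np), tensorflow (tf)
# and the relaax graph helper module are imported at the top of the file.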
def build_graph(self, actor, critic, cfg):
        self.ph_action = graph.Placeholder(np.float32, shape=(None, actor.action_size), name="ph_action")
        self.ph_advantage = graph.Placeholder(np.float32, shape=(None,), name="ph_adv")
        self.ph_discounted_reward = graph.Placeholder(np.float32, shape=(None,), name="ph_edr")

        mu, sigma2 = actor.node
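        # add a small epsilon so the variance stays strictly positive under the log below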
        sigma2 += tf.constant(1e-8)

        # policy entropy
        self.entropy = -tf.reduce_mean(0.5 * (tf.log(2. * np.pi * sigma2) + 1.))

        # policy loss (calculation)
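        # b_size: number of samples in the batch (ph_action is batch_size x action_size)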
        b_size = tf.to_float(tf.size(self.ph_action.node) / actor.action_size)
        log_pi = tf.log(sigma2)
        x_prec = tf.exp(-log_pi)
        x_diff = tf.subtract(self.ph_action.node, mu)
        x_power = tf.square(x_diff) * x_prec * -0.5
        gaussian_nll = (tf.reduce_sum(log_pi, axis=1)
                        + b_size * tf.log(2. * np.pi)) / 2. - tf.reduce_sum(x_power, axis=1)

        self.policy_loss = -(tf.reduce_mean(gaussian_nll * self.ph_advantage.node) + cfg.entropy_beta * self.entropy)

        # value loss
        # (Learning rate for the Critic is sized by critic_scale parameter)
        self.value_loss = cfg.critic_scale * tf.reduce_mean(tf.square(self.ph_discounted_reward.node - critic.node))
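
For reference, the policy terms above are built from the negative log-likelihood and differential entropy of a diagonal Gaussian. The minimal NumPy sketch below spells out those two standard formulas and checks the NLL against scipy; the helper names (diag_gaussian_nll, diag_gaussian_entropy) are illustrative only and not part of relaax.

import numpy as np
from scipy.stats import norm

def diag_gaussian_nll(action, mu, sigma2):
    # -log N(a | mu, sigma^2) = 0.5 * sum_i [ log(2*pi*sigma2_i) + (a_i - mu_i)^2 / sigma2_i ]
    return 0.5 * np.sum(np.log(2. * np.pi * sigma2) + np.square(action - mu) / sigma2, axis=1)

def diag_gaussian_entropy(sigma2):
    # differential entropy of a diagonal Gaussian; the same 0.5 * (log(2*pi*sigma2) + 1)
    # term appears in the entropy line of the loss above
    return 0.5 * np.sum(np.log(2. * np.pi * sigma2) + 1., axis=1)

# sanity check on a random batch of 4 samples with 3 action dimensions
rng = np.random.default_rng(0)
action = rng.normal(size=(4, 3))
mu = rng.normal(size=(4, 3))
sigma2 = rng.uniform(0.5, 2.0, size=(4, 3))

nll = diag_gaussian_nll(action, mu, sigma2)
ref = -norm.logpdf(action, loc=mu, scale=np.sqrt(sigma2)).sum(axis=1)
assert np.allclose(nll, ref)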