def build_graph(self, actor, critic, cfg):
    # placeholders for the sampled actions, estimated advantages and discounted returns
    self.ph_action = graph.Placeholder(np.int32, shape=(None,), name="ph_action")
    self.ph_advantage = graph.Placeholder(np.float32, shape=(None,), name="ph_adv")
    self.ph_discounted_reward = graph.Placeholder(np.float32, shape=(None,), name="ph_edr")
    action_one_hot = tf.one_hot(self.ph_action.node, actor.action_size)
    # clip probabilities to avoid NaN in the log
    log_pi = tf.log(tf.maximum(actor.node, 1e-20))
    # policy entropy
    self.entropy = -tf.reduce_sum(actor.node * log_pi)
    # policy loss
    self.policy_loss = -(tf.reduce_sum(tf.reduce_sum(log_pi * action_one_hot, axis=1) * self.ph_advantage.node)
                         + self.entropy * cfg.entropy_beta)
    # value loss
    self.value_loss = tf.reduce_sum(tf.square(self.ph_discounted_reward.node - critic.node))
    # the policy and value losses are summed into a single objective
    # (the critic's contribution is scaled by the critic_scale parameter)
    return self.policy_loss + cfg.critic_scale * self.value_loss
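For reference, the objective assembled above is the standard A3C loss. As a sketch in the code's notation, with $\beta$ standing for cfg.entropy_beta, $c$ for cfg.critic_scale, $A_t$ for ph_advantage, and $R_t$ for ph_discounted_reward:

$$
L = -\Big(\sum_t \log \pi(a_t \mid s_t)\, A_t + \beta\, H(\pi)\Big) + c \sum_t \big(R_t - V(s_t)\big)^2,
\qquad
H(\pi) = -\sum_t \sum_a \pi(a \mid s_t)\, \log \pi(a \mid s_t)
$$

Minimizing $L$ increases the log-probability of actions with positive advantage, while the entropy bonus discourages a prematurely deterministic policy.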