def build_graph(self, actor, critic, cfg):
    # Placeholders for the sampled actions, advantages and discounted returns.
    self.ph_action = graph.Placeholder(np.float32, shape=(None, actor.action_size), name="ph_action")
    self.ph_advantage = graph.Placeholder(np.float32, shape=(None,), name="ph_adv")
    self.ph_discounted_reward = graph.Placeholder(np.float32, shape=(None,), name="ph_edr")

    # The actor outputs the mean and spread of a Gaussian policy; the formulas
    # below treat `sigma2` as the standard deviation.
    mu, sigma2 = actor.node
    sigma2 += tf.constant(1e-8)  # keep the log and the division numerically safe

    # Differential entropy of a Gaussian: log(sigma) + 0.5 * log(2 * pi * e)
    log_std_dev = tf.log(sigma2)
    self.entropy = tf.reduce_mean(log_std_dev + tf.constant(0.5 * np.log(2. * np.pi * np.e), tf.float32))

    # Per-dimension Gaussian log-probability of the taken action:
    # -(a - mu)^2 / (2 * sigma^2) - 0.5 * log(2 * pi) - log(sigma)
    l2_dist = tf.square(self.ph_action.node - mu)
    sqr_std_dev = tf.constant(2.) * tf.square(sigma2) + tf.constant(1e-6)
    log_prob = -l2_dist / sqr_std_dev - tf.constant(.5) * tf.log(tf.constant(2 * np.pi)) - log_std_dev

    # Policy-gradient loss with an entropy bonus weighted by entropy_beta.
    self.policy_loss = -(tf.reduce_mean(tf.reduce_sum(log_prob, axis=1) * self.ph_advantage.node)
                         + cfg.entropy_beta * self.entropy)

    # The critic (value) loss is weighted by the critic_scale parameter,
    # which effectively scales the critic's learning rate.
    self.value_loss = cfg.critic_scale * tf.reduce_mean(tf.square(self.ph_discounted_reward.node - critic.node))
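For reference, the per-dimension log-probability and entropy built in the graph above can be reproduced outside TensorFlow with plain NumPy and SciPy. This is only an illustrative check; the values of `mu`, `sigma` and `action` below are made up.

```python
# Stand-alone check of the Gaussian formulas used in the graph above.
import numpy as np
from scipy.stats import norm

mu, sigma, action = 0.3, 0.8, 0.5  # hypothetical values for illustration

# Per-dimension Gaussian log-probability, as computed in the graph:
log_prob = (-(action - mu) ** 2 / (2. * sigma ** 2)
            - 0.5 * np.log(2. * np.pi) - np.log(sigma))

# Differential entropy: log(sigma) + 0.5 * log(2 * pi * e)
entropy = np.log(sigma) + 0.5 * np.log(2. * np.pi * np.e)

# Both match SciPy's closed-form results.
assert np.isclose(log_prob, norm(loc=mu, scale=sigma).logpdf(action))
assert np.isclose(entropy, norm(loc=mu, scale=sigma).entropy())
```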