def build_graph(self, actor, critic, cfg):
    self.ph_action = graph.Placeholder(np.float32, shape=(None, actor.action_size), name="ph_action")
    self.ph_advantage = graph.Placeholder(np.float32, shape=(None,), name="ph_adv")
    self.ph_discounted_reward = graph.Placeholder(np.float32, shape=(None,), name="ph_edr")

    mu, sigma2 = actor.node
    sigma2 += tf.constant(1e-8)  # keep the variance strictly positive to avoid log(0)

    # policy entropy
    self.entropy = -tf.reduce_mean(0.5 * (tf.log(2. * np.pi * sigma2) + 1.))

    # policy loss (calculation)
    b_size = tf.to_float(tf.size(self.ph_action.node) / actor.action_size)  # batch size
    log_pi = tf.log(sigma2)                        # log-variance per action dimension
    x_prec = tf.exp(-log_pi)                       # precision, 1 / sigma^2
    x_diff = tf.subtract(self.ph_action.node, mu)
    x_power = tf.square(x_diff) * x_prec * -0.5
    gaussian_nll = (tf.reduce_sum(log_pi, axis=1)
                    + b_size * tf.log(2. * np.pi)) / 2. - tf.reduce_sum(x_power, axis=1)
    self.policy_loss = -(tf.reduce_mean(gaussian_nll * self.ph_advantage.node) + cfg.entropy_beta * self.entropy)

    # value loss
    # (the critic's effective learning rate is scaled by the critic_scale parameter)
    self.value_loss = cfg.critic_scale * tf.reduce_mean(tf.square(self.ph_discounted_reward.node - critic.node))
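For reference, here is a minimal NumPy sketch that reproduces the same arithmetic outside the TensorFlow 1.x graph (tf.log, tf.to_float). The batch values and the entropy_beta / critic_scale numbers below are made up purely for illustration; graph.Placeholder and the actor/critic nodes from the class above are not needed here.

import numpy as np

# hypothetical batch: 3 samples, 2-dimensional continuous action space
mu = np.array([[0.1, -0.2], [0.0, 0.3], [0.5, 0.5]], dtype=np.float32)
sigma2 = np.array([[0.4, 0.9], [0.5, 0.5], [1.0, 0.2]], dtype=np.float32) + 1e-8
action = np.array([[0.0, 0.0], [0.1, 0.4], [0.6, 0.3]], dtype=np.float32)
advantage = np.array([1.0, -0.5, 2.0], dtype=np.float32)

b_size = action.size / action.shape[1]           # batch size, as in the graph above
log_pi = np.log(sigma2)                          # log-variance per dimension
x_prec = np.exp(-log_pi)                         # precision, 1 / sigma^2
x_power = -0.5 * np.square(action - mu) * x_prec
gaussian_nll = (log_pi.sum(axis=1) + b_size * np.log(2. * np.pi)) / 2. - x_power.sum(axis=1)

entropy = -np.mean(0.5 * (np.log(2. * np.pi * sigma2) + 1.))
entropy_beta = 0.01                              # assumed value for cfg.entropy_beta
policy_loss = -(np.mean(gaussian_nll * advantage) + entropy_beta * entropy)

value = np.array([0.2, 0.1, 0.4], dtype=np.float32)               # critic output (assumed)
discounted_reward = np.array([0.5, -0.1, 0.9], dtype=np.float32)  # assumed returns
critic_scale = 0.5                               # assumed value for cfg.critic_scale
value_loss = critic_scale * np.mean(np.square(discounted_reward - value))

print(policy_loss, value_loss)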