def _build_loss(self):
    """Build the A3C actor-critic loss from the policy head (self.pf) and the value head (self.vf)."""
    # Placeholders for a sampled batch: discounted returns, actions taken, and advantage estimates.
    self.rewards = tf.placeholder(tf.float32, [None])
    self.actions = tf.placeholder(tf.uint8, [None])
    self.adv = tf.placeholder(tf.float32, [None], name="adv")

    # log pi(a|s) for the actions that were actually taken.
    a_one_hot = tf.one_hot(self.actions, self.action_dim)
    log_prob = tf.log(self.pf + 1e-6)  # epsilon keeps log(0) out of the graph
    log_pi_a_given_s = tf.reduce_sum(log_prob * a_one_hot, 1)

    # Policy-gradient term: increase log pi(a|s) in proportion to the advantage.
    policy_loss = -tf.reduce_sum(log_pi_a_given_s * self.adv)
    # Value-regression term: 0.5 * sum((V(s) - R)^2).
    value_loss = tf.nn.l2_loss(self.vf - self.rewards)
    # pf * log_prob sums to the negative entropy, so adding it with a positive,
    # linearly annealed coefficient rewards exploration early in training.
    entropy_beta = linear_decrise_op(self.eb, self.global_step, 'entropy_beta')
    xentropy_loss = tf.reduce_sum(self.pf * log_prob)

    self.total_loss = policy_loss + 0.5 * value_loss + entropy_beta * xentropy_loss

    # Batch size is only used to report per-sample losses in the summaries below.
    batch_size = tf.cast(tf.shape(self.rewards)[0], tf.float32)
    # self.total_loss = tf.truediv(self.total_loss, batch_size, name='total_loss')
    self.for_summary_scalar += [
        tf.reduce_mean(self.adv, name='adv'),
        tf.reduce_mean(self.vf, name='value_mean'),
        tf.reduce_mean(log_pi_a_given_s, name='log_p_mean'),
        tf.reduce_mean(self.rewards, name='true_value_mean'),
        tf.identity(policy_loss / batch_size, name='policy_loss'),
        tf.identity(value_loss / batch_size, name='value_loss'),
        tf.identity((entropy_beta * xentropy_loss) / batch_size, name='entropy_loss'),
        entropy_beta,
        # self.lr,
        tf.identity(self.total_loss, name='total_loss'),
    ]
    self.for_summary_hist += [tf.argmax(self.pf, axis=1, name='action_predicted')]
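For context, here is a minimal, self-contained sketch of how a loss built this way is typically attached to an optimizer and fed at training time. It assumes TensorFlow 1.x; the `states` input, the small dense policy/value heads, and the fixed entropy coefficient and learning rate are illustrative stand-ins, not values taken from this code.

import numpy as np
import tensorflow as tf

action_dim = 4
states = tf.placeholder(tf.float32, [None, 8], name='states')
rewards = tf.placeholder(tf.float32, [None], name='rewards')
actions = tf.placeholder(tf.uint8, [None], name='actions')
adv = tf.placeholder(tf.float32, [None], name='adv')

# Stand-in policy/value heads; the real model builds these elsewhere.
hidden = tf.layers.dense(states, 32, activation=tf.nn.relu)
pf = tf.nn.softmax(tf.layers.dense(hidden, action_dim))  # action probabilities
vf = tf.squeeze(tf.layers.dense(hidden, 1), axis=1)      # state-value estimate

a_one_hot = tf.one_hot(actions, action_dim)
log_prob = tf.log(pf + 1e-6)
log_pi_a_given_s = tf.reduce_sum(log_prob * a_one_hot, 1)

policy_loss = -tf.reduce_sum(log_pi_a_given_s * adv)
value_loss = tf.nn.l2_loss(vf - rewards)
neg_entropy = tf.reduce_sum(pf * log_prob)  # equals minus the policy entropy
total_loss = policy_loss + 0.5 * value_loss + 0.01 * neg_entropy  # fixed beta for brevity

# Single local optimizer step; A3C proper would compute gradients on a worker
# copy of the network and apply them to the shared parameters instead.
train_op = tf.train.RMSPropOptimizer(learning_rate=7e-4).minimize(total_loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch = {
        states: np.random.randn(5, 8).astype(np.float32),
        rewards: np.random.randn(5).astype(np.float32),
        actions: np.random.randint(0, action_dim, size=5).astype(np.uint8),
        adv: np.random.randn(5).astype(np.float32),
    }
    loss_val, _ = sess.run([total_loss, train_op], feed_dict=batch)
    print('loss:', loss_val)

The feed_dict shape here mirrors the placeholders declared in _build_loss: one scalar return, one action index, and one advantage per state in the batch.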