def update_policy(self, ob_no, ac_n, std_adv_n, stepsize):
"""
The input is the same for the discrete control case, except we return a
single log standard deviation vector in addition to our logits. In this
case, the logits are really the mean vector of Gaussians, which differs
among components (observations) in the minbatch. We return the *old*
ones since they are assigned, then `self.update_op` runs, which makes
them outdated.
"""
feed = {self.ob_no: ob_no,
self.ac_na: ac_n,
self.adv_n: std_adv_n,
self.stepsize: stepsize}
_, surr_loss, oldmean_na, oldlogstd_a = self.sess.run(
[self.update_op, self.surr_loss, self.mean_na, self.logstd_a],
feed_dict=feed)
return surr_loss, oldmean_na, oldlogstd_a
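
# The old mean and log std returned above are typically used to check how far
# the update moved the policy, e.g. via the KL divergence between the old and
# new diagonal Gaussians. A minimal NumPy sketch of that computation follows;
# the helper name `gauss_kl_diag` and the argument shapes (mean: n x a,
# logstd: a) are illustrative assumptions, not part of this class.
import numpy as np

def gauss_kl_diag(mean0_na, logstd0_a, mean1_na, logstd1_a):
    """Average KL( N(mean0, exp(logstd0)^2) || N(mean1, exp(logstd1)^2) )
    over the rows (observations) of the minibatch."""
    std0_a = np.exp(logstd0_a)
    std1_a = np.exp(logstd1_a)
    # Per-dimension KL, summed over action dimensions for each observation.
    kl_n = np.sum(
        (logstd1_a - logstd0_a)
        + (np.square(std0_a) + np.square(mean0_na - mean1_na))
          / (2.0 * np.square(std1_a))
        - 0.5,
        axis=1)
    return float(np.mean(kl_n))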