# Assumes module-level imports: `import chainer` and
# `import chainer.functions as F` (Chainer's functional API).
def _compute_y_and_t(self, exp_batch, gamma):
    batch_state = exp_batch['state']
    batch_size = len(exp_batch['reward'])

    qout = self.q_function(batch_state)

    batch_actions = exp_batch['action']
    # Q(s_t,a_t)
    batch_q = F.reshape(
        qout.evaluate_actions(batch_actions), (batch_size, 1))

    with chainer.no_backprop_mode():
        # Compute target values
        target_qout = self.target_q_function(batch_state)

        # Q'(s_t,a_t)
        target_q = F.reshape(
            target_qout.evaluate_actions(batch_actions), (batch_size, 1))

        # LQ'(s_t,a)
        target_q_expect = F.reshape(
            self._l_operator(target_qout), (batch_size, 1))

        # r + g * LQ'(s_{t+1},a)
        batch_q_target = F.reshape(
            self._compute_target_values(exp_batch, gamma), (batch_size, 1))

        # Q'(s_t,a_t) + r + g * LQ'(s_{t+1},a) - LQ'(s_t,a)
        t = target_q + batch_q_target - target_q_expect

    return batch_q, t
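The method leans on two helpers that this excerpt does not show: `_l_operator`, which applies the operator L across the actions of a Q-output, and `_compute_target_values`, which builds the bootstrapped target `r + g * LQ'(s_{t+1},a)` referenced in the comments. Below is a minimal sketch of what they could look like, not the excerpt's actual definitions: it assumes a log-sum-exp ("soft max over actions") choice of L, a hypothetical temperature attribute `self.eta`, a `q_values` array of shape `(batch_size, n_actions)` on the Q-output object, and an `is_state_terminal` entry in `exp_batch`.

import chainer
import chainer.functions as F

def _l_operator(self, qout):
    # One possible L: LQ(s) = (1/eta) * log sum_a exp(eta * Q(s,a)),
    # a smooth stand-in for max_a Q(s,a) that hardens as eta grows.
    return F.logsumexp(self.eta * qout.q_values, axis=1) / self.eta

def _compute_target_values(self, exp_batch, gamma):
    # r + g * LQ'(s_{t+1},a), with the bootstrap term zeroed at episode end.
    batch_next_state = exp_batch['next_state']
    target_next_qout = self.target_q_function(batch_next_state)
    next_q_expect = self._l_operator(target_next_qout)
    batch_rewards = exp_batch['reward']
    batch_terminal = exp_batch['is_state_terminal']
    return batch_rewards + gamma * (1 - batch_terminal) * next_q_expect

With those pieces in place, the final comment in `_compute_y_and_t` reads directly off the code: the target `t` adds the bootstrapped value `r + g * LQ'(s_{t+1},a)` to the gap `Q'(s_t,a_t) - LQ'(s_t,a)`, so actions whose target-network value falls below the L-aggregated value get pulled down relative to the plain one-step target.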