# Assumes module-level imports: `import chainer` and
# `import chainer.functions as F` (Chainer's functional API).
def _compute_y_and_t(self, exp_batch, gamma):
    batch_state = exp_batch['state']
    batch_size = len(exp_batch['reward'])

    qout = self.q_function(batch_state)

    batch_actions = exp_batch['action']
    # Q(s_t,a_t)
    batch_q = F.reshape(
        qout.evaluate_actions(batch_actions), (batch_size, 1))

    with chainer.no_backprop_mode():
        # Compute target values
        target_qout = self.target_q_function(batch_state)

        # Q'(s_t,a_t)
        target_q = F.reshape(
            target_qout.evaluate_actions(batch_actions), (batch_size, 1))

        # LQ'(s_t,a)
        target_q_expect = F.reshape(
            self._l_operator(target_qout), (batch_size, 1))

        # r + g * LQ'(s_{t+1},a)
        batch_q_target = F.reshape(
            self._compute_target_values(exp_batch, gamma), (batch_size, 1))

        # Q'(s_t,a_t) + r + g * LQ'(s_{t+1},a) - LQ'(s_t,a)
        t = target_q + batch_q_target - target_q_expect

    return batch_q, t
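The method leans on two helpers that this excerpt does not show: `_l_operator`, which applies the operator L across the actions of a Q-output, and `_compute_target_values`, which builds the bootstrapped target `r + g * LQ'(s_{t+1},a)` referenced in the comments. Below is a minimal sketch of what they could look like, not the excerpt's actual definitions: it assumes a log-sum-exp ("soft max over actions") choice of L, a hypothetical temperature attribute `self.eta`, a `q_values` array of shape `(batch_size, n_actions)` on the Q-output object, and an `is_state_terminal` entry in `exp_batch`.

import chainer
import chainer.functions as F

def _l_operator(self, qout):
    # One possible L: LQ(s) = (1/eta) * log sum_a exp(eta * Q(s,a)),
    # a smooth stand-in for max_a Q(s,a) that hardens as eta grows.
    return F.logsumexp(self.eta * qout.q_values, axis=1) / self.eta

def _compute_target_values(self, exp_batch, gamma):
    # r + g * LQ'(s_{t+1},a), with the bootstrap term zeroed at episode end.
    batch_next_state = exp_batch['next_state']
    target_next_qout = self.target_q_function(batch_next_state)
    next_q_expect = self._l_operator(target_next_qout)
    batch_rewards = exp_batch['reward']
    batch_terminal = exp_batch['is_state_terminal']
    return batch_rewards + gamma * (1 - batch_terminal) * next_q_expect

With those pieces in place, the final comment in `_compute_y_and_t` reads directly off the code: the target `t` adds the bootstrapped value `r + g * LQ'(s_{t+1},a)` to the gap `Q'(s_t,a_t) - LQ'(s_t,a)`, so actions whose target-network value falls below the L-aggregated value get pulled down relative to the plain one-step target.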