def forward_one_step(self, state, action, reward, next_state, test=False):
    xp = cuda.cupy if config.use_gpu else np
    n_batch = state.shape[0]
    # Flatten each stacked-history observation into a single feature vector.
    state = Variable(state.reshape((n_batch, config.rl_history_length * 34)))
    next_state = Variable(next_state.reshape((n_batch, config.rl_history_length * 34)))
    if config.use_gpu:
        state.to_gpu()
        next_state.to_gpu()

    # Q-values from the online network for the current and next states.
    q = self.compute_q_variable(state, test=test)
    q_ = self.compute_q_variable(next_state, test=test)
    # Double DQN: the online network selects the greedy action for the next state ...
    max_action_indices = xp.argmax(q_.data, axis=1)
    if config.use_gpu:
        max_action_indices = cuda.to_cpu(max_action_indices)
    # ... while the target network evaluates that action.
    target_q = self.compute_target_q_variable(next_state, test=test)

    target = q.data.copy()
    for i in xrange(n_batch):
        max_action_index = max_action_indices[i]
        target_value = reward[i] + config.rl_discount_factor * target_q.data[i][max_action_index]
        action_index = self.get_index_for_action(action[i])
        old_value = target[i, action_index]
        # Clip the TD error to [-1, 1] so a single transition cannot dominate the update.
        diff = target_value - old_value
        if diff > 1.0:
            target_value = 1.0 + old_value
        elif diff < -1.0:
            target_value = -1.0 + old_value
        target[i, action_index] = target_value

    # Wrap the target array in a Variable with no computational history,
    # so gradients from the MSE loss flow only through q.
    target = Variable(target)
    loss = F.mean_squared_error(target, q)
    return loss, q
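For completeness, here is a minimal sketch of how the returned loss might drive one training update in Chainer 1.x; the names dqn and optimizer, and the minibatch variables, are assumptions for illustration and are not taken from the original code.

    # Hedged usage sketch (Chainer 1.x style); `dqn` and `optimizer` are assumed
    # to be an instance holding the networks above and an optimizer set up on them.
    loss, q = dqn.forward_one_step(state, action, reward, next_state)
    optimizer.zero_grads()   # clear accumulated gradients on the model parameters
    loss.backward()          # gradients flow into q only; target is a constant Variable
    optimizer.update()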