ppo.py 文件源码-python代码片段

ppo.py 文件源码
python
阅读 36 收藏 0 点赞 0 评论 0
def _sample(self):
        if not self.processed:
            self._process()
            self.processed = True
        indices = (th.rand(self.batch_size) * len(self.rewards)).int()
        # TODO: Cleanup
        log_actions = []
        rewards = []
        critics = []
        entropies = []
        states = []
        advantages = []
        actions = []
        for i in indices:
            actions.append(self.actions[i].value)
            log_actions.append(self.actions[i].log_prob)
            rewards.append(self.rewards[i])
            critics.append(self.critics[i])
            entropies.append(self.entropies[i])
            states.append(self.states[i])
            advantages.append(self.advantages[i])
        actions = th.cat(actions, 0)
        log_actions = th.cat(log_actions, 0)
        rewards = th.cat(rewards, 0).view(-1)
        critics = th.cat(critics, 0).view(-1)
        entropies = th.cat(entropies, 0).view(-1)
        states = th.cat(states, 0)
        advantages = th.cat(advantages, 0).view(-1)
        return actions, log_actions, rewards, critics, entropies, states, advantages