def _sample(self):
    """Draw a random mini-batch, with replacement, from the stored data.

    Calls ``self._process()`` first if ``self.processed`` is false, then
    marks the buffer as processed. ``self.batch_size`` indices are drawn
    uniformly from ``range(len(self.rewards))`` and the same indices are
    used for every stored sequence, so row ``k`` of each returned tensor
    comes from the same stored entry.

    Returns:
        A tuple ``(actions, log_actions, rewards, critics, entropies,
        states, advantages)``. Each item is the concatenation along
        dimension 0 of the selected entries; ``rewards``, ``critics``,
        ``entropies`` and ``advantages`` are additionally flattened to 1-D.
        ``actions`` and ``log_actions`` are taken from the ``value`` and
        ``log_prob`` attributes of the entries in ``self.actions``.

    Raises:
        IndexError: If there is nothing stored to sample from.
    """
    if not self.processed:
        self._process()
        self.processed = True

    size = len(self.rewards)
    if size == 0:
        raise IndexError('cannot sample from an empty buffer')

    # Draw integer indices directly instead of scaling float32 uniforms and
    # truncating, and convert them to plain ints once so they index the
    # Python lists without going through 0-dim tensors.
    indices = th.randint(0, size, (self.batch_size,)).tolist()

    def gather(items):
        # Concatenate the selected entries of one stored sequence.
        return th.cat([items[i] for i in indices], 0)

    chosen = [self.actions[i] for i in indices]
    actions = th.cat([action.value for action in chosen], 0)
    log_actions = th.cat([action.log_prob for action in chosen], 0)
    rewards = gather(self.rewards).view(-1)
    critics = gather(self.critics).view(-1)
    entropies = gather(self.entropies).view(-1)
    states = gather(self.states)
    advantages = gather(self.advantages).view(-1)
    return actions, log_actions, rewards, critics, entropies, states, advantages
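# Web-page residue from the original post, not part of the code:
# "Comment list" / "Article table of contents".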