def _loop(self):
    """Run one episode and update the policy weights online."""
    done = False
    total_reward, t = 0, 0
    self.state = self.env.reset()
    weight = self.weight
    while not done:
        # Sample an action from the current softmax policy:
        # pi(a|s) = softmax_a(h(s, a)) with linear preferences h(s, a) = x(s, a)^T w,
        # where x is the feature vector and w is the weight vector.
        action = self.policy()
        _state, reward, done, _ = self.env.step(action)
        # Gradient of the log-policy for a linear softmax, evaluated at the state
        # in which the action was taken:
        # \nabla \ln \pi(a|s) = x(s, a) - \sum_b \pi(b|s) x(s, b)
        expected_feature = self.softmax @ torch.cat(
            [self.feature(self.state, a).unsqueeze(0) for a in self.actions])
        direction = self.feature(self.state, action) - expected_feature
        # Gradient-ascent step; the immediate reward stands in for the return G_t
        weight += self.alpha * pow(self.gamma, t) * reward * direction
        total_reward += reward
        self.state = _state  # move to the next state
        t += 1
    # commit the updated weights
    self.weight = weight
    return total_reward
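
The loop relies on several attributes defined elsewhere in the class: self.policy, self.feature, self.softmax, self.actions, self.weight. As a point of reference, here is a minimal sketch of how they could fit together for a linear softmax policy; the class name LinearSoftmaxAgent, the state-plus-one-hot-action feature, and the hyperparameter defaults are assumptions for illustration, not the original definitions.

import torch


class LinearSoftmaxAgent:
    """Hypothetical skeleton of the pieces _loop() assumes (illustrative only)."""

    def __init__(self, env, alpha=0.01, gamma=0.99):
        self.env = env
        self.actions = list(range(env.action_space.n))
        self.alpha, self.gamma = alpha, gamma
        n_features = env.observation_space.shape[0] + len(self.actions)
        self.weight = torch.zeros(n_features)  # w
        self.state = None
        # pi(.|s) for the most recent state, refreshed by policy()
        self.softmax = torch.full((len(self.actions),), 1.0 / len(self.actions))

    def feature(self, state, action):
        # x(s, a): state vector concatenated with a one-hot action encoding (assumed form)
        s = torch.as_tensor(state, dtype=torch.float32).flatten()
        a = torch.zeros(len(self.actions))
        a[action] = 1.0
        return torch.cat([s, a])

    def policy(self):
        # Linear preferences h(s, a) = x(s, a)^T w; policy pi(.|s) = softmax(h(s, .))
        prefs = torch.stack([self.feature(self.state, a) @ self.weight
                             for a in self.actions])
        self.softmax = torch.softmax(prefs, dim=0)
        return int(torch.multinomial(self.softmax, 1).item())

With a skeleton of this shape, _loop() can be driven on any discrete-action Gym environment (e.g. CartPole) that still uses the old reset()/step() return signature the loop expects.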