def _loop(self):
    done = False
    total_reward, t = 0, 0
    self.state = self.env.reset()
    while not done:
        action = self.policy()
        _state, reward, done, _ = self.env.step(action)
        # if _state is terminal, its state value is 0
        v = 0 if done else self.state_value(_state)
        # one-step TD error: \delta = r + \gamma v(s') - v(s)
        delta = reward + self.gamma * v - self.state_value(self.state)
        # critic update: \nabla_w v(s) = s, since v(s) = s^{\top} w
        self.state_value_weight += self.beta * delta * to_tensor(self.state).float()
        # actor: softmax policy over linear preferences h(s, a) = x(s, a)^{\top} w
        # \nabla_w \ln\pi(a|s) = x(s, a) - \sum_b \pi(b|s) x(s, b),
        # evaluated at the state in which the action was taken
        direction = self.feature(self.state, action) - self.softmax @ torch.cat(
            [self.feature(self.state, a).unsqueeze(0) for a in self.actions])
        self.weight += self.alpha * pow(self.gamma, t) * delta * direction
        total_reward += reward
        self.state = _state
        t += 1
    return total_reward
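
The loop relies on a few members defined elsewhere in the class: `policy()` (which also caches the current action probabilities in `self.softmax`), `feature(state, action)`, `state_value(state)`, and the `to_tensor` helper. For readers who want to run the snippet in isolation, here is a minimal sketch of what those pieces could look like under the assumption of a linear critic and a linear-softmax actor; the class name `LinearActorCritic`, the constructor arguments, and the block-structured state-action features are illustrative assumptions, not the original implementation.

```python
import torch


def to_tensor(x):
    # hypothetical helper (not shown in the post): convert an observation
    # such as a numpy array or list into a float tensor
    return torch.as_tensor(x, dtype=torch.float32)


class LinearActorCritic:
    """Assumed skeleton for the attributes and helpers that _loop uses."""

    def __init__(self, env, actions, state_dim, alpha=1e-3, beta=1e-2, gamma=0.99):
        self.env = env
        self.actions = list(actions)
        self.alpha, self.beta, self.gamma = alpha, beta, gamma
        # actor weights w for preferences h(s, a) = x(s, a)^T w
        self.weight = torch.zeros(state_dim * len(self.actions))
        # critic weights w_v for v(s) = s^T w_v
        self.state_value_weight = torch.zeros(state_dim)
        self.state = None
        self.softmax = None

    def feature(self, state, action):
        # x(s, a): the state copied into the block belonging to `action`
        s = to_tensor(state)
        x = torch.zeros(len(self.actions) * s.numel())
        i = self.actions.index(action)
        x[i * s.numel():(i + 1) * s.numel()] = s
        return x

    def state_value(self, state):
        # linear critic: v(s) = s^T w_v
        return to_tensor(state) @ self.state_value_weight

    def policy(self):
        # softmax over linear preferences; caches pi(.|s) for the actor update
        prefs = torch.stack([self.feature(self.state, a) @ self.weight
                             for a in self.actions])
        self.softmax = torch.softmax(prefs, dim=0)
        return self.actions[torch.multinomial(self.softmax, 1).item()]
```

With `_loop` attached to such a class and a classic Gym-style environment, one episode could be run as `agent = LinearActorCritic(env, actions=range(env.action_space.n), state_dim=env.observation_space.shape[0])` followed by `agent._loop()`.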