def _loop(self):
    done = False
    total_reward, reward, t = 0, 0, 0
    self.state = self.env.reset()
    weight = self.weight
    # Generate one episode with the current policy weight.
    # Policy: \pi(a) \propto \exp(x^{\top}(a) w), where x is the feature vector and w is the policy weight.
    # Log-policy gradient: \nabla\ln\pi(a) = x(a) - \sum_b \pi(b) x(b)
    while not done:
        action = self.policy()
        _state, reward, done, _ = self.env.step(action)
        # Advantage over the baseline: observed reward minus the current state-value estimate.
        delta = reward - self.state_value(_state)
        # Semi-gradient update of the linear state-value weights.
        self.state_value_weight += self.beta * delta * to_tensor(_state).float()
        # \nabla\ln\pi: feature of the chosen action minus the probability-weighted mean feature.
        expected_feature = self.softmax @ torch.cat(
            [self.feature(_state, a).unsqueeze(0) for a in self.actions])
        direction = self.feature(_state, action) - expected_feature
        # REINFORCE-with-baseline step, discounted by gamma^t.
        weight += self.alpha * pow(self.gamma, t) * delta * direction
        total_reward += reward
        t += 1
        # Track the current state so the next policy() call acts on it.
        self.state = _state
    # Commit the updated policy weight.
    self.weight = weight
    return total_reward
REINFORCE_baseline.py source code (Python)
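The `_loop` method above relies on helpers that are not shown in this excerpt: `self.policy`, `self.feature`, `self.state_value`, `self.softmax`, the hyperparameters `alpha`, `beta`, `gamma`, and a small `to_tensor` conversion helper. The sketch below is one way these pieces could fit together, assuming one-hot-per-action linear features and a linear state-value baseline; the class name `LinearSoftmaxAgent`, the `preferences` helper, and the constructor arguments are illustrative assumptions, not code taken from REINFORCE_baseline.py.

import torch

class LinearSoftmaxAgent:
    """Hypothetical helper layout assumed by _loop; names and shapes are illustrative only."""

    def __init__(self, n_state_features, actions, alpha=1e-3, beta=1e-2, gamma=0.99):
        self.actions = list(actions)
        self.alpha, self.beta, self.gamma = alpha, beta, gamma
        # Policy weight w: one block of state features per action, flattened into a single vector.
        self.weight = torch.zeros(n_state_features * len(self.actions))
        # Weights of the linear state-value baseline used by state_value().
        self.state_value_weight = torch.zeros(n_state_features)
        self.state = None

    def feature(self, state, action):
        # x(s, a): the state features copied into the block belonging to `action`, zeros elsewhere.
        s = torch.as_tensor(state, dtype=torch.float32).flatten()
        x = torch.zeros_like(self.weight)
        i = self.actions.index(action) * s.numel()
        x[i:i + s.numel()] = s
        return x

    def preferences(self, state):
        # Action preferences h(s, a) = x(s, a)^T w for every action.
        return torch.stack([self.feature(state, a) @ self.weight for a in self.actions])

    @property
    def softmax(self):
        # pi(. | current state): softmax over the action preferences.
        return torch.softmax(self.preferences(self.state), dim=0)

    def policy(self):
        # Sample an action according to pi(. | current state).
        idx = torch.multinomial(self.softmax, 1).item()
        return self.actions[idx]

    def state_value(self, state):
        # Linear baseline v(s) = w_v^T x(s).
        return self.state_value_weight @ torch.as_tensor(state, dtype=torch.float32).flatten()

With this layout, \pi(a \mid s) \propto \exp(x^{\top}(s, a) w), so the `direction` computed in `_loop` is exactly the log-policy gradient x(s, a) - \sum_b \pi(b \mid s) x(s, b); `to_tensor` would be a thin wrapper around `torch.as_tensor`.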