def _loop(self):
    done = False
    total_reward, reward, t = 0, 0, 0
    self.state = self.env.reset()
    weight = self.weight
    # Generate one episode with the current policy weight.
    # Policy: \pi(a) \propto \exp(x^{\top}(a) w), where x is the feature vector and w is the policy weight.
    # Log-policy gradient: \nabla\ln\pi(a) = x(a) - \sum_b \pi(b) x(b)
    while not done:
        action = self.policy()
        _state, reward, done, _ = self.env.step(action)
        # Advantage over the baseline: observed reward minus the current state-value estimate.
        delta = reward - self.state_value(_state)
        # Semi-gradient update of the linear state-value weights.
        self.state_value_weight += self.beta * delta * to_tensor(_state).float()
        # \nabla\ln\pi: feature of the chosen action minus the probability-weighted mean feature.
        expected_feature = self.softmax @ torch.cat(
            [self.feature(_state, a).unsqueeze(0) for a in self.actions])
        direction = self.feature(_state, action) - expected_feature
        # REINFORCE-with-baseline step, discounted by gamma^t.
        weight += self.alpha * pow(self.gamma, t) * delta * direction
        total_reward += reward
        t += 1
        # Track the current state so the next policy() call acts on it.
        self.state = _state
    # Commit the updated policy weight.
    self.weight = weight
    return total_reward
REINFORCE_baseline.py source code (Python)
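The `_loop` method above relies on helpers that are not shown in this excerpt: `self.policy`, `self.feature`, `self.state_value`, `self.softmax`, the hyperparameters `alpha`, `beta`, `gamma`, and a small `to_tensor` conversion helper. The sketch below is one way these pieces could fit together, assuming one-hot-per-action linear features and a linear state-value baseline; the class name `LinearSoftmaxAgent`, the `preferences` helper, and the constructor arguments are illustrative assumptions, not code taken from REINFORCE_baseline.py.

import torch

class LinearSoftmaxAgent:
    """Hypothetical helper layout assumed by _loop; names and shapes are illustrative only."""

    def __init__(self, n_state_features, actions, alpha=1e-3, beta=1e-2, gamma=0.99):
        self.actions = list(actions)
        self.alpha, self.beta, self.gamma = alpha, beta, gamma
        # Policy weight w: one block of state features per action, flattened into a single vector.
        self.weight = torch.zeros(n_state_features * len(self.actions))
        # Weights of the linear state-value baseline used by state_value().
        self.state_value_weight = torch.zeros(n_state_features)
        self.state = None

    def feature(self, state, action):
        # x(s, a): the state features copied into the block belonging to `action`, zeros elsewhere.
        s = torch.as_tensor(state, dtype=torch.float32).flatten()
        x = torch.zeros_like(self.weight)
        i = self.actions.index(action) * s.numel()
        x[i:i + s.numel()] = s
        return x

    def preferences(self, state):
        # Action preferences h(s, a) = x(s, a)^T w for every action.
        return torch.stack([self.feature(state, a) @ self.weight for a in self.actions])

    @property
    def softmax(self):
        # pi(. | current state): softmax over the action preferences.
        return torch.softmax(self.preferences(self.state), dim=0)

    def policy(self):
        # Sample an action according to pi(. | current state).
        idx = torch.multinomial(self.softmax, 1).item()
        return self.actions[idx]

    def state_value(self, state):
        # Linear baseline v(s) = w_v^T x(s).
        return self.state_value_weight @ torch.as_tensor(state, dtype=torch.float32).flatten()

With this layout, \pi(a \mid s) \propto \exp(x^{\top}(s, a) w), so the `direction` computed in `_loop` is exactly the log-policy gradient x(s, a) - \sum_b \pi(b \mid s) x(s, b); `to_tensor` would be a thin wrapper around `torch.as_tensor`.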