Learning.py 文件源码-python代码片段

Learning.py 文件源码

python

阅读 38 收藏 0 点赞 0 评论 0

def _improvePolicy(self):
        """Run one policy improvement step over every state.

        For each state, every action in ``self.actionSet`` is evaluated with
        a one-step lookahead: the reward returned by
        ``self.environment.getNextStateAndReward`` plus ``self.gamma`` times
        the current estimate ``self.V`` of the resulting state. ``self.pi``
        is updated in place with the index of the best action.

        Ties are broken in favour of the last action in ``self.actionSet``
        (the terminate action, per the original author's comment): if the
        best value is within 0.001 of the last action's value, the last
        action is chosen.

        Returns:
            True if no entry of ``self.pi`` changed during this sweep,
            False otherwise.
        """
        policy_stable = True
        # Index of the last action; it wins every (near-)tie below.
        terminate_idx = len(self.actionSet) - 1

        # `range` rather than the Python 2-only `xrange`, so this runs on
        # both Python 2 and Python 3.
        for s in range(self.numStates):
            old_action = self.pi[s]

            # One-step lookahead value of each action from state s.
            action_values = []
            for action in self.actionSet:
                next_state, reward = self.environment.getNextStateAndReward(
                    s, action)
                action_values.append(reward + self.gamma * self.V[next_state])

            # Greedy choice, then tie-break towards the last action.
            best = np.argmax(action_values)
            gap = action_values[best] - action_values[terminate_idx]
            if math.fabs(gap) < 0.001:
                best = terminate_idx
            self.pi[s] = best

            # Compare against the stored entry, as the original code did.
            if old_action != self.pi[s]:
                policy_stable = False

        return policy_stable