def _improvePolicy(self):
    """Policy improvement step.

    For every state index in ``range(self.numStates)``, evaluate each action
    in ``self.actionSet`` with a one-step lookahead,
    ``reward + self.gamma * self.V[next_state]``, where the
    ``(next_state, reward)`` pair comes from
    ``self.environment.getNextStateAndReward(state, action)``. The index of
    the highest-valued action is written to ``self.pi[state]``.

    Ties are broken in favour of the last action in ``self.actionSet``: if
    the best value is within 0.001 of the last action's value, the last
    action is selected instead.

    Returns:
        bool: True if no entry of ``self.pi`` changed during this sweep,
        False otherwise.
    """
    # Values closer than this to the last action's value count as a tie.
    tie_tolerance = 0.001
    # The original author's note says ties are broken by "always choosing to
    # terminate" -- presumably the last entry of actionSet is the terminate
    # action; confirm against where actionSet is built.
    last_action = len(self.actionSet) - 1
    policy_stable = True

    for s in range(self.numStates):
        old_action = self.pi[s]

        # First collect the value estimate of every action from state s.
        action_values = []
        for action in self.actionSet:
            next_state, reward = self.environment.getNextStateAndReward(
                s, action)
            action_values.append(reward + self.gamma * self.V[next_state])

        # Greedy choice (np.argmax returns the first maximal index) ...
        best = np.argmax(action_values)
        # ... unless the last action is (nearly) as good, in which case it wins.
        if math.fabs(action_values[best] - action_values[last_action]) < tie_tolerance:
            best = last_action
        self.pi[s] = best

        if old_action != self.pi[s]:
            policy_stable = False

    return policy_stable
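# Comment list        (web-page navigation residue; not part of the program)
# Table of contents   (web-page navigation residue; not part of the program)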