Policy.py 文件源码-python代码片段

def valueIteration(self, debugCallback = None, turbo = False):
        '''Run the value iteration algorithm (see AI: A Modern Approach
           (Third ed.) pag. 652) to calculate the utilities of all the states
           in the grid world, storing them in self.utilities.

           debugCallback, when given, is called as debugCallback(policy, isEnded):
                policy:  this object, usable to display intermediate results
                isEnded: False after every sweep, True once after the last one
           After a sweep the callback must return a true value to keep going;
           a false value stops the algorithm. The value returned by the final
           call (isEnded == True) is ignored.

           The algorithm is bounded by Policy.maxNumberOfIterations and by
           Policy.timeToLive (CPU time measured with time.process_time), so it
           also terminates with a discount factor of 1, where the convergence
           threshold is 0. A warning is shown only when one of these limits
           is what actually stops the loop.

           Classic mode (turbo == False) builds a fresh utility vector on every
           sweep, so every cell of sweep i is computed from the complete vector
           of sweep i-1. Turbo mode writes the new values straight into
           self.utilities, so cells computed later in a sweep already see the
           values updated earlier in that same sweep (in-place update). The
           original author reports an improvement of about 30% in turbo mode.

           Returns self.numOfIterations, which is incremented once per sweep
           and is not reset by this method.
        '''
        eps = Policy.valueIterationEpsilon
        dfact = self.world.discFactor
        c, r = self.world.size

        # Termination bound on the max norm (AI: A Modern Approach, pag. 654).
        # It does not change between sweeps, so compute it once. With a
        # discount factor of 0 the bound is unbounded: treat it as infinity
        # instead of raising ZeroDivisionError.
        threshold = eps * (1 - dfact) / dfact if dfact != 0 else float("inf")

        # In turbo mode the "new" vector is the live one: updates are in place.
        if turbo: newUv = self.utilities

        reiterate = True
        start = time.process_time()
        while reiterate:
            self.numOfIterations += 1
            maxNorm = 0 # largest absolute utility change seen in this sweep

            if not turbo: newUv = self.__createEmptyUtilityVector()

            for x in range(c):
                for y in range(r):
                    v = self.__cellUtility(x, y)
                    # cells whose utility is None do not contribute to the norm
                    if v is not None:
                        maxNorm = max(maxNorm, abs(self.utilities[y][x] - v))
                    newUv[y][x] = v

            if not turbo: self.utilities = newUv

            if debugCallback: reiterate = debugCallback(self, False)

            if maxNorm <= threshold: reiterate = False

            self.elapsed = time.process_time() - start
            limitReached = (self.numOfIterations >= Policy.maxNumberOfIterations
                            or self.elapsed > Policy.timeToLive)
            if limitReached:
                # Warn only when the limit is the real reason for stopping,
                # not when this sweep converged or the callback asked to stop.
                if reiterate:
                    print("warning: max number of iterations exceeded")
                    messagebox.showwarning("Warning", "max number of iterations exceeded")
                reiterate = False

        if debugCallback: debugCallback(self, True)

        return self.numOfIterations