# policy_evaluation.py
import numpy as np


def state_value_eval(env, policy,
                     discount=0.999,
                     learning_rate=0.01,
                     n_iter=1000,
                     print_every=None):
"""
This is EVERY-VISIT Monte-Carlo
:param env: An Environment that we can reset(), step() and get observations and
reward information.
:param policy: A strategy for behaving in an Environment. Should have a step()
method that returns an action given state information.
:param discount: Discount factor for the MDP
:param learning_rate: The amount we will shift towards an error direction.
:param n_iter: Number of episodes to run this algorithm for
:param print_every: Print the current estimate of values every X iterations
:return: The State-Value function that shows the average return we'll have starting
in each one of the states of this MDP
"""
    state_values = [0.0 for _ in range(env.state_space.n)]
    for episode in range(n_iter):
        # Roll out one full episode under the given policy
        # (the MonteCarlo helper is assumed to be available in this module's scope).
        visited_states, rewards = MonteCarlo._run_episode(env, policy, with_actions=False)
        for i, state in enumerate(visited_states):
            if i + 1 >= len(rewards):
                break
            # Discounted return observed from this visit onwards:
            # G_i = r_{i+1} + discount * r_{i+2} + discount^2 * r_{i+3} + ...
            future_rewards = np.array(rewards[i + 1:])
            discounts = discount ** np.arange(len(future_rewards))
            discounted_return_from_state = np.dot(future_rewards, discounts)
            # Every-visit incremental update towards the observed return.
            state_values[state] += \
                learning_rate * (discounted_return_from_state - state_values[state])
        if print_every is not None and episode % print_every == 0:
            print('State-Value estimation:\n{}'.format(state_values))
    return state_values
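
A minimal, self-contained sketch of what a single episode contributes to the estimates above. The episode data and constants below are made up purely for illustration; rewards[i] is assumed to be the reward received on entering visited_states[i], which is the convention the update loop implies:

import numpy as np

# One hypothetical episode.
visited_states = [0, 1, 2]
rewards = [0.0, 0.0, 1.0]
discount, learning_rate = 0.999, 0.01
state_values = [0.0, 0.0, 0.0]

for i, state in enumerate(visited_states):
    if i + 1 >= len(rewards):
        break
    future = np.array(rewards[i + 1:])
    # Discounted return observed from this visit onwards.
    g = np.dot(future, discount ** np.arange(len(future)))
    state_values[state] += learning_rate * (g - state_values[state])

print(state_values)  # [0.00999, 0.01, 0.0]: each estimate moves one small step towards its return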
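
A sketch of how the function might be called end to end. Everything below is hypothetical: the toy chain environment, the fixed policy, and the MonteCarlo stub only mimic the interfaces the function expects (state_space.n, reset(), step(), policy.step(), and a _run_episode helper whose return convention is inferred from the update loop); if the real MonteCarlo class is importable, the stub is unnecessary.

import numpy as np


class ChainEnv:
    """Toy 3-state chain 0 -> 1 -> 2 (terminal); reward 1.0 on reaching the end."""

    class _Space:
        n = 3

    state_space = _Space()

    def reset(self):
        self.state = 0
        return self.state

    def step(self, action):
        self.state += 1
        done = self.state == 2
        return self.state, (1.0 if done else 0.0), done


class ForwardPolicy:
    """The toy chain has a single action, so always pick it."""

    def step(self, state):
        return 0


class MonteCarlo:
    """Demo-only stand-in; the state/reward alignment is inferred, not confirmed."""

    @staticmethod
    def _run_episode(env, policy, with_actions=False):
        state, done = env.reset(), False
        states, rewards = [state], [0.0]
        while not done:
            state, reward, done = env.step(policy.step(state))
            states.append(state)
            rewards.append(reward)
        return states, rewards


values = state_value_eval(ChainEnv(), ForwardPolicy(),
                          discount=0.9, learning_rate=0.1, n_iter=200)
print(values)  # approaches [0.9, 1.0, 0.0]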