# policy_evaluation.py
import numpy as np


def state_value_eval(env, policy,
                     discount=0.999,
                     learning_rate=0.01,
                     n_iter=1000,
                     print_every=None):
"""
This is EVERY-VISIT Monte-Carlo
:param env: An Environment that we can reset(), step() and get observations and
reward information.
:param policy: A strategy for behaving in an Environment. Should have a step()
method that returns an action given state information.
:param discount: Discount factor for the MDP
:param learning_rate: The amount we will shift towards an error direction.
:param n_iter: Number of episodes to run this algorithm for
:param print_every: Print the current estimate of values every X iterations
:return: The State-Value function that shows the average return we'll have starting
in each one of the states of this MDP
"""
    state_values = [0.0 for _ in range(env.state_space.n)]
    for episode in range(n_iter):
        # Roll out one full episode under the given policy
        # (the MonteCarlo helper is assumed to be available in this module's scope).
        visited_states, rewards = MonteCarlo._run_episode(env, policy, with_actions=False)
        for i, state in enumerate(visited_states):
            if i + 1 >= len(rewards):
                break
            # Discounted return observed from this visit onwards:
            # G_i = r_{i+1} + discount * r_{i+2} + discount^2 * r_{i+3} + ...
            future_rewards = np.array(rewards[i + 1:])
            discounts = discount ** np.arange(len(future_rewards))
            discounted_return_from_state = np.dot(future_rewards, discounts)
            # Every-visit incremental update towards the observed return.
            state_values[state] += \
                learning_rate * (discounted_return_from_state - state_values[state])
        if print_every is not None and episode % print_every == 0:
            print('State-Value estimation:\n{}'.format(state_values))
    return state_values
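
A minimal, self-contained sketch of what a single episode contributes to the estimates above. The episode data and constants below are made up purely for illustration; rewards[i] is assumed to be the reward received on entering visited_states[i], which is the convention the update loop implies:

import numpy as np

# One hypothetical episode.
visited_states = [0, 1, 2]
rewards = [0.0, 0.0, 1.0]
discount, learning_rate = 0.999, 0.01
state_values = [0.0, 0.0, 0.0]

for i, state in enumerate(visited_states):
    if i + 1 >= len(rewards):
        break
    future = np.array(rewards[i + 1:])
    # Discounted return observed from this visit onwards.
    g = np.dot(future, discount ** np.arange(len(future)))
    state_values[state] += learning_rate * (g - state_values[state])

print(state_values)  # [0.00999, 0.01, 0.0]: each estimate moves one small step towards its return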
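
A sketch of how the function might be called end to end. Everything below is hypothetical: the toy chain environment, the fixed policy, and the MonteCarlo stub only mimic the interfaces the function expects (state_space.n, reset(), step(), policy.step(), and a _run_episode helper whose return convention is inferred from the update loop); if the real MonteCarlo class is importable, the stub is unnecessary.

import numpy as np


class ChainEnv:
    """Toy 3-state chain 0 -> 1 -> 2 (terminal); reward 1.0 on reaching the end."""

    class _Space:
        n = 3

    state_space = _Space()

    def reset(self):
        self.state = 0
        return self.state

    def step(self, action):
        self.state += 1
        done = self.state == 2
        return self.state, (1.0 if done else 0.0), done


class ForwardPolicy:
    """The toy chain has a single action, so always pick it."""

    def step(self, state):
        return 0


class MonteCarlo:
    """Demo-only stand-in; the state/reward alignment is inferred, not confirmed."""

    @staticmethod
    def _run_episode(env, policy, with_actions=False):
        state, done = env.reset(), False
        states, rewards = [state], [0.0]
        while not done:
            state, reward, done = env.step(policy.step(state))
            states.append(state)
            rewards.append(reward)
        return states, rewards


values = state_value_eval(ChainEnv(), ForwardPolicy(),
                          discount=0.9, learning_rate=0.1, n_iter=200)
print(values)  # approaches [0.9, 1.0, 0.0]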