import numpy as np


def action_value_eval(env, policy,
                      discount=0.999, learning_rate=0.01,
                      n_iter=1000, print_every=None):
    """Monte Carlo estimation of the action-value function Q(s, a) for a fixed policy."""
    # Q-table over discrete state/action spaces, initialised to zero.
    action_values = [[0.0 for _ in range(env.action_space.n)]
                     for _ in range(env.state_space.n)]
    for episode in range(n_iter):
        visited_state_action_pairs, rewards = \
            MonteCarlo._run_episode(env, policy, with_actions=True)
        for i, (state, action) in enumerate(visited_state_action_pairs):
            if i + 1 >= len(rewards):
                break
            # Discounted return observed after step i: rewards[i + 1:] weighted
            # by discount^0, discount^1, ... (the lambda argument is named k to
            # avoid shadowing the loop index i).
            discounted_return_from_state = np.dot(
                np.array(rewards[i + 1:]),
                np.fromfunction(lambda k: discount ** k, (len(rewards) - i - 1,)))
            # Constant step-size incremental update toward the sampled return.
            action_values[state][action] += \
                learning_rate * (discounted_return_from_state - action_values[state][action])
        if print_every is not None and episode % print_every == 0:
            print('Action-Value estimation:\n{}'.format(action_values))
    return action_values
Source file: policy_evaluation.py
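The helper MonteCarlo._run_episode is defined elsewhere in the project, so its contract has to be inferred from the call above: with with_actions=True it returns the list of visited (state, action) pairs plus a reward list aligned so that rewards[i + 1:] are the rewards collected after the pair at step i. A minimal, hypothetical sketch of such a helper, assuming a gym-style env.reset() / env.step() API and a policy callable that maps a state to an action, might look like this:

class MonteCarlo:
    @staticmethod
    def _run_episode(env, policy, with_actions=False):
        # Hypothetical reconstruction -- not the project's actual code.
        # Assumes env.reset() -> state and env.step(action) -> (state, reward, done, info).
        states, actions, rewards = [], [], [0.0]  # leading 0.0 keeps rewards aligned
        state = env.reset()
        done = False
        while not done:
            action = policy(state)
            states.append(state)
            actions.append(action)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
        if with_actions:
            return list(zip(states, actions)), rewards
        return states, rewards

With the leading placeholder reward, the pair visited at step i has its first subsequent reward at rewards[i + 1], which matches the break condition and the slicing used in action_value_eval.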