def set_evaluation_feedback(self, feedbacks):
    """Set feedback for the last behavior.

    For every step of the last episode the discounted return that follows
    the step is computed with discount factor ``self.gamma``, appended to
    ``self.returns[s][a]`` for the state/action pair of that step, and
    ``self.Q[s][a]`` is reset to the mean of all returns recorded for the
    pair so far. Afterwards ``self.done`` is set to ``True`` only if at
    least one feedback was positive and no Q entry changed by 1e-3 or more
    during this update.

    Parameters
    ----------
    feedbacks : list of float
        One feedback value per step of the last episode. Any sequence that
        ``np.asarray`` can convert to a 1-D float array is accepted.

    Raises
    ------
    ValueError
        If the number of feedbacks, visited states and actions taken differ.
    """
    visited_states = self.policy.visited_states
    actions_taken = self.policy.actions_taken
    n_steps = len(visited_states)

    # Explicit checks instead of ``assert`` so they survive ``python -O``.
    if n_steps != len(feedbacks):
        raise ValueError(
            "expected %d feedbacks (one per visited state), got %d"
            % (n_steps, len(feedbacks)))
    if n_steps != len(actions_taken):
        raise ValueError(
            "number of visited states (%d) and actions taken (%d) differ"
            % (n_steps, len(actions_taken)))

    # A plain list does not support ``feedbacks > 0``; convert once.
    rewards = np.asarray(feedbacks, dtype=float)

    # Backward recursion G_t = r_t + gamma * G_{t+1}. This yields the same
    # value as sum_k gamma**k * r_{t+k} in O(n) overall and also copes with
    # an empty episode.
    discounted = np.empty(n_steps)
    running = 0.0
    for t in range(n_steps - 1, -1, -1):
        running = rewards[t] + self.gamma * running
        discounted[t] = running

    # Walk forward so that returns are appended in the order of the visits.
    diff = 0.0
    for s, a, ret in zip(visited_states, actions_taken, discounted):
        self.returns[s][a].append(ret)
        last_Q = self.Q[s][a]
        self.Q[s][a] = np.mean(self.returns[s][a])
        # Track the largest change of any Q entry touched in this update.
        diff = max(diff, np.abs(last_Q - self.Q[s][a]))

    self.done = bool(np.any(rewards > 0) and diff < 1e-3)
# Comment list
# Table of contents