def true_values_for_sample(
    self, states, actions, assume_optimal_policy: bool
):
    """Compute a target value for every (state, action) pair in a sample.

    The table returned by ``self.true_q_values(DISCOUNT,
    assume_optimal_policy)`` is printed for debugging (reshaped to 5x5) and
    then used to back up one step from each sampled state.

    Args:
        states: Indexable sequence of mappings; the first key of each
            mapping is converted to ``int`` and used as the state index.
            NOTE(review): presumably a sparse one-hot encoding with a
            single key per state -- confirm against the caller.
        actions: Sequence aligned with ``states``; ``actions[i]`` is passed
            to ``self.move_on_index_limit`` together with the i-th state.
        assume_optimal_policy: Forwarded unchanged to
            ``self.true_q_values``.

    Returns:
        A list with one entry per state: ``self.reward(state)`` when
        ``self.is_terminal(state)`` is true, otherwise
        ``self.reward(state) + DISCOUNT * q[next_state]``.
    """
    q_table = self.true_q_values(DISCOUNT, assume_optimal_policy)
    print("TRUE Q")
    print(q_table.reshape([5, 5]))

    sample_values = []
    for idx, state in enumerate(states):
        state_index = int(list(state.keys())[0])
        # Look up the successor up front, even for terminal states where it
        # ends up unused, so the helper is invoked for every sample.
        successor = self.move_on_index_limit(state_index, actions[idx])
        if self.is_terminal(state_index):
            # Terminal states get no bootstrapped future value.
            value = self.reward(state_index)
        else:
            value = self.reward(state_index) + (DISCOUNT * q_table[successor])
        sample_values.append(value)
    return sample_values
评论列表
文章目录