def _calculate_transition_prob(self, current, delta):
new_position = np.array(current) + np.array(delta)
new_position = self._limit_coordinates(new_position).astype(int)
new_state = np.ravel_multi_index(tuple(new_position), self.shape)
# Newer version of rewards/costs from G-learning paper
# reward = -100.0 if self._cliff[tuple(new_position)] else -1.0
reward = -1.0
if self._cliff[tuple(new_position)]:
reward = -100.0
elif tuple(new_position) == (3,11):
reward = 0.0
is_done = self._cliff[tuple(new_position)] or (tuple(new_position) == (3,11))
return [(1.0, new_state, reward, is_done)]