def _step(self, action):
# the first element of action is the actual current action
current_action = action[0]
observation, reward, done, info = self.cartpole._step(current_action)
if not done:
# We add the newly predicted observations to the list before checking predictions
# in order to give the agent a chance to predict the observations that they
# are going to get _this_ round.
self.predicted_observations.append(action[1:])
if self.iteration > TIME_BEFORE_BONUS_ALLOWED:
for i in xrange(min(NUM_PREDICTED_OBSERVATIONS, len(self.predicted_observations))):
l2dist = np.sqrt(np.sum(np.square(np.subtract(
self.predicted_observations[-(i + 1)][i],
observation
))))
bonus = CORRECT_PREDICTION_BONUS * (1 - math.erf(l2dist))
reward += bonus
self.iteration += 1
return observation, reward, done, info