def forward_one_step(self, state, action, reward, next_state, test=False, episode_ends=None):
    """Compute the clipped-TD Q-learning loss for one minibatch.

    Args:
        state: batched observation array, shape (n_batch, ...).
        action: per-sample actions; mapped to column indices via
            self.get_index_with_action.
        reward: per-sample rewards; only their sign is used (reward clipping).
        test: forwarded to the Q-network calls (old-Chainer test-mode flag).
        episode_ends: optional per-sample terminal flags. When truthy for
            sample i, the bootstrap term is dropped and the target is just
            sign(reward[i]). Defaults to None, meaning no sample is terminal.
            NOTE(review): the original referenced an undefined global
            `episode_ends`; it is now an explicit keyword parameter so
            existing positional callers keep working.

    Returns:
        (loss, q): the mean-squared TD error Variable and the Q Variable.
    """
    xp = cuda.cupy if config.use_gpu else np
    n_batch = state.shape[0]
    state = Variable(state)
    next_state = Variable(next_state)
    if config.use_gpu:
        state.to_gpu()
        next_state.to_gpu()
    q = self.compute_q_variable(state, test=test)
    # Bootstrap value: max over actions of the (frozen) target network.
    max_target_q = self.compute_target_q_variable(next_state, test=test)
    max_target_q = xp.amax(max_target_q.data, axis=1)
    # Start from the current Q values so non-taken actions get zero gradient.
    target = q.data.copy()
    for i in xrange(n_batch):
        # BUGFIX: was `episode_ends[i] is True`, which is always False for
        # numpy bool_ values (they are not the `True` singleton) — terminal
        # states were never detected. A plain truthiness test is correct.
        if episode_ends is not None and episode_ends[i]:
            target_value = np.sign(reward[i])
        else:
            target_value = np.sign(reward[i]) + config.rl_discount_factor * max_target_q[i]
        action_index = self.get_index_with_action(action[i])
        old_value = target[i, action_index]
        # Clip the TD error to [-1, 1] (DQN error clipping) by moving the
        # target at most 1.0 away from the current estimate.
        diff = target_value - old_value
        if diff > 1.0:
            target_value = 1.0 + old_value
        elif diff < -1.0:
            target_value = -1.0 + old_value
        target[i, action_index] = target_value
    target = Variable(target)
    loss = F.mean_squared_error(target, q)
    return loss, q
# NOTE(review): removed stray web-page text ("评论列表" / "文章目录" —
# "comment list" / "table of contents"), a scraping artifact that made
# the file a syntax error; it was never part of the code.