def forward_one_step(self, state, action, reward, next_state, test=False):
    xp = cuda.cupy if config.use_gpu else np
    n_batch = state.shape[0]
    # Flatten each stacked-history observation into a single feature vector.
    state = Variable(state.reshape((n_batch, config.rl_history_length * 34)))
    next_state = Variable(next_state.reshape((n_batch, config.rl_history_length * 34)))
    if config.use_gpu:
        state.to_gpu()
        next_state.to_gpu()

    # Q-values from the online network for the current and next states.
    q = self.compute_q_variable(state, test=test)
    q_ = self.compute_q_variable(next_state, test=test)
    # Double DQN: the online network selects the greedy action for the next state ...
    max_action_indices = xp.argmax(q_.data, axis=1)
    if config.use_gpu:
        max_action_indices = cuda.to_cpu(max_action_indices)
    # ... while the target network evaluates that action.
    target_q = self.compute_target_q_variable(next_state, test=test)

    target = q.data.copy()
    for i in xrange(n_batch):
        max_action_index = max_action_indices[i]
        target_value = reward[i] + config.rl_discount_factor * target_q.data[i][max_action_index]
        action_index = self.get_index_for_action(action[i])
        old_value = target[i, action_index]
        # Clip the TD error to [-1, 1] so a single transition cannot dominate the update.
        diff = target_value - old_value
        if diff > 1.0:
            target_value = 1.0 + old_value
        elif diff < -1.0:
            target_value = -1.0 + old_value
        target[i, action_index] = target_value

    # Wrap the target array in a Variable with no computational history,
    # so gradients from the MSE loss flow only through q.
    target = Variable(target)
    loss = F.mean_squared_error(target, q)
    return loss, q
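For completeness, here is a minimal sketch of how the returned loss might drive one training update in Chainer 1.x; the names dqn and optimizer, and the minibatch variables, are assumptions for illustration and are not taken from the original code.

    # Hedged usage sketch (Chainer 1.x style); `dqn` and `optimizer` are assumed
    # to be an instance holding the networks above and an optimizer set up on them.
    loss, q = dqn.forward_one_step(state, action, reward, next_state)
    optimizer.zero_grads()   # clear accumulated gradients on the model parameters
    loss.backward()          # gradients flow into q only; target is a constant Variable
    optimizer.update()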