def calc_loss(self, states, actions, rewards, next_states, episode_ends):
    qv = self.agent.q(states)       # Q(s, *) from the online network
    q_t = self.target(next_states)  # Q(s', *) from the target network
    max_q_prime = np.array(list(map(np.max, q_t.data)), dtype=np.float32)  # max_a Q(s', a)
    target = cuda.to_cpu(qv.data.copy())
    for i in range(self.replay_size):
        if episode_ends[i][0]:
            _r = np.sign(rewards[i])  # terminal transition: clipped reward only
        else:
            _r = np.sign(rewards[i]) + self.gamma * max_q_prime[i]  # r + gamma * max_a Q(s', a)
        target[i, actions[i]] = _r    # only the taken action's entry differs from Q(s, a)
    td = Variable(self.target.arr_to_gpu(target)) - qv  # TD error (nonzero only at the taken actions)
    td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)     # avoid division by zero below
    td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)  # clip TD error to [-1, 1]
    zeros = Variable(self.target.arr_to_gpu(np.zeros((self.replay_size, self.target.n_action), dtype=np.float32)))
    loss = F.mean_squared_error(td_clip, zeros)  # mean of squared clipped TD errors
    self._loss = loss.data
    self._qv = np.max(qv.data)
    return loss
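
For reference, the `td_tmp` / `td_clip` pair above is just an elementwise clip of the TD error to [-1, 1] (the error clipping used in DQN): entries already inside the interval pass through, larger ones are replaced by their sign, and the `+ 1000.0` term only keeps the division well defined for entries that are not clipped anyway. A minimal NumPy sketch (the sample values are illustrative) checking that equivalence:

import numpy as np

td = np.array([-3.0, -0.5, 0.0, 0.4, 2.5], dtype=np.float32)

td_tmp = td + 1000.0 * (np.abs(td) <= 1)  # shift in-range entries so the division below never hits zero
td_clip = td * (np.abs(td) <= 1) + td / np.abs(td_tmp) * (np.abs(td) > 1)

assert np.allclose(td_clip, np.clip(td, -1.0, 1.0))
print(td_clip)  # [-1.  -0.5  0.   0.4  1. ]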