def learn_single(self, value, value_last, last_action, reward):
    """Perform one TD-style update of the value estimate and the policy.

    Uses the one-step TD target ``gamma * value + reward`` as the target
    for ``value_last``, takes a smooth-L1 (Huber) value loss against it,
    and feeds that loss magnitude into the legacy stochastic-node
    ``reinforce`` mechanism on ``last_action`` before backpropagating
    both graphs in a single ``autograd.backward`` call.

    NOTE(review): this relies on the pre-0.4 PyTorch API
    (``Variable.reinforce``, ``tensor.data[0]``, explicit gradient seeds)
    and will not run on modern torch.

    Args:
        value: Value estimate for the current state (Variable/tensor).
        value_last: Value estimate recorded for the previous state.
        last_action: Legacy stochastic action node supporting ``.reinforce``.
        reward: Scalar reward received for the transition.
    """
    # One-step TD target: what value_last should have been in hindsight.
    # NOTE(review): the target is NOT detached, so the value loss also
    # backprops into `value` — confirm this is intentional (many TD
    # implementations use value.detach() here).
    expected_value = self.gamma * value + reward
    value_loss = F.smooth_l1_loss(expected_value, value_last)
    # Removed leftover debug print of value_loss.data that was spamming
    # stdout every training step.
    # Legacy REINFORCE hook: the scalar fed here scales the action's
    # log-prob gradient during backward.
    last_action.reinforce(value_loss.data[0])
    self.optimizer.zero_grad()
    # Backprop both graphs at once: the value loss gets an explicit unit
    # gradient seed; the stochastic action node takes None (its gradient
    # comes from the reinforce() call above). retain_graph keeps the graph
    # alive because `value` may be reused by the caller for the next step.
    final_nodes = [value_loss, last_action]
    gradients = [maybe_cuda(torch.ones(1)), None]
    autograd.backward(final_nodes, gradients, retain_graph=True)
    self.optimizer.step()
    del last_action