def grad(self, inp, grads):
    dy, sm, y_idx = inp
    g_dx, = grads
    # The gradient w.r.t. dy is computed below via advanced indexing:
    # AdvancedIncSubtensor adds -1 to sm at each (row, y_idx[row])
    # position, producing sm - one_hot(y_idx). The gradient w.r.t.
    # y_idx is not defined, since y_idx holds integer class indices.
    y_idx_range = tensor.arange(y_idx.shape[0])
    # g_dy[i] = sum_j g_dx[i, j] * (sm[i, j] - one_hot(y_idx)[i, j])
    g_dy = tensor.sum(
        g_dx * subtensor.AdvancedIncSubtensor()(
            sm, tensor.fill(dy, -1), y_idx_range, y_idx), axis=1)
    # g_sm[i, j] = dy[i] * g_dx[i, j]; dimshuffle adds a broadcastable
    # class axis so dy multiplies every column of g_dx.
    g_sm = dy.dimshuffle(0, 'x') * g_dx
    g_y_idx = grad_not_implemented(self, 2, y_idx)
    return [g_dy, g_sm, g_y_idx]
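
This grad method belongs to a Theano op whose forward pass computes dx[i, j] = dy[i] * (sm[i, j] - [j == y_idx[i]]), the gradient of 1-hot cross-entropy w.r.t. the softmax input. Under that assumption, here is a minimal NumPy sketch of the two implemented gradients; the array names are illustrative only, not Theano API:

import numpy as np

rng = np.random.default_rng(0)
batch, classes = 4, 3
dy = rng.standard_normal(batch)                 # upstream gradient, one scalar per row
sm = rng.random((batch, classes))
sm /= sm.sum(axis=1, keepdims=True)             # rows behave like softmax outputs
y_idx = rng.integers(0, classes, size=batch)    # target class per row
g_dx = rng.standard_normal((batch, classes))    # gradient flowing into dx

# one_hot is what AdvancedIncSubtensor builds by adding -1 at positions
# (arange(batch), y_idx): sm + (-1) * one_hot == sm - one_hot.
one_hot = np.zeros((batch, classes))
one_hot[np.arange(batch), y_idx] = 1.0

g_dy = (g_dx * (sm - one_hot)).sum(axis=1)      # matches tensor.sum(..., axis=1)
g_sm = dy[:, None] * g_dx                       # matches dy.dimshuffle(0, 'x') * g_dx

Here dy[:, None] plays the same role as dimshuffle(0, 'x') in the Theano code: it adds a broadcastable class axis so each row's scalar dy multiplies the whole row of g_dx.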