def backward_gpu(self, inputs, grad_outputs):
    """Compute the gradients w.r.t. ``x`` and ``W`` with one elementwise kernel.

    Args:
        inputs: Sequence unpacked as ``(x, t, W)``. ``x`` and ``W`` are
            indexed with two indices inside the kernel, and ``t`` is read
            as int32 labels (one per row of ``x`` -- TODO confirm with the
            caller).
        grad_outputs: One-element sequence holding the upstream gradient
            of the loss; the kernel only reads its element ``0``.

    Returns:
        A 3-tuple ``(gx, None, gW)``: gradient buffers shaped like ``x``
        and ``W``, with ``None`` in the slot corresponding to ``t``.

    Note:
        Reads ``self.wxy``, ``self.paths``, ``self.codes``, ``self.begins``
        and ``self.max_length``; presumably these are prepared before this
        method runs (e.g. by the forward pass) -- verify against the class.
    """
    x, t, W = inputs
    (gloss,) = grad_outputs

    # Size of the second axis of x; used in the kernel as the bound of
    # the inner loop over columns of both x and W.
    n_features = x.shape[1]

    # Accumulators must start at zero: the kernel only ever adds to them.
    grad_x = cuda.cupy.zeros_like(x)
    grad_W = cuda.cupy.zeros_like(W)

    # NOTE: the kernel sources below are assembled from one-line literals
    # so the resulting strings stay exactly as they were while the Python
    # code can be indented normally.
    in_params = (
        'T wxy, raw T x, raw T w, raw int32 ts, raw int32 paths,\n'
        'raw T codes, raw int32 begins, raw T gloss,\n'
        'int32 c, int32 max_length'
    )
    out_params = 'raw T gx, raw T gw'
    operation = (
        '\n'
        # Split the flat index into (row, position inside the row).
        'int ind = i / max_length;\n'
        'int offset = i - ind * max_length;\n'
        # Label of this row and the span of its entries in paths/codes.
        'int t = ts[ind];\n'
        'int begin = begins[t];\n'
        'int length = begins[t + 1] - begins[t];\n'
        # Positions past the end of the span contribute nothing.
        'if (offset < length) {\n'
        'int p = begin + offset;\n'
        'int node = paths[p];\n'
        'T code = codes[p];\n'
        # Scalar factor shared by every column of this (row, node) pair.
        'T g = -gloss[0] * code / (1.0 + exp(wxy));\n'
        'for (int j = 0; j < c; ++j) {\n'
        'int w_ind[] = {node, j};\n'
        'int x_ind[] = {ind, j};\n'
        # atomicAdd: presumably several kernel elements can target the
        # same gx / gw entry -- confirm.
        'atomicAdd(&gx[x_ind], g * w[w_ind]);\n'
        'atomicAdd(&gw[w_ind], g * x[x_ind]);\n'
        '}\n'
        '}\n'
    )
    bwd_kernel = cuda.elementwise(
        in_params,
        out_params,
        operation,
        'binary_hierarchical_softmax_bwd',
    )

    # self.wxy is the only non-raw array argument, so it presumably
    # determines the kernel's index space ``i`` -- see the CuPy docs.
    bwd_kernel(
        self.wxy, x, W, t, self.paths, self.codes, self.begins, gloss,
        n_features, self.max_length, grad_x, grad_W,
    )

    # No gradient is produced for the labels ``t``.
    return grad_x, None, grad_W
# [web page navigation residue] Comment list
# [web page navigation residue] Table of contents