def forward_gpu(self, inputs):
    n = len(inputs)
    ptrs = cuda.cupy.asarray([x.data.ptr for x in inputs],
                             dtype=cuda.cupy.int64)
    ws = cuda.cupy.asarray(self.weights, dtype=cuda.cupy.float32)
    y = cuda.elementwise(
        'T x0, int64 xs, raw W ws, int32 n_xs',
        'T y',
        'float** xs_ = (float**) xs;'
        'y = 0;'
        'for (size_t j = 0; j < n_xs; ++j) {'
        ' y += xs_[j][i] * ws[j];'
        '}',
        'weighted_sum_arrays'.format(n))(inputs[0],
                                         ptrs.data.ptr,
                                         ws,
                                         len(ptrs))
    return y,
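# --- Added note: a minimal NumPy sketch of what the kernel above computes.
# The elementwise kernel forms a weighted sum over a list of equally shaped
# arrays, dereferencing raw device pointers so no stacked intermediate is
# allocated. The helper name below is illustrative, not part of the original.
import numpy


def weighted_sum_arrays_cpu(xs, weights):
    """CPU reference: y[i] = sum_j weights[j] * xs[j][i] for equally shaped xs."""
    y = numpy.zeros_like(xs[0])
    for x, w in zip(xs, weights):
        y = y + w * x
    return y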
def forward(self, inputs):
    x, t = inputs
    if chainer.is_debug():
        if not ((0 <= t).all() and
                (t < x.shape[1]).all()):
            msg = 'Each label `t` needs to satisfy `0 <= t < x.shape[1]`'
            raise ValueError(msg)
    xp = cuda.get_array_module(x)
    if xp is numpy:
        # This code is equivalent to `t.choose(x.T)`, but `numpy.choose`
        # does not work when `x.shape[1] > 32`.
        return x[six.moves.range(t.size), t],
    else:
        y = cuda.elementwise(
            'S t, raw T x',
            'T y',
            'int ind[] = {i, t}; y = x[ind];',
            'getitem_fwd'
        )(t, x)
        return y,
def forward(self, inputs):
    c_prev, x = inputs
    a, i, f, o = _extract_gates(x)
    if isinstance(x, numpy.ndarray):
        self.a = numpy.tanh(a)
        self.i = _sigmoid(i)
        self.f = _sigmoid(f)
        self.o = _sigmoid(o)
        self.c = self.a * self.i + self.f * c_prev
        h = self.o * numpy.tanh(self.c)
    else:
        self.c, h = cuda.elementwise(
            'T c_prev, T a, T i_, T f, T o', 'T c, T h',
            '''
                COMMON_ROUTINE;
                c = aa * ai + af * c_prev;
                h = ao * tanh(c);
            ''',
            'lstm_fwd', preamble=_preamble)(c_prev, a, i, f, o)
    return self.c, h
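# --- Added note: `_extract_gates`, `_sigmoid` and `_preamble` are defined
# elsewhere in the original file and are not shown here. For readability, the
# preamble that supplies COMMON_ROUTINE in Chainer's lstm.py looks roughly like
# the sketch below: a device-side sigmoid plus a macro expanding the four gate
# activations (aa, ai, af, ao) used by the kernel body.
_preamble_sketch = '''
template <typename T> __device__ T sigmoid(T x) {
    const T half = 0.5;
    return tanh(x * half) * half + half;
}
#define COMMON_ROUTINE \\
    T aa = tanh(a); \\
    T ai = sigmoid(i_); \\
    T af = sigmoid(f); \\
    T ao = sigmoid(o);
'''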
def _cu_conv_sum(y, x, n):
    # Convolutional sum
    # TODO(beam2d): Use scan computation
    rdim = x.size // (x.shape[0] * x.shape[1])
    cuda.elementwise(
        'raw T x, int32 rdim, int32 N, int32 n_', 'raw T y',
        '''
            int half_n = n_ / 2;
            int offset = i / rdim * N * rdim + i % rdim;
            float sum_part = 0;
            for (int j = 0; j < N + half_n; ++j) {
                if (j < N) {
                    sum_part += x[offset + j * rdim];
                }
                if (j >= n_) {
                    sum_part -= x[offset + (j - n_) * rdim];
                }
                if (j >= half_n) {
                    y[offset + (j - half_n) * rdim] = sum_part;
                }
            }
        ''', 'lrn_conv_sum')(x, rdim, x.shape[1], n, y,
                             size=x.shape[0] * rdim)
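# --- Added note: a NumPy sketch of the running-sum kernel above, assuming an
# odd window size n as used by local response normalization (LRN). For each
# channel j, the output accumulates x over channels [j - n//2, j + n//2]; the
# GPU kernel computes the same thing with one running sum per spatial position.
import numpy


def conv_sum_cpu(x, n):
    half_n = n // 2
    y = numpy.zeros_like(x)
    num_channels = x.shape[1]
    for j in range(num_channels):
        lo = max(0, j - half_n)
        hi = min(num_channels, j + half_n + 1)
        y[:, j] = x[:, lo:hi].sum(axis=1)
    return y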
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    coeff = gloss * self._coeff
    gx = cuda.elementwise(
        'T y, S t, raw T coeff, S n_channel, S n_unit',
        'T gx',
        '''
            const int c = (i / n_unit % n_channel);
            gx = ((t == -1) || (c != t)) ? 0 : (coeff[0] / max(y, 1e-5));
        ''',
        'softmax_crossent_bwd')(
            self.y, cupy.expand_dims(t, 1), -coeff, x.shape[1], n_unit)
    return gx, None
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    if hasattr(self, 'y'):
        y = self.y
    else:
        y = softmax_log(x, self.use_cudnn)
        cupy.exp(y, out=y)
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    coeff = gloss * self._coeff
    gx = cuda.elementwise(
        'T y, S t, raw T coeff, S n_channel, S n_unit',
        'T gx',
        '''
            const int c = (i / n_unit % n_channel);
            gx = (t == -1) ? 0 : (coeff[0] * (y - (c == t)));
        ''',
        'softmax_crossent_bwd')(
            y, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit)
    return gx, None
def sample_gpu(self, shape):
    ps = cuda.cupy.random.uniform(size=shape, dtype=numpy.float32)
    vs = cuda.elementwise(
        'T ps, raw T threshold, raw S values, int32 b',
        'int32 vs',
        '''
            T pb = ps * b;
            int index = __float2int_rd(pb);
            // fill_uniform sometimes returns 1.0, so we need to check index
            if (index >= b) {
                index = 0;
            }
            int lr = threshold[index] < pb - index;
            vs = values[index * 2 + lr];
        ''',
        'walker_alias_sample'
    )(ps, self.threshold, self.values, len(self.threshold))
    return vs
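# --- Added note: a NumPy sketch of the same draw on the CPU (Walker's alias
# method), under the assumption that `threshold` holds the per-bin acceptance
# thresholds and `values` stores each bin's own value and its alias
# interleaved, exactly as the kernel above indexes them.
import numpy


def walker_alias_sample_cpu(threshold, values, shape):
    b = len(threshold)
    pb = numpy.random.uniform(0, 1, size=shape) * b
    index = pb.astype(numpy.int32)
    index[index >= b] = 0  # uniform() can return exactly 1.0, as noted above
    left_right = (threshold[index] < pb - index).astype(numpy.int32)
    return values[index * 2 + left_right]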
def forward_gpu(self, inputs):
    x, gamma, beta = inputs
    mean = x.mean(axis=(0, 1), keepdims=True)
    var = x.var(axis=(0, 1), keepdims=True) + self.eps
    normalize = cuda.elementwise(
        'T x, T var, T mean, T gamma, T beta',
        'T std, T x_hat, T y',
        'std = sqrtf(var);'
        'x_hat = (x - mean) / std;'
        'y = gamma * x_hat + beta;',
        'normalize')
    self.std, self.x_hat, y = normalize(x, var, mean, gamma, beta)
    return y,
def __call__(self, opt):
    if cuda.available:
        kernel = cuda.elementwise(
            'T low, T high',
            'T p',
            'p = (p < low) ? low : (p > high) ? high : p',
            'weight_clip')
    for link in opt.target.links():
        # only apply to binary layers
        if getattr(link, 'cname', False):
            for param in link.params():
                p = param.data
                with cuda.get_device(p) as dev:
                    if int(dev) == -1:
                        # clip in place so the CPU branch matches the kernel
                        numpy.clip(p, self.low, self.high, out=p)
                    else:
                        kernel(self.low, self.high, p)
def forward_gpu(self, inputs):
    x, gamma, beta = inputs
    mean = x.mean(axis=(0, 1), keepdims=True)
    var = x.var(axis=(0, 1), keepdims=True) + self.eps
    normalize = cuda.elementwise(
        'T x, T var, T mean, T gamma, T beta',
        'T std, T x_hat, T y',
        'std = sqrtf(var);'
        'x_hat = (x - mean) / std;'
        'y = gamma * x_hat + beta;',
        'normalize')
    self.std, self.x_hat, y = normalize(x, var, mean, gamma, beta)
    return y,
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    if hasattr(self, 'y'):
        y = self.y
    else:
        y = x
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    coeff = gloss * self._coeff
    gx = cuda.elementwise(
        'T y, S t, raw T coeff, S n_channel, S n_unit',
        'T gx',
        '''
            const int c = (i / n_unit % n_channel);
            gx = (t == -1 || c != t) ? 0 : (coeff[0] * -1.0 / y);
        ''',
        'crossent_bwd')(
            y, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit)
    return gx, None
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t, w = inputs
    if hasattr(self, 'y'):
        y = self.y
    else:
        y = softmax_log(x, self.use_cudnn)
        cupy.exp(y, out=y)
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    coeff = gloss * self._coeff * w
    gx = cuda.elementwise(
        'T y, S t, raw T coeff, S n_channel, S n_unit',
        'T gx',
        '''
            const int c = (i / n_unit % n_channel);
            gx = (t == -1) ? 0 : (coeff[0] * (y - (c == t)));
        ''',
        'softmax_crossent_bwd')(
            y, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit)
    return gx, None, None
def __call__(self, opt):
    if cuda.available:
        kernel = cuda.elementwise(
            'T low, T high',
            'T p',
            'p = (p < low) ? low : (p > high) ? high : p',
            'weight_clip')
    for param in opt.target.params():
        p = param.data
        with cuda.get_device(p) as dev:
            if int(dev) == -1:
                # clip in place so the CPU branch matches the kernel
                numpy.clip(p, self.low, self.high, out=p)
            else:
                kernel(self.low, self.high, p)
def forward_gpu(self, x):
    y = cuda.elementwise(
        'T x', 'T y',
        'y = x >= 0 ? 1 : -1', 'bst_fwd')(
            x[0])
    return y,
def backward_gpu(self, x, gy):
    gx = cuda.elementwise(
        'T x, T gy', 'T gx',
        'gx = abs(x) > 1 ? 0 : gy', 'bst_bwd')(
            x[0], gy[0])
    return gx,
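# --- Added note: NumPy sketches of the two BST kernels above, i.e. a sign
# activation with a straight-through style gradient. The forward pass
# binarizes to {-1, +1}; the backward pass lets the gradient through only
# where |x| <= 1. Function names are illustrative.
import numpy


def bst_forward_cpu(x):
    return numpy.where(x >= 0, 1, -1).astype(x.dtype)


def bst_backward_cpu(x, gy):
    return numpy.where(numpy.abs(x) > 1, 0, gy).astype(gy.dtype)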
def _kern():
    return cuda.elementwise(
        'T x', 'T y',
        'y = x >= 0 ? 1 : -1',
        'binarize')
def update_core_gpu(self, param):
    grad = param.grad
    if grad is None:
        return
    cuda.elementwise(
        'T grad, T lr, T alpha, T eps',
        'T param, T ms',
        '''ms = alpha * ms + (1 - alpha) * grad * grad;
           param -= lr * grad / sqrt(ms + eps);''',
        'rmsprop')(grad, self.hyperparam.lr, self.hyperparam.alpha,
                   self.hyperparam.eps, param.data, self.state['ms'])
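# --- Added note: the same RMSprop step written with NumPy for reference.
# `ms` is an exponential moving average of squared gradients, and the update
# divides by its square root; this variant keeps `eps` inside the sqrt,
# matching the kernel above (a later snippet uses sqrt(ms) + eps instead).
import numpy


def rmsprop_update_cpu(param, grad, ms, lr, alpha, eps):
    ms *= alpha
    ms += (1 - alpha) * grad * grad
    param -= lr * grad / numpy.sqrt(ms + eps)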
def __call__(self, opt):
    if cuda.available:
        kernel = cuda.elementwise(
            'T p, T decay', 'T g', 'g += decay * p', 'weight_decay')
    rate = self.rate
    for name, param in opt.target.namedparams():
        if name == 'b' or name.endswith('/b'):
            continue
        p, g = param.data, param.grad
        with cuda.get_device(p) as dev:
            if int(dev) == -1:
                g += rate * p
            else:
                kernel(p, rate, g)
def forward_gpu(self, inputs):
    n = len(inputs)
    ptrs = cuda.cupy.asarray([x.data.ptr for x in inputs],
                             dtype=cuda.cupy.int64)
    y = cuda.elementwise(
        'T x0, int64 xs, int32 n_xs',
        'T y',
        'float** xs_ = (float**) xs;'
        'y = 0;'
        'for (size_t j = 0; j < n_xs; ++j) {'
        ' y += xs_[j][i];'
        '}',
        'sum_arrays'.format(n))(inputs[0], ptrs.data.ptr, len(ptrs))
    return y,
def backward(self, inputs, grad_outputs):
    x, gamma, beta = inputs[:3]
    gy = grad_outputs[0]
    head_ndim = gamma.ndim + 1
    expander = (None, Ellipsis) + (None,) * (x.ndim - head_ndim)
    m = gamma.dtype.type(x.size // gamma.size)
    axis = (2, 3)
    gamma_beta_axis = (0, 2, 3)
    mean_var_expander = (Ellipsis, None, None)
    xp = cuda.get_array_module(x)
    gbeta = gy.sum(axis=gamma_beta_axis)
    ggamma = (gy * self.x_hat).sum(axis=gamma_beta_axis)
    if xp is numpy:
        gx = (gamma / self.std)[mean_var_expander] * (
            gy - (self.x_hat * ggamma[mean_var_expander] +
                  gbeta[mean_var_expander]) / m)
    else:
        inv_m = numpy.float32(1) / m
        gx = cuda.elementwise(
            'T gy, T x_hat, T gamma, T std, T ggamma, T gbeta, T inv_m',
            'T gx',
            'gx = (gamma / std) * (gy - (x_hat * ggamma + gbeta) * inv_m)',
            'bn_bwd')(gy, self.x_hat, gamma[expander],
                      self.std[mean_var_expander], ggamma[mean_var_expander],
                      gbeta[mean_var_expander], inv_m)
    return gx, ggamma, gbeta
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    if hasattr(self, 'y'):
        y = self.y
    else:
        y = log_softmax._log_softmax(x, self.use_cudnn)
        cupy.exp(y, out=y)
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    coeff = gloss * self._coeff
    if self.class_weight is None:
        gx = cuda.elementwise(
            'T y, S t, raw T coeff, S n_channel, S n_unit',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = (t == -1) ? 0 : (coeff[0] * (y - (c == t)));
            ''',
            'softmax_crossent_bwd')(
                y, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, raw T coeff, S n_channel, S n_unit',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == -1 ? 0 : coeff[0] * (y - (c == t)) * w[t];
            ''',
            'softmax_crossent_bwd')(
                y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                x.shape[1], n_unit)
    return gx, None
def update_one_gpu(self, param, state):
    cuda.elementwise(
        'T grad, T lr, T alpha, T eps',
        'T param, T ms',
        '''ms = alpha * ms + (1 - alpha) * grad * grad;
           param -= lr * grad / sqrt(ms + eps);''',
        'rmsprop')(param.grad, self.lr, self.alpha, self.eps,
                   param.data, state['ms'])
def __call__(self, opt):
    if cuda.available:
        kernel = cuda.elementwise(
            'T p, T decay', 'T g', 'g += decay * p', 'weight_decay')
    rate = self.rate
    for name, param in opt.target.namedparams():
        if name == 'b' or name.endswith('/b'):
            continue
        p, g = param.data, param.grad
        with cuda.get_device(p) as dev:
            if int(dev) == -1:
                g += rate * p
            else:
                kernel(p, rate, g)
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    if hasattr(self, 'y'):
        y = self.y
    else:
        y = log_softmax._log_softmax(x)
        cupy.exp(y, out=y)
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    if self.reduce == 'mean':
        coeff = gloss * self._coeff
    else:
        coeff = gloss[:, None, ...]
    if self.class_weight is None:
        gx = cuda.elementwise(
            'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == ignore_label ? 0 : coeff * (y - (c == t));
            ''',
            'softmax_crossent_bwd')(
                y, cupy.expand_dims(t, 1), coeff, x.shape[1],
                n_unit, self.ignore_label)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, T coeff, S n_channel, S n_unit, '
            'S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t];
            ''',
            'softmax_crossent_weight_bwd')(
                y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                x.shape[1], n_unit, self.ignore_label)
    return gx, None
def backward_gpu(self, inputs, grad_outputs):
    x, t, W = inputs
    gloss, = grad_outputs
    n_in = x.shape[1]
    gx = cuda.cupy.zeros_like(x)
    gW = cuda.cupy.zeros_like(W)
    cuda.elementwise(
        '''T wxy, raw T x, raw T w, raw int32 ts, raw int32 paths,
           raw T codes, raw int32 begins, raw T gloss,
           int32 c, int32 max_length''',
        'raw T gx, raw T gw',
        '''
            int ind = i / max_length;
            int offset = i - ind * max_length;
            int t = ts[ind];
            int begin = begins[t];
            int length = begins[t + 1] - begins[t];
            if (offset < length) {
                int p = begin + offset;
                int node = paths[p];
                T code = codes[p];
                T g = -gloss[0] * code / (1.0 + exp(wxy));
                for (int j = 0; j < c; ++j) {
                    int w_ind[] = {node, j};
                    int x_ind[] = {ind, j};
                    atomicAdd(&gx[x_ind], g * w[w_ind]);
                    atomicAdd(&gw[w_ind], g * x[x_ind]);
                }
            }
        ''',
        'binary_hierarchical_softmax_bwd'
    )(self.wxy, x, W, t, self.paths, self.codes, self.begins, gloss, n_in,
      self.max_length, gx, gW)
    return gx, None, gW
def update_one_gpu(self, param, state):
    cuda.elementwise(
        'T grad, T lr, T alpha, T eps',
        'T param, T ms',
        '''ms = alpha * ms + (1 - alpha) * grad * grad;
           param -= lr * grad / (sqrt(ms) + eps);''',
        'rmsprop')(param.grad, self.lr, self.alpha, self.eps,
                   param.data, state['ms'])
def update_one_gpu(self, param, state):
    cuda.elementwise(
        'T grad, T lr, T alpha, T momentum, T eps',
        'T param, T avg_n, T avg_g, T delta',
        '''avg_n = alpha * avg_n + (1 - alpha) * grad * grad;
           avg_g = alpha * avg_g + (1 - alpha) * grad;
           delta = delta * momentum -
               lr * grad * rsqrt(avg_n - avg_g * avg_g + eps);
           param += delta;''',
        'rmsprop_graves')(
            param.grad, self.lr, self.alpha, self.momentum, self.eps,
            param.data, state['n'], state['g'], state['delta'])
def update_one_gpu(self, param, state):
    cuda.elementwise(
        'T grad, T lr, T momentum',
        'T param, T v',
        '''v = v * momentum - lr * grad;
           param += momentum * momentum * v - (1 + momentum) * lr * grad;''',
        'nesterov_ag')(param.grad, self.lr, self.momentum,
                       param.data, state['v'])
def update_one_gpu(self, param, state):
    cuda.elementwise(
        'T grad, T lr, T momentum',
        'T param, T v',
        '''v = momentum * v - lr * grad;
           param += v;''',
        'momentum_sgd')(param.grad, self.lr, self.momentum,
                        param.data, state['v'])
def update_one_gpu(self, param, state):
    cuda.elementwise(
        'T grad, T lr, T eps',
        'T param, T h',
        '''h += grad * grad;
           param -= lr * grad / (sqrt(h) + eps);''',
        'adagrad')(param.grad, self.lr, self.eps,
                   param.data, state['h'])