def __init__(self, use_cudnn=True, normalize=True, cache_score=True,
             class_weight=None, ignore_label=-1, reduce='mean'):
    self.use_cudnn = use_cudnn
    self.normalize = normalize
    self.cache_score = cache_score
    self.class_weight = class_weight
    if class_weight is not None:
        if self.class_weight.ndim != 1:
            raise ValueError('class_weight.ndim should be 1')
        if self.class_weight.dtype.kind != 'f':
            raise ValueError('The dtype of class_weight should be \'f\'')
        if isinstance(self.class_weight, chainer.Variable):
            raise ValueError('class_weight should be a numpy.ndarray or '
                             'cupy.ndarray, not a chainer.Variable')
    self.ignore_label = ignore_label
    if reduce not in ('mean', 'no'):
        raise ValueError(
            "only 'mean' and 'no' are valid for 'reduce', but '%s' is "
            'given' % reduce)
    self.reduce = reduce
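# Minimal usage sketch (assumption: this __init__ belongs to a Chainer-style
# SoftmaxCrossEntropy Function object; the exact class name is illustrative):
#
#     loss_fun = SoftmaxCrossEntropy(normalize=True, ignore_label=-1,
#                                    reduce='mean')
#     loss = loss_fun(x, t)  # x: float scores (N, C, ...), t: int32 labels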
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    # x is assumed to already hold probabilities; a small epsilon avoids log(0).
    log_y = cupy.log(x + 1e-5)
    self.y = x
    if self.debug:
        ipdb.set_trace()
    if getattr(self, 'normalize', True):
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)
    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
        't == -1 ? T(0) : log_y[_j * n_channel + t]',
        'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
    )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
    return ret,
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)
    log_y = softmax_log(x, self.use_cudnn)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if getattr(self, 'normalize', True):
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)
    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
        't == -1 ? T(0) : log_y[_j * n_channel + t]',
        'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
    )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
    return ret,
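# What the fused cuda.reduce kernel above computes, sketched in NumPy terms for
# the two-dimensional case (an illustrative assumption, not part of the source):
#
#     mask = (t != -1)
#     loss = -(log_y[np.arange(len(t)), t] * mask).sum() * coeff
#
# i.e. the (optionally normalized) negative log-likelihood of the target classes,
# with ignored labels contributing zero to the sum.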
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    # x is assumed to already hold probabilities; a small epsilon avoids log(0).
    log_y = cupy.log(x + 1e-5)
    self.y = x
    if self.debug:
        ipdb.set_trace()
    if getattr(self, 'normalize', True):
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)
    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        'S t, raw T log_y, int32 n_channel, raw T coeff, raw T weights',
        'T out',
        't == -1 ? T(0) : log_y[_j * n_channel + t] * weights[t]',
        'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
    )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff,
      self.weights.reduced_view())
    return ret,
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)
    log_y = cupy.log(x)
    if self.cache_score:
        self.y = x
    if getattr(self, 'normalize', True):
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)
    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
        't == -1 ? T(0) : log_y[_j * n_channel + t]',
        'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
    )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
    return ret,
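# Note (assumption about this variant's intent): unlike the snippets above, it
# takes log(x) without an epsilon, so x must contain strictly positive
# probabilities or zero entries will produce -inf inside the reduction.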
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t, w = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)
    log_y = softmax_log(x, self.use_cudnn)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if getattr(self, 'normalize', True):
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)
    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        'S t, T w, raw T log_y, int32 n_channel, raw T coeff', 'T out',
        't == -1 ? T(0) : log_y[_j * n_channel + t] * w',
        'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
    )(t, w, log_y.reduced_view(), log_y.shape[-1], self._coeff)
    return ret,
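# Per-example weighting, sketched for the two-dimensional case (illustrative
# assumption): each example's negative log-likelihood is scaled by its own
# weight w before the normalized sum is taken:
#
#     loss = -(w * log_y[np.arange(len(t)), t] * (t != -1)).sum() * coeff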
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)
    log_y = log_softmax._log_softmax(x, self.use_cudnn)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if self.class_weight is not None:
        # Reshape the length-C weight vector so it broadcasts along the channel axis.
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(
            self.class_weight.reshape(shape), x.shape)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)
    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
        't == -1 ? T(0) : log_y[_j * n_channel + t]',
        'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
    )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
    return ret,
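# Broadcast example (sketch): for x of shape (N, C, H, W), x.ndim == 4, so the
# comprehension above yields shape == [1, -1, 1, 1]; reshaping the length-C
# class_weight to that shape lets it multiply log_y along axis 1.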
def forward_cpu(self, inputs):
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)
    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = np.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= _broadcast_to(self.class_weight.reshape(shape), x.shape)
    log_yd = np.rollaxis(log_y, 1)
    log_yd = log_yd.reshape(len(log_yd), -1)
    log_p = log_yd[np.maximum(t.ravel(), 0), np.arange(t.size)]
    log_p *= (t.ravel() != self.ignore_label)
    if self.reduce == 'mean':
        # deal with the case where the SoftmaxCrossEntropy is
        # unpickled from the old version
        if self.normalize:
            count = (t != self.ignore_label).sum()
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)
        y = log_p.sum(keepdims=True) * (-self._coeff)
        return y.reshape(()),
    else:
        return -log_p.reshape(t.shape),
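# With reduce == 'mean' the function returns a 0-dimensional scalar loss; with
# reduce == 'no' it returns one loss value per target element, shaped like t,
# which the caller can weight or mask itself.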
def backward_cpu(self, inputs, grad_outputs):
    x, t = inputs
    gloss = grad_outputs[0]
    if hasattr(self, 'y'):
        y = self.y.copy()
    else:
        y = log_softmax._log_softmax(x)
        np.exp(y, out=y)
    if y.ndim == 2:
        gx = y
        gx[np.arange(len(t)), np.maximum(t, 0)] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c[np.arange(len(t)), np.maximum(t, 0)]
            gx *= _broadcast_to(np.expand_dims(c, 1), gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1))
    else:
        n_unit = t.size // len(t)
        gx = y.reshape(y.shape[0], y.shape[1], -1)
        fst_index = np.arange(t.size) // n_unit
        trd_index = np.arange(t.size) % n_unit
        gx[fst_index, np.maximum(t.ravel(), 0), trd_index] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c.reshape(gx.shape)
            c = c[fst_index, np.maximum(t.ravel(), 0), trd_index]
            c = c.reshape(y.shape[0], 1, -1)
            gx *= _broadcast_to(c, gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
        gx = gx.reshape(y.shape)
    if self.reduce == 'mean':
        gx *= gloss * self._coeff
    else:
        gx *= gloss[:, None]
    return gx, None
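# Gradient sketch: for softmax cross entropy the derivative w.r.t. the logits is
# softmax(x) - onehot(t); the in-place "-= 1" updates above build exactly that,
# which is then scaled by class weights, zeroed for ignore_label positions, and
# multiplied by the incoming gradient (and by 1/count when reduce == 'mean').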
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    if hasattr(self, 'y'):
        y = self.y
    else:
        y = log_softmax._log_softmax(x)
        cupy.exp(y, out=y)
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    if self.reduce == 'mean':
        coeff = gloss * self._coeff
    else:
        coeff = gloss[:, None, ...]
    if self.class_weight is None:
        gx = cuda.elementwise(
            'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == ignore_label ? 0 : coeff * (y - (c == t));
            ''',
            'softmax_crossent_bwd')(
                y, cupy.expand_dims(t, 1), coeff, x.shape[1],
                n_unit, self.ignore_label)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, T coeff, S n_channel, S n_unit, '
            'S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t];
            ''',
            'softmax_crossent_weight_bwd')(
                y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                x.shape[1], n_unit, self.ignore_label)
    return gx, None
def backward(self, inputs, grad_outputs):
    e1 = array.as_mat(inputs[0])
    e2 = array.as_mat(inputs[1])
    W = inputs[2]
    gy = grad_outputs[0]
    xp = cuda.get_array_module(*inputs)
    if xp is numpy:
        gW = numpy.einsum('ij,ik,il->jkl', e1, e2, gy)
        ge1 = numpy.einsum('ik,jkl,il->ij', e2, W, gy)
        ge2 = numpy.einsum('ij,jkl,il->ik', e1, W, gy)
    else:
        kern = cuda.reduce('T in0, T in1, T in2', 'T out',
                           'in0 * in1 * in2', 'a + b', 'out = a', 0,
                           'bilinear_product')
        e1_b = e1[:, :, None, None]  # ij
        e2_b = e2[:, None, :, None]  # ik
        gy_b = gy[:, None, None, :]  # il
        W_b = W[None, :, :, :]  # jkl
        gW = kern(e1_b, e2_b, gy_b, axis=0)  # 'ij,ik,il->jkl'
        ge1 = kern(e2_b, W_b, gy_b, axis=(2, 3))  # 'ik,jkl,il->ij'
        ge2 = kern(e1_b, W_b, gy_b, axis=(1, 3))  # 'ij,jkl,il->ik'
    ret = ge1.reshape(inputs[0].shape), ge2.reshape(inputs[1].shape), gW
    if len(inputs) == 6:
        V1, V2, b = inputs[3:]
        gV1 = e1.T.dot(gy)
        gV2 = e2.T.dot(gy)
        gb = gy.sum(0)
        ge1 += gy.dot(V1.T)
        ge2 += gy.dot(V2.T)
        ret += gV1, gV2, gb
    return ret
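# On GPU the broadcast views plus the 'bilinear_product' reduction reproduce the
# CPU einsum contractions above, e.g.
#     gW[j, k, l] = sum_i e1[i, j] * e2[i, k] * gy[i, l],
# with the summed-out axes selected through the `axis` argument of the reduce kernel.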
def forward_cpu(self, inputs):
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)
    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = np.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in xrange(x.ndim)]
        log_y *= _broadcast_to(self.class_weight.reshape(shape), x.shape)
    log_yd = np.rollaxis(log_y, 1)
    log_yd = log_yd.reshape(len(log_yd), -1)
    log_p = log_yd[np.maximum(t.ravel(), 0), np.arange(t.size)]
    log_p *= (t.ravel() != self.ignore_label)
    if self.reduce == 'mean':
        # deal with the case where the SoftmaxCrossEntropy is
        # unpickled from the old version
        if self.normalize:
            count = (t != self.ignore_label).sum()
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)
        y = log_p.sum(keepdims=True) * (-self._coeff)
        return y.reshape(()),
    else:
        return -log_p.reshape(t.shape),
def backward_cpu(self, inputs, grad_outputs):
    x, t = inputs
    gloss = grad_outputs[0]
    if hasattr(self, 'y'):
        y = self.y.copy()
    else:
        y = log_softmax._log_softmax(x)
        np.exp(y, out=y)
    if y.ndim == 2:
        gx = y
        gx[np.arange(len(t)), np.maximum(t, 0)] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in xrange(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c[np.arange(len(t)), np.maximum(t, 0)]
            gx *= _broadcast_to(np.expand_dims(c, 1), gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1))
    else:
        n_unit = t.size // len(t)
        gx = y.reshape(y.shape[0], y.shape[1], -1)
        fst_index = np.arange(t.size) // n_unit
        trd_index = np.arange(t.size) % n_unit
        gx[fst_index, np.maximum(t.ravel(), 0), trd_index] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in xrange(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c.reshape(gx.shape)
            c = c[fst_index, np.maximum(t.ravel(), 0), trd_index]
            c = c.reshape(y.shape[0], 1, -1)
            gx *= _broadcast_to(c, gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
        gx = gx.reshape(y.shape)
    if self.reduce == 'mean':
        gx *= gloss * self._coeff
    else:
        gx *= gloss[:, None]
    return gx, None
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)
    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(
            self.class_weight.reshape(shape), x.shape)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)
    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    if self.reduce == 'mean':
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff, '
            'S ignore_label',
            'T out',
            't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1],
          self._coeff, self.ignore_label)
    else:
        ret = cuda.elementwise(
            'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
            '''
            if (t == ignore) {
              out = 0;
            } else {
              out = -log_y[i * n_channel + t];
            }
            ''',
            'softmax_crossent_no_reduce_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
        ret = ret.reshape(t.shape)
    return ret,
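# Unlike the earlier kernels that hard-code `t == -1`, this variant passes
# self.ignore_label into both kernels, so any ignore index works; the 'no'
# branch returns an unreduced loss with the same shape as t.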
def forward_gpu(self, inputs):
    x, t, W = inputs
    max_length = cuda.reduce(
        'T t, raw T begins', 'T out', 'begins[t + 1] - begins[t]',
        'max(a, b)', 'out = a', '0',
        'binary_hierarchical_softmax_max_length')(t, self.begins)
    max_length = cuda.to_cpu(max_length)[()]
    length = max_length * x.shape[0]
    ls = cuda.cupy.empty((length,), dtype=numpy.float32)
    n_in = x.shape[1]
    wxy = cuda.cupy.empty_like(ls)
    cuda.elementwise(
        '''raw T x, raw T w, raw int32 ts, raw int32 paths,
        raw T codes, raw int32 begins, int32 c, int32 max_length''',
        'T ls, T wxy',
        '''
        int ind = i / max_length;
        int offset = i - ind * max_length;
        int t = ts[ind];
        int begin = begins[t];
        int length = begins[t + 1] - begins[t];
        if (offset < length) {
          int p = begin + offset;
          int node = paths[p];
          T wx = 0;
          for (int j = 0; j < c; ++j) {
            int w_ind[] = {node, j};
            int x_ind[] = {ind, j};
            wx += w[w_ind] * x[x_ind];
          }
          wxy = wx * codes[p];
          ls = log(1 + exp(-wxy));
        } else {
          ls = 0;
        }
        ''',
        'binary_hierarchical_softmax_forward'
    )(x, W, t, self.paths, self.codes, self.begins, n_in, max_length, ls,
      wxy)
    self.max_length = max_length
    self.wxy = wxy
    return ls.sum(),
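# What the kernel computes (sketch): for every (example, path-node) pair it
# accumulates wx = dot(W[node], x[example]), multiplies by the node's +/-1 code,
# and stores log(1 + exp(-code * wx)), i.e. the logistic loss along the target
# word's path; padded slots beyond the path length contribute zero, and the
# total loss is the sum over all slots.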
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)
    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in xrange(x.ndim)]
        log_y *= cupy.broadcast_to(
            self.class_weight.reshape(shape), x.shape)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)
    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    if self.reduce == 'mean':
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff, '
            'S ignore_label',
            'T out',
            't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1],
          self._coeff, self.ignore_label)
    else:
        ret = cuda.elementwise(
            'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
            '''
            if (t == ignore) {
              out = 0;
            } else {
              out = -log_y[i * n_channel + t];
            }
            ''',
            'softmax_crossent_no_reduce_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
        ret = ret.reshape(t.shape)
    return ret,