def listmle(x, t):
"""
The ListMLE loss as in Xia et al. (2008), Listwise Approach to Learning to
Rank - Theory and Algorithm.
:param x: The activation of the previous layer
:param t: The target labels
:return: The loss
"""
# Sort the activations into the ground-truth permutation, i.e. by decreasing relevance label
xp = cuda.get_array_module(t)
t_hat = t[:, 0]
x_hat = x[xp.flip(xp.argsort(t_hat), axis=0)]
# Compute MLE loss
final = logcumsumexp(x_hat)
return F.sum(final - x_hat)
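For a single list this is the negative log-likelihood of the Plackett-Luce model: with x_hat holding the activations in decreasing-relevance order, the loss is sum_i (log sum_{k>=i} exp(x_hat[k]) - x_hat[i]), which is exactly F.sum(final - x_hat), assuming the logcumsumexp helper (not defined in this snippet) accumulates the log-sum-exp from each position to the end of the list.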
def bound_by_tanh(x, low, high):
"""Bound a given value into [low, high] by tanh.
Args:
x (chainer.Variable): value to bound
low (numpy.ndarray): lower bound
high (numpy.ndarray): upper bound
Returns: chainer.Variable
"""
assert isinstance(x, chainer.Variable)
assert low is not None
assert high is not None
xp = cuda.get_array_module(x.data)
x_scale = (high - low) / 2
x_scale = xp.expand_dims(xp.asarray(x_scale), axis=0)
x_mean = (high + low) / 2
x_mean = xp.expand_dims(xp.asarray(x_mean), axis=0)
return F.tanh(x) * x_scale + x_mean
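A minimal usage sketch of the function above (the values are made up for illustration and assume CPU/NumPy inputs):

import numpy as np
import chainer

raw = chainer.Variable(np.array([[-3.0, 0.0, 3.0]], dtype=np.float32))
low = np.array([-1.0, -2.0, 0.0], dtype=np.float32)
high = np.array([1.0, 2.0, 1.0], dtype=np.float32)
bounded = bound_by_tanh(raw, low, high)
# every column of bounded.data now lies strictly inside its (low, high) interval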
def check_forward(self, x_data):
xp = cuda.get_array_module(x_data)
y = maximum_entropy_mellowmax(x_data)
self.assertEqual(y.data.dtype, self.dtype)
print('y', y.data)
# Outputs must be positive
xp.testing.assert_array_less(xp.zeros_like(y.data), y.data)
# Sums must be ones
sums = xp.sum(y.data, axis=1)
testing.assert_allclose(sums, xp.ones_like(sums))
# Expectations must be equal to mellowmax's outputs
testing.assert_allclose(
xp.sum(y.data * x_data, axis=1), mellowmax(x_data, axis=1).data)
def bbox_transform(ex_rois, gt_rois):
xp = get_array_module(ex_rois)
ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
targets_dw = xp.log(gt_widths / ex_widths)
targets_dh = xp.log(gt_heights / ex_heights)
targets = xp.vstack(
(targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
return targets
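A quick worked example (values made up): a 10x10 anchor whose ground-truth box has the same size but is shifted by 2 pixels gives center offsets of 0.2 and zero log scale ratios.

import numpy as np

ex_rois = np.array([[0., 0., 9., 9.]], dtype=np.float32)    # 10x10 anchor
gt_rois = np.array([[2., 2., 11., 11.]], dtype=np.float32)  # same size, shifted by +2
print(bbox_transform(ex_rois, gt_rois))  # [[0.2 0.2 0.  0. ]]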
def forward_gpu(self, inputs):
w, = inputs
xp = cuda.get_array_module(w)
och, ich, _, ny, nx = w.shape
nto, nti = self.T.shape[:2]
rotated_w = xp.empty((och, nto, ich, nti, ny, nx), dtype=w.dtype)
index_group_func_kernel(
input=w,
T=self.T,
U=self.U,
V=self.V,
output=rotated_w
)
return rotated_w,
def backward_gpu(self, inputs, grad_output):
w, = inputs
grad_rotated_w, = grad_output
xp = cuda.get_array_module(w)
# Gradient must be initialized with zeros,
# because the kernel accumulates the gradient instead of overwriting it
grad_w = xp.zeros_like(w)
grad_index_group_func_kernel(
grad_output=grad_rotated_w,
T=self.T,
U=self.U,
V=self.V,
grad_input=grad_w
)
return grad_w,
def nearest_neighbor_patch(x, patch, patch_norm):
assert patch.data.shape[0] == 1, 'mini batch size of patch must be 1'
assert patch_norm.data.shape[0] == 1, 'mini batch size of patch_norm must be 1'
xp = cuda.get_array_module(x.data)
z = x.data
b, ch, h, w = z.shape
z = z.transpose((1, 0, 2, 3)).reshape((ch, -1))
norm = xp.expand_dims(xp.sum(z ** 2, axis=0) ** 0.5, 0)
z = z / xp.broadcast_to(norm, z.shape)
p = patch.data
p_norm = patch_norm.data
p = p.reshape((ch, -1))
p_norm = p_norm.reshape((1, -1))
p_normalized = p / xp.broadcast_to(p_norm, p.shape)
correlation = z.T.dot(p_normalized)
min_index = xp.argmax(correlation, axis=1)
nearest_neighbor = p.take(min_index, axis=1).reshape((ch, b, h, w)).transpose((1, 0, 2, 3))
return Variable(nearest_neighbor)
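A shape-only sketch of calling the function above (random data, sizes made up); patch_norm is the per-column L2 norm of the flattened patch, which is what the division inside expects:

import numpy as np
from chainer import Variable

x = Variable(np.random.rand(2, 3, 4, 4).astype(np.float32))
patch = Variable(np.random.rand(1, 3, 5, 5).astype(np.float32))
p_norm = np.sqrt((patch.data.reshape(3, -1) ** 2).sum(axis=0, keepdims=True))
patch_norm = Variable(p_norm.reshape(1, 1, 5, 5))
nn = nearest_neighbor_patch(x, patch, patch_norm)
# nn.data has the same shape as x.data: (2, 3, 4, 4)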
def update_stats(self, iteration, batchsizes, augmentation=None):
# stack = None
for i in range(iteration):
batch, bucket_idx, piece_id = self.reader.sample_minibatch(batchsizes)
audio_features, sentences, max_feature_length, max_sentence_length = self.extract_batch_features(batch, augmentation=augmentation)
x_batch, x_length_batch, t_batch, t_length_batch, bigram_batch = self.processor.features_to_minibatch(audio_features, sentences, max_feature_length, max_sentence_length, self.token_ids, self.id_blank)
# xp = cuda.get_array_module(x_batch)
for x, length in zip(x_batch, x_length_batch):
# if stack is None:
# stack = x[..., :length]
# else:
# stack = xp.concatenate((stack, x[..., :length]), axis=2)
self._update_stats_recursively(x[..., :length])
# x_mean, x_std = self.get_mean_and_std()
# true_mean = np.mean(stack, axis=2)
# true_std = np.std(stack, axis=2)
# print(xp.mean(abs(true_mean - x_mean), axis=(0, 2)))
# print(xp.mean(abs(true_std - x_std), axis=(0, 2)))
def _backward_sum(gy, in_shape):
xp = cuda.get_array_module(gy)
sum_axis = (1, 2)
keepdims = True
if not (len(in_shape) == 0 or sum_axis is None or keepdims):
actual_axis = []
for axis in sum_axis:
if axis < 0:
axis += len(in_shape)
actual_axis.append(axis)
for axis in sorted(actual_axis):
gy = xp.expand_dims(gy, axis=axis)
if hasattr(xp, 'broadcast_to'):
gx = xp.broadcast_to(gy, in_shape)
else:
# NumPy 1.9 does not support broadcast_to.
dummy_x = xp.empty(in_shape, 'b')
gx, _ = xp.broadcast_arrays(gy, dummy_x)
return gx
def forward(self, xs, eps=1e-6):
self.retain_inputs(())
self.eps = eps
x = xs[0]
self.x_shape = x.shape
self.x_dtype = x.dtype
xp = cuda.get_array_module(x)
size = x.shape[1] * x.shape[2]
self.x_size = size
mean = xp.mean(x, axis=(1, 2), keepdims=True)
self.broadcast_shape = mean.shape
self.diff = x - mean
std = xp.sqrt(xp.sum(self.diff ** 2, axis=(1, 2), keepdims=True) / size)
# std = xp.std(x, axis=(1, 2), keepdims=True)
self.std = std
return self.diff / std,
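Numerically, this forward pass standardizes each sample over axes (1, 2); the same computation written directly in NumPy (shapes made up) makes that easy to verify:

import numpy as np

x = np.random.randn(2, 3, 5).astype(np.float32) * 4 + 7
mean = x.mean(axis=(1, 2), keepdims=True)
std = np.sqrt(((x - mean) ** 2).sum(axis=(1, 2), keepdims=True) / (3 * 5))
y = (x - mean) / std
print(y.mean(axis=(1, 2)))  # ~0 for every sample
print(y.std(axis=(1, 2)))   # ~1 for every sample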
def backward_cpu(self, inputs, grad_outputs):
x, V, g = inputs[:3]
b = inputs[3] if len(inputs) == 4 else None
if b is None:
gb = None
gx, gW = super(Convolution2DFunction, self).backward_cpu((x, self.W), grad_outputs)
else:
gx, gW, gb = super(Convolution2DFunction, self).backward_cpu((x, self.W, b), grad_outputs)
xp = cuda.get_array_module(x)
gg = xp.sum(gW * self.V_normalized, axis=(1, 2, 3), keepdims=True).astype(g.dtype, copy=False)
gV = g * (gW - gg * self.V_normalized) / self.norm
gV = gV.astype(V.dtype, copy=False)
if b is None:
return gx, gV, gg
else:
return gx, gV, gg, gb
def _initialize_params(self, t):
xp = cuda.get_array_module(t)
# Data-dependent initialization: choose g and b so that g * t + b has zero mean and unit variance per output channel
mean_t = xp.mean(t, axis=(0, 2, 3)).reshape(1, -1, 1, 1)
std_t = xp.sqrt(xp.var(t, axis=(0, 2, 3))).reshape(1, -1, 1, 1)
g = 1 / std_t
b = -mean_t / std_t
# print "g <- {}, b <- {}".format(g.reshape((-1,)), b.reshape((-1,)))
with self.init_scope():
if self.nobias == False:
self.b = variable.Parameter(b.reshape((-1,)))
self.g = variable.Parameter(g.reshape((self.out_channels, 1, 1, 1)))
return mean_t, std_t
def backward(self, inputs, grad_output):
xp = cuda.get_array_module(inputs[0])
batch_size = len(inputs[2])
total_probability = _logsumexp(self.prob_trans[0], xp, axis=1)
label_prob = _compute_label_probability(self.yseq.shape[2], self.path, self.path_length, self.prob_trans, xp, self.zero_padding)
self.yseq -= xp.exp(label_prob - total_probability[:, None])
if self.reduce == 'mean':
self.yseq *= grad_output[0] / batch_size
else:
self.yseq *= grad_output[0][..., None]
# mask
self.yseq *= (xp.arange(len(self.yseq))[:, None] < self.input_length)[..., None]
return (None, None, None, None) + tuple([y for y in self.yseq])
def backward(self, inputs, grad_outputs):
x, V, g = inputs[:3]
if hasattr(self, "W") == False:
self.norm = _get_norm(V)
self.V_normalized = V / self.norm
self.W = g * self.V_normalized
b = inputs[3] if len(inputs) == 4 else None
if b is None:
gx, gW = super(Convolution1DFunction, self).backward((x, self.W), grad_outputs)
else:
gx, gW, gb = super(Convolution1DFunction, self).backward((x, self.W, b), grad_outputs)
xp = cuda.get_array_module(x)
gg = xp.sum(gW * self.V_normalized, axis=(1, 2), keepdims=True)
gV = g * (gW - gg * self.V_normalized) / self.norm
if b is None:
return gx, gV, gg
else:
return gx, gV, gg, gb
def _initialize_params(self, t):
xp = cuda.get_array_module(t)
self.mean_t = xp.mean(t, axis=(0, 2)) # calculate average for each channel
self.std_t = xp.sqrt(xp.var(t, axis=(0, 2))) # calculate stddev for each channel
g = 1 / self.std_t
b = -self.mean_t / self.std_t
# print("g <- {}, b <- {}".format(g.reshape((-1,)), b.reshape((-1,))))
with self.init_scope():
if self.nobias == False:
self.b = Parameter(b, b.shape)
g_shape = (self.out_channels, 1) + (1,) * len(self.ksize)
self.g = Parameter(g.reshape(g_shape), g_shape)
def __call__(self, array):
xp = cuda.get_array_module(array)
if not array.shape: # 0-dim case
array[...] = self.scale
elif not array.size:
raise ValueError('Array to be initialized must be non-empty.')
else:
# numpy.prod returns float value when the argument is empty.
flat_shape = (len(array), int(numpy.prod(array.shape[1:])))
if flat_shape[0] > flat_shape[1]:
raise ValueError('Cannot make orthogonal system because'
' # of vectors ({}) is larger than'
' that of dimensions ({})'.format(
flat_shape[0], flat_shape[1]))
a = numpy.random.normal(size=flat_shape)
# we do not have cupy.linalg.svd for now
u, _, v = numpy.linalg.svd(a, full_matrices=False)
# pick the one with the correct shape
q = u if u.shape == flat_shape else v
array[...] = xp.asarray(q.reshape(array.shape))
array *= self.scale
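This looks like the __call__ of an orthogonal initializer; assuming it behaves like the released chainer.initializers.Orthogonal, the result can be checked the same way (shapes made up):

import numpy as np
from chainer import initializers

w = np.empty((3, 5), dtype=np.float32)
initializers.Orthogonal(scale=1.0)(w)
print(w.dot(w.T))  # close to the 3x3 identity: the rows are orthonormal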
def forward(self, inputs):
xp = cuda.get_array_module(*inputs)
y, t = inputs
if self.ignore_label is not None:
mask = (t == self.ignore_label)
ignore_cnt = mask.sum()
# will always be true when the true label is ignore_label
# TODO(henry0312)
# If cupy.where returns indexes, we could make the code better.
# Also, we would need Advanced Indexing.
pred = xp.where(mask, self.ignore_label,
y.argmax(axis=1).reshape(t.shape))
count = (pred == t).sum() - ignore_cnt
total = t.size - ignore_cnt
if total == 0:
return xp.asarray(0.0, dtype=y.dtype),
else:
return xp.asarray(float(count) / total, dtype=y.dtype),
else:
pred = y.argmax(axis=1).reshape(t.shape)
return xp.asarray((pred == t).mean(dtype=y.dtype)),
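With ignore_label set, ignored rows are dropped from both the hit count and the denominator; via the public chainer.functions.accuracy (values made up):

import numpy as np
import chainer.functions as F

y = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]], dtype=np.float32)
t = np.array([1, -1, 0], dtype=np.int32)
print(F.accuracy(y, t, ignore_label=-1).data)  # 0.5: one hit out of the two counted rows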
def forward(self, inputs):
x, t = inputs
if chainer.is_debug():
if not ((0 <= t).all() and
(t < x.shape[1]).all()):
msg = 'Each label `t` needs to satisfy `0 <= t < x.shape[1]`'
raise ValueError(msg)
xp = cuda.get_array_module(x)
if xp is numpy:
# This code is equivalent to `t.choose(x.T)`, but `numpy.choose`
# does not work when `x.shape[1] > 32`.
return x[six.moves.range(t.size), t],
else:
y = cuda.elementwise(
'S t, raw T x',
'T y',
'int ind[] = {i, t}; y = x[ind];',
'getitem_fwd'
)(t, x)
return y,
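This is the forward pass of an item-selection function: it picks x[i, t[i]] for every row. The equivalent public call (values made up):

import numpy as np
import chainer.functions as F

x = np.array([[1., 2., 3.], [4., 5., 6.]], dtype=np.float32)
t = np.array([2, 0], dtype=np.int32)
print(F.select_item(x, t).data)  # [3. 4.]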
def _backward_one(x, g):
if g is None:
xp = cuda.get_array_module(x)
return xp.zeros_like(x)
if g.ndim != x.ndim:
g = g.sum(axis=tuple(range(g.ndim - x.ndim)))
# An input variable is always an array, not a scalar.
# We need to convert a scalar value to a zero-dim array.
xp = cuda.get_array_module(x)
if xp.isscalar(g):
g = xp.array(g)
axis = tuple(i for i, sx in enumerate(x.shape) if sx == 1)
if len(axis) > 0:
return g.sum(keepdims=True, axis=axis)
else:
return g
def forward(self, xs):
x = xs[0]
xp = cuda.get_array_module(x)
if (xp != numpy and cuda.cudnn_enabled and self.use_cudnn and
_cudnn_version >= 3000):
oz_dtype = 'd' if x.dtype == 'd' else 'f'
one = numpy.array(1, dtype=oz_dtype).ctypes
zero = numpy.array(0, dtype=oz_dtype).ctypes
handle = cudnn.get_handle()
x_cube = x.reshape(x.shape[:2] + (-1, 1))
desc = cudnn.create_tensor_descriptor(x_cube)
self.y = xp.empty_like(x)
libcudnn.softmaxForward(
handle, _algorithm, _mode, one.data, desc.value,
x_cube.data.ptr, zero.data, desc.value,
self.y.data.ptr)
return self.y,
else:
log_z = logsumexp(x)
self.y = x - log_z
return self.y,
def backward(self, x, gy):
xp = cuda.get_array_module(*x)
if (xp != numpy and cuda.cudnn_enabled and self.use_cudnn and
_cudnn_version >= 3000):
oz_dtype = 'd' if x[0].dtype == 'd' else 'f'
one = numpy.array(1, dtype=oz_dtype).ctypes
zero = numpy.array(0, dtype=oz_dtype).ctypes
handle = cudnn.get_handle()
gx = xp.empty_like(x[0])
gx_cube = gx.reshape(gx.shape[:2] + (-1, 1))
desc = cudnn.create_tensor_descriptor(gx_cube)
libcudnn.softmaxBackward(
handle, _algorithm, _mode, one.data, desc.value,
self.y.data.ptr, desc.value, gy[0].data.ptr, zero.data,
desc.value, gx.data.ptr)
else:
gx = gy[0] - xp.exp(self.y) * gy[0].sum(axis=1, keepdims=True)
return gx,
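The non-cuDNN branches of this forward/backward pair implement log-softmax: the forward subtracts logsumexp(x) (presumably row-wise, via the logsumexp helper) and the backward uses gx = gy - exp(y) * sum(gy). A quick check against the public function (input made up):

import numpy as np
import chainer.functions as F

x = np.array([[1.0, 2.0, 3.0]], dtype=np.float32)
y = F.log_softmax(x)
print(y.data)                # row of log-probabilities
print(np.exp(y.data).sum())  # 1.0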
def forward(self, x):
xp = cuda.get_array_module(*x)
if (xp != numpy and cuda.cudnn_enabled and self.use_cudnn and
(_cudnn_version >= 3000 or x[0].dtype != numpy.float16)):
oz_dtype = 'd' if x[0].dtype == 'd' else 'f'
one = numpy.array(1, dtype=oz_dtype).ctypes
zero = numpy.array(0, dtype=oz_dtype).ctypes
handle = cudnn.get_handle()
x_cube = x[0].reshape(x[0].shape[:2] + (-1, 1))
desc = cudnn.create_tensor_descriptor(x_cube)
self.y = xp.empty_like(x[0])
libcudnn.softmaxForward(
handle, _algorithm, _mode, one.data, desc.value,
x_cube.data.ptr, zero.data, desc.value,
self.y.data.ptr)
else:
self.y = x[0] - x[0].max(axis=1, keepdims=True)
xp.exp(self.y, out=self.y)
self.y /= self.y.sum(axis=1, keepdims=True)
return self.y,
def backward(self, x, gy):
xp = cuda.get_array_module(*x)
if (xp != numpy and cuda.cudnn_enabled and self.use_cudnn and
(_cudnn_version >= 3000 or x[0].dtype != numpy.float16)):
oz_dtype = 'd' if x[0].dtype == 'd' else 'f'
one = numpy.array(1, dtype=oz_dtype).ctypes
zero = numpy.array(0, dtype=oz_dtype).ctypes
handle = cudnn.get_handle()
gx = xp.empty_like(x[0])
gx_cube = gx.reshape(gx.shape[:2] + (-1, 1))
desc = cudnn.create_tensor_descriptor(gx_cube)
libcudnn.softmaxBackward(
handle, _algorithm, _mode, one.data, desc.value,
self.y.data.ptr, desc.value, gy[0].data.ptr, zero.data,
desc.value, gx.data.ptr)
else:
gx = self.y * gy[0]
sumdx = gx.sum(axis=1, keepdims=True)
gx -= self.y * sumdx
return gx,
def forward(self, inputs):
x, W = inputs
if chainer.is_debug():
if not ((0 <= x).all() and
(x < len(W)).all()):
msg = 'Each `x` value needs to satisfy `0 <= x < len(W)`'
raise ValueError(msg)
if self.ignore_label is not None:
xp = cuda.get_array_module(*inputs)
mask = (x == self.ignore_label)
return xp.where(
mask[..., None], 0, W.take(xp.where(mask, 0, x), axis=0)),
return W.take(x, axis=0),
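When ignore_label is given, rows whose index equals ignore_label come back as zero vectors; the public chainer.functions.embed_id behaves the same way (values made up):

import numpy as np
import chainer.functions as F

W = np.arange(6, dtype=np.float32).reshape(3, 2)  # 3 embeddings of size 2
x = np.array([2, -1, 0], dtype=np.int32)
print(F.embed_id(x, W, ignore_label=-1).data)
# [[4. 5.]
#  [0. 0.]
#  [0. 1.]]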
def forward(self, inputs):
xp = cuda.get_array_module(inputs[0])
self.input_length = inputs[0]
# The length of path is (2 * label_length + 1)
self.path_length = 2 * inputs[1] + 1
batch_size = len(inputs[2])
yseq_shape = (len(inputs) - 3,) + inputs[3].shape
self.yseq = _softmax(xp.vstack(inputs[3::]).reshape(yseq_shape), xp)
log_yseq = self.log_matrix(self.yseq, xp)
self.path = _label_to_path(inputs[2], self.blank_symbol, xp)
self.prob_trans = self.calc_trans(self.path, log_yseq, xp)
loss = utils.force_array(xp.sum(
_logsumexp(self.prob_trans[0], xp, axis=1)))
loss /= -batch_size
return loss,
def backward(self, inputs, gy):
xp = cuda.get_array_module(*inputs)
x0, x1, y = inputs
x_dim = x0.shape[1]
y = xp.repeat(y[:, None], x_dim, axis=1)
alpha = gy[0] / y.shape[0]
dist = xp.repeat(self.dist[:, None], x_dim, axis=1)
# similar pair
gx0 = alpha * y * self.diff
# dissimilar pair
mdist = xp.repeat(self.mdist[:, None], x_dim, axis=1)
mdist_p = xp.array(mdist > 0, dtype=xp.int32)
gx0 += alpha * (1 - y) * mdist_p * mdist * -(self.diff / dist)
gx0 = gx0.astype(xp.float32)
return gx0, -gx0, None
def backward(self, x, gy):
xp = cuda.get_array_module(*x)
gx = xp.empty_like(x[0])
if self.axis is None:
gx[:] = gy[0]
else:
gy = gy[0]
actual_axis = []
for axis in self.axis:
if axis < 0:
axis = len(gx.shape) + axis
actual_axis.append(axis)
for axis in sorted(actual_axis):
gy = xp.expand_dims(gy, axis=axis)
gx[:] = gy
return gx,
def check_forward(self, t_data, xs_data, l_length, x_length):
x = tuple(chainer.Variable(x_data) for x_data in xs_data)
t = chainer.Variable(t_data)
args = (x, t, self.blank_symbol)
if self.use_length:
args += (chainer.Variable(x_length), chainer.Variable(l_length))
loss = functions.connectionist_temporal_classification(*args)
loss_value = float(loss.data)
# compute expected value by recursive computation.
xp = cuda.get_array_module(self.x)
xt = xp.swapaxes(self.x, 0, 1)
for b in range(xt.shape[0]):
for t in range(xt.shape[1]):
xt[b][t] = numpy.exp(xt[b][t]) / numpy.sum(numpy.exp(xt[b][t]))
loss_expect = 0
batch_size = xt.shape[0]
path_length = 2 * l_length + 1
for xtb, lb, xlb, plb in zip(xt, self.l, x_length, path_length):
loss_expect += -math.log(
self.alpha(xtb, lb, int(xlb - 1), int(plb - 1)) +
self.alpha(xtb, lb, int(xlb - 1), int(plb - 2)))
loss_expect /= batch_size
self.assertAlmostEqual(loss_expect, loss_value, places=5)
def check_backward(self, x_data, W_data, b_data, y_grad):
xp = cuda.get_array_module(x_data)
if not self.c_contiguous:
x_data = xp.asfortranarray(x_data)
W_data = xp.asfortranarray(W_data)
y_grad = xp.asfortranarray(y_grad)
self.assertFalse(x_data.flags.c_contiguous)
self.assertFalse(W_data.flags.c_contiguous)
self.assertFalse(y_grad.flags.c_contiguous)
if b_data is not None:
b = xp.empty((len(b_data) * 2,), dtype=self.b.dtype)
b[::2] = b_data
b_data = b[::2]
self.assertFalse(b_data.flags.c_contiguous)
args = (x_data, W_data)
if b_data is not None:
args = args + (b_data,)
gradient_check.check_backward(
convolution_2d.Convolution2DFunction(
self.stride, self.pad, self.use_cudnn, self.cover_all),
args, y_grad, eps=1e-2)
def check_backward(self, x_data, W_data, b_data, y_grad):
xp = cuda.get_array_module(x_data)
if not self.c_contiguous:
x_data = xp.asfortranarray(x_data)
W_data = xp.asfortranarray(W_data)
y_grad = xp.asfortranarray(y_grad)
self.assertFalse(x_data.flags.c_contiguous)
self.assertFalse(W_data.flags.c_contiguous)
self.assertFalse(y_grad.flags.c_contiguous)
if b_data is not None:
b = xp.empty((len(b_data) * 2,), dtype=self.b.dtype)
b[::2] = b_data
b_data = b[::2]
self.assertFalse(b_data.flags.c_contiguous)
args = (x_data, W_data)
if b_data is not None:
args = args + (b_data,)
gradient_check.check_backward(
deconvolution_2d.Deconvolution2DFunction(
self.stride, self.pad, self.outsize, self.use_cudnn),
args, y_grad, eps=1e-2)