def __call__(self, ht, xs, d_bar_s_1):
# ht: encoder hidden states, shape (batch_size, n_words, in_size)
# xs: input sequence (not used in this step)
if d_bar_s_1 is None:
d_bar_s_1 = np.zeros(self.in_size)
ht_T = list(map(F.transpose, ht))
phi_ht = list(map(W1, ht_T))
d_s = rnn(d_bar_s_1, y_s_1)
phi_d = F.transpose_sequence(W2(F.transpose_sequence(d_s)))
u_st = list(map(lambda x: phi_d*x, phi_ht)) #(4)
sum_u = F.sum(u_st)
alpha_st = list(map(lambda x:x/sum_u, u_st)) #(3)
z_s = F.argmax(alpha_st, axis=0)
c_s = F.sum(list(map(lambda x,y:x*y , alpha_st, ht))) #(2)
d_bar_s = F.relu(W3(F.concat([c_s, d_s])))
return d_bar_s, d_s, c_s, z_s
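# For reference, a minimal self-contained NumPy sketch of the attention step above
# (score, normalize, context vector; cf. the (2)-(4) comments). All names and shapes here are
# hypothetical; the snippet's W1/W2/W3, rnn and y_s_1 come from elsewhere in its class, and it
# normalizes raw scores by their sum where this sketch uses a softmax.
import numpy as np

def attention_step(ht, phi_d):
    """ht: encoder states (n_words, in_size); phi_d: transformed decoder state (in_size,)."""
    u_st = ht.dot(phi_d)                         # alignment scores, cf. (4)
    alpha_st = np.exp(u_st - u_st.max())
    alpha_st /= alpha_st.sum()                   # attention weights, cf. (3)
    z_s = int(np.argmax(alpha_st))               # hard alignment index
    c_s = (alpha_st[:, None] * ht).sum(axis=0)   # context vector, cf. (2)
    return c_s, z_s

c_s, z_s = attention_step(np.random.randn(7, 16), np.random.randn(16))
print(c_s.shape, z_s)  # (16,) and an index in [0, 7)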
def predict(self, xs):
"""
xs: list of tokenized sentences
"""
batchsize = len(xs)
fs = [self.extractor.process(x)[:2] for x in xs]
ws, cs = concat_examples(fs, padding=IGNORE)
cat_ys, dep_ys = self.forward(ws, cs)
cat_ys = F.transpose(F.stack(cat_ys, 2), (0, 2, 1))
# dep_ys = F.transpose(F.stack(dep_ys, 2), (0, 2, 1))
cat_ys = [F.log_softmax(
F.reshape(y, (y.shape[1], -1))[1:len(x) + 1]).data for x, y in \
zip(xs, F.split_axis(cat_ys, batchsize, 0))]
dep_ys = [F.log_softmax(y[1:len(x) + 1, :len(x) + 1]).data \
for x, y in zip(xs, dep_ys)]
assert len(cat_ys) == len(dep_ys)
return zip(cat_ys, dep_ys)
def forward(self, ws, ss, ps):
batchsize, length = ws.shape
xp = chainer.cuda.get_array_module(ws[0])
ws = self.emb_word(ws) # (batch, length, word_dim)
ss = F.reshape(self.emb_suf(ss), (batchsize, length, -1))
ps = F.reshape(self.emb_prf(ps), (batchsize, length, -1))
hs = F.transpose(F.concat([ws, ss, ps], 2), (1, 0, 2))
hs = F.dropout(hs, self.dropout_ratio, train=self.train)
hs = F.split_axis(hs, length, 0)
hs_f = []
hs_b = []
self._init_state()
for h_in_f, h_in_b in zip(hs, reversed(hs)):
h_f = self.lstm_f2(self.lstm_f1(F.squeeze(h_in_f, 0)))
hs_f.append(h_f)
h_b = self.lstm_b2(self.lstm_b1(F.squeeze(h_in_b, 0)))
hs_b.append(h_b)
ys = [self.linear2(F.relu(self.linear1(F.concat([h_f, h_b]))))
for h_f, h_b in zip(hs_f, reversed(hs_b))]
return ys
def predict(self, xs):
"""
xs: list of tokenized sentences
"""
batchsize = len(xs)
xs = [self.extractor.process(x) for x in xs]
ws, ss, ps = concat_examples(xs, padding=IGNORE)
cat_ys, dep_ys = self.forward(ws, ss, ps)
cat_ys = F.transpose(F.stack(cat_ys, 2), (0, 2, 1))
dep_ys = F.transpose(F.stack(dep_ys, 2), (0, 2, 1))
cat_ys = [F.squeeze(y, 0).data[1:len(x) + 1] for x, y in \
zip(xs, F.split_axis(cat_ys, batchsize, 0))]
dep_ys = [F.squeeze(F.log_softmax(y[1:len(x) + 1, :-1]), 0).data \
for x, y in zip(xs, F.split_axis(dep_ys, batchsize, 0))]
return cat_ys, dep_ys
def __call__(self, chars):
if not isinstance(chars, (tuple, list)):
chars = [chars]
char_ids, boundaries = self._create_sequence(chars)
x = self.embed(self.xp.array(char_ids))
x = F.dropout(x, self._dropout)
length, dim = x.shape
C = self.conv(F.reshape(x, (1, 1, length, dim)))
# C.shape -> (1, out_size, length, 1)
C = F.split_axis(F.transpose(F.reshape(C, (self.out_size, length))),
boundaries, axis=0)
ys = F.max(F.pad_sequence(
[matrix for i, matrix in enumerate(C) if i % 2 == 1],
padding=-np.inf), axis=1) # max over time pooling
# assert len(chars) == ys.shape[0]
return ys
def __call__(self, x1, x2):
xp = self.xp
out_size = self.out_size
batch_size, len1, dim1 = x1.shape
if not self.nobias[0]:
x1 = F.concat((x1, xp.ones((batch_size, len1, 1),
dtype=xp.float32)), axis=2)
dim1 += 1
len2, dim2 = x2.shape[1:]
if not self.nobias[1]:
x2 = F.concat((x2, xp.ones((batch_size, len2, 1),
dtype=xp.float32)), axis=2)
dim2 += 1
x1_reshaped = F.reshape(x1, (batch_size * len1, dim1))
W_reshaped = F.reshape(F.transpose(self.W, (0, 2, 1)),
(dim1, out_size * dim2))
affine = F.reshape(F.matmul(x1_reshaped, W_reshaped),
(batch_size, len1 * out_size, dim2))
biaffine = F.transpose(
F.reshape(batch_matmul(affine, x2, transb=True),
(batch_size, len1, out_size, len2)),
(0, 1, 3, 2))
if not self.nobias[2]:
biaffine += F.broadcast_to(self.b, biaffine.shape)
return biaffine
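# The reshape/matmul chain above amounts to a single bilinear contraction. A NumPy check,
# assuming (this is an assumption) that self.W has shape (dim1, dim2, out_size) and ignoring
# the optional bias columns:
import numpy as np

batch, len1, len2, dim1, dim2, out = 2, 5, 6, 3, 4, 7
x1 = np.random.randn(batch, len1, dim1).astype(np.float32)
x2 = np.random.randn(batch, len2, dim2).astype(np.float32)
W = np.random.randn(dim1, dim2, out).astype(np.float32)

# biaffine[b, i, j, k] = x1[b, i, :] @ W[:, :, k] @ x2[b, j, :]
direct = np.einsum('bip,pqk,bjq->bijk', x1, W, x2)

# the same result via the reshape/matmul route used in __call__
W_r = W.transpose(0, 2, 1).reshape(dim1, out * dim2)
affine = (x1.reshape(batch * len1, dim1) @ W_r).reshape(batch, len1 * out, dim2)
routed = np.matmul(affine, x2.transpose(0, 2, 1)).reshape(batch, len1, out, len2).transpose(0, 1, 3, 2)

assert np.allclose(direct, routed, atol=1e-4)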
def __call__(self, x):
xp = chainer.cuda.get_array_module(x.data)
batchsize = x.shape[0]
if not self.train_weights and self.initial_T is not None:
self.T.W.data = self.initial_T
M = F.reshape(self.T(x), (-1, self.num_kernels, self.ndim_kernel))
M = F.expand_dims(M, 3)
M_T = F.transpose(M, (3, 1, 2, 0))
M, M_T = F.broadcast(M, M_T)
norm = F.sum(abs(M - M_T), axis=2)
eraser = F.broadcast_to(xp.eye(batchsize, dtype=x.dtype).reshape((batchsize, 1, batchsize)), norm.shape)
c_b = F.exp(-(norm + 1e6 * eraser))
o_b = F.sum(c_b, axis=2)
if not self.train_weights:
self.initial_T = self.T.W.data
return F.concat((x, o_b), axis=1)
def cross_entropy(self, raw_network_output, target_signal_data):
if isinstance(target_signal_data, Variable):
raise Exception("target_signal_data cannot be Variable")
raw_network_output = self.to_variable(raw_network_output)
target_width = target_signal_data.shape[1]
batchsize = raw_network_output.data.shape[0]
if raw_network_output.data.shape[3] != target_width:
raise Exception("raw_network_output.width != target.width")
# (batchsize * time_step,) <- (batchsize, time_step)
target_signal_data = target_signal_data.reshape((-1,))
target_signal = self.to_variable(target_signal_data)
# (batchsize * time_step, channels) <- (batchsize, channels, 1, time_step)
raw_network_output = F.transpose(raw_network_output, (0, 3, 2, 1))
raw_network_output = F.reshape(raw_network_output, (batchsize * target_width, -1))
loss = F.softmax_cross_entropy(raw_network_output, target_signal)
return loss
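# A small runnable sketch of the same reshaping with hypothetical sizes: the
# (batchsize, channels, 1, time_step) output is flattened to (batchsize * time_step, channels)
# rows so that softmax cross entropy treats every time step as an independent classification.
import numpy as np
import chainer.functions as F

batchsize, channels, width = 2, 4, 5
raw = np.random.randn(batchsize, channels, 1, width).astype(np.float32)
target = np.random.randint(0, channels, (batchsize, width)).astype(np.int32)

logits = F.reshape(F.transpose(raw, (0, 3, 2, 1)), (batchsize * width, -1))
loss = F.softmax_cross_entropy(logits, target.reshape(-1))
print(loss.shape)  # () -- a scalar loss averaged over batchsize * width positions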
def reorg(input, stride=2):
batch_size, input_channel, input_height, input_width = input.data.shape
output_height, output_width, output_channel = int(input_height/stride), int(input_width/stride), input_channel*stride*stride
output = F.transpose(F.reshape(input, (batch_size, input_channel, output_height, stride, output_width, stride)), (0, 1, 2, 4, 3, 5))
output = F.transpose(F.reshape(output, (batch_size, input_channel, output_height, output_width, -1)), (0, 4, 1, 2, 3))
output = F.reshape(output, (batch_size, output_channel, output_height, output_width))
return output
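# reorg is the space-to-depth rearrangement used as YOLOv2's "passthrough" layer: each
# stride x stride spatial block is moved into stride**2 extra channels. A quick shape check
# with a hypothetical input:
import numpy as np
import chainer

x = chainer.Variable(np.arange(64, dtype=np.float32).reshape(1, 4, 4, 4))  # (N, C, H, W)
y = reorg(x, stride=2)
print(y.shape)  # (1, 16, 2, 2): 4x the channels, half the spatial resolution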
def predict(self, input_x):
if isinstance(input_x, chainer.Variable):
device = cuda.get_device(input_x.data)
else:
device = cuda.get_device(input_x)
xp = self.predictor.xp
with device:
output = self.predictor(input_x)
batch_size, input_channel, input_h, input_w = input_x.shape
batch_size, _, grid_h, grid_w = output.shape
x, y, w, h, conf, prob = F.split_axis(F.reshape(output, (batch_size, self.predictor.n_boxes, self.predictor.n_classes+5, grid_h, grid_w)), (1, 2, 3, 4, 5), axis=2)
x = F.sigmoid(x)
y = F.sigmoid(y)
conf = F.sigmoid(conf)
prob = F.transpose(prob, (0, 2, 1, 3, 4))
prob = F.softmax(prob)
prob = F.transpose(prob, (0, 2, 1, 3, 4))
# convert coordinates to those on the image
x_shift = xp.asarray(np.broadcast_to(np.arange(grid_w, dtype=np.float32), x.shape))
y_shift = xp.asarray(np.broadcast_to(np.arange(grid_h, dtype=np.float32).reshape(grid_h, 1), y.shape))
w_anchor = xp.asarray(np.broadcast_to(np.reshape(np.array(self.anchors, dtype=np.float32)[:, 0], (self.predictor.n_boxes, 1, 1, 1)), w.shape))
h_anchor = xp.asarray(np.broadcast_to(np.reshape(np.array(self.anchors, dtype=np.float32)[:, 1], (self.predictor.n_boxes, 1, 1, 1)), h.shape))
box_x = (x + x_shift) / grid_w
box_y = (y + y_shift) / grid_h
box_w = F.exp(w) * w_anchor / grid_w
box_h = F.exp(h) * h_anchor / grid_h
return box_x, box_y, box_w, box_h, conf, prob
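# The tail of predict decodes boxes the YOLOv2 way: centers are sigmoid offsets plus the cell
# index, divided by the grid size; widths/heights are anchor sizes scaled by exp of the raw
# prediction. A worked example for a single cell with hypothetical numbers:
import numpy as np

grid_w = grid_h = 13
tx, ty, tw, th = 0.2, -0.5, 0.1, 0.3   # raw network outputs for one box (hypothetical)
anchor_w, anchor_h = 1.3, 2.1          # one anchor, in grid units (hypothetical)
cx, cy = 4, 7                          # indices of the responsible grid cell

sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
box_x = (sigmoid(tx) + cx) / grid_w    # center x, relative to image width
box_y = (sigmoid(ty) + cy) / grid_h    # center y, relative to image height
box_w = np.exp(tw) * anchor_w / grid_w
box_h = np.exp(th) * anchor_h / grid_h
print(box_x, box_y, box_w, box_h)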
def extract_best_label_logits(self, arc_logits, label_logits, lengths):
pred_arcs = self.model.xp.argmax(arc_logits.data, axis=1)
label_logits = F.transpose(label_logits, (0, 2, 1, 3))
label_logits = [_logits[np.arange(_length), _arcs[:_length]]
for _logits, _arcs, _length
in zip(label_logits, pred_arcs, lengths)]
label_logits = F.pad_sequence(label_logits)
return label_logits
def parse(self, pretrained_word_tokens=None,
word_tokens=None, pos_tokens=None):
if word_tokens is not None:
self.forward(pretrained_word_tokens, word_tokens, pos_tokens)
ROOT = self._ROOT_LABEL
arcs_batch, labels_batch = [], []
arc_logits = cuda.to_cpu(self._arc_logits.data)
label_logits = cuda.to_cpu(self._label_logits.data)
for arc_logit, label_logit, length in \
zip(arc_logits, np.transpose(label_logits, (0, 2, 1, 3)),
self._lengths):
arc_probs = softmax2d(arc_logit[:length, :length])
arcs = mst(arc_probs)
label_probs = softmax2d(label_logit[np.arange(length), arcs])
labels = np.argmax(label_probs, axis=1)
labels[0] = ROOT
tokens = np.arange(1, length)
roots = np.where(labels[tokens] == ROOT)[0] + 1
if len(roots) < 1:
root_arc = np.where(arcs[tokens] == 0)[0] + 1
labels[root_arc] = ROOT
elif len(roots) > 1:
label_probs[roots, ROOT] = 0
new_labels = \
np.argmax(label_probs[roots], axis=1)
root_arc = np.where(arcs[tokens] == 0)[0] + 1
labels[roots] = new_labels
labels[root_arc] = ROOT
arcs_batch.append(arcs)
labels_batch.append(labels)
return arcs_batch, labels_batch
def forward(self, ws, cs):
batchsize, length, max_word_len = cs.shape
ws = self.emb_word(ws) # (batch, length, word_dim)
cs = F.reshape(
F.max_pooling_2d(
self.conv_char(
F.reshape(
self.emb_char(cs),
(batchsize * length, 1, max_word_len, 50))), (max_word_len, 1)),
(batchsize, length, self.char_dim))
hs = F.transpose(F.concat([ws, cs], 2), (1, 0, 2))
hs = F.dropout(hs, self.dropout_ratio, train=self.train)
hs = F.split_axis(hs, length, 0)
hs_f = []
hs_b = []
self._init_state()
for h_in_f, h_in_b in zip(hs, reversed(hs)):
h_f = self.lstm_f2(self.lstm_f1(F.reshape(h_in_f, (batchsize, -1))))
hs_f.append(h_f)
h_b = self.lstm_b2(self.lstm_b1(F.reshape(h_in_b, (batchsize, -1))))
hs_b.append(h_b)
hs = [F.concat([h_f, h_b]) for h_f, h_b in zip(hs_f, reversed(hs_b))]
cat_ys = [self.linear_cat2(F.dropout(
F.elu(self.linear_cat1(h)), 0.5, train=self.train)) for h in hs]
hs = [F.reshape(h, (length, -1)) for h in \
F.split_axis(F.transpose(F.stack(hs, 2), (0, 2, 1)), batchsize, 0)]
dep_ys = [self.biaffine(
F.relu(F.dropout(self.linear_dep(h), 0.32, train=self.train)),
F.relu(F.dropout(self.linear_head(h), 0.32, train=self.train))) for h in hs]
return cat_ys, dep_ys
def __call__(self, ws, cs, cat_ts, dep_ts):
batchsize, length = cat_ts.shape
cat_ys, dep_ys = self.forward(ws, cs)
cat_ys = cat_ys[1:-1]
cat_ts = [F.reshape(x, (batchsize,)) for x \
in F.split_axis(F.transpose(cat_ts), length, 0)]
assert len(cat_ys) == len(cat_ts)
cat_loss = reduce(lambda x, y: x + y,
[F.softmax_cross_entropy(y, t) for y, t in zip(cat_ys, cat_ts)])
cat_acc = reduce(lambda x, y: x + y,
[F.accuracy(y, t, ignore_label=IGNORE) for y, t in zip(cat_ys, cat_ts)])
# hs [(length, hidden_dim), ...]
dep_ys = [x[1:-1] for x in dep_ys]
dep_ts = [F.reshape(x, (length,)) for x in F.split_axis(dep_ts, batchsize, 0)]
dep_loss = reduce(lambda x, y: x + y,
[F.softmax_cross_entropy(y, t) for y, t in zip(dep_ys, dep_ts)])
dep_acc = reduce(lambda x, y: x + y,
[F.accuracy(y, t, ignore_label=IGNORE) for y, t in zip(dep_ys, dep_ts)])
cat_acc /= length
dep_acc /= batchsize
chainer.report({
"tagging_loss": cat_loss,
"tagging_accuracy": cat_acc,
"parsing_loss": dep_loss,
"parsing_accuracy": dep_acc
}, self)
return cat_loss + dep_loss
def predict(self, xs):
"""
xs: list of tokenized sentences
"""
batchsize = len(xs)
fs = [self.extractor.process(x) for x in xs]
ws, ss, ps = concat_examples(fs, padding=-1)
ys = self.forward(ws, ss, ps)
ys = F.transpose(F.stack(ys, 2), (0, 2, 1))
return [F.squeeze(y, 0).data[1:len(x) + 1] for x, y in \
zip(xs, F.split_axis(ys, batchsize, 0))]
def __call__(self, ws, ss, ps, cat_ts, dep_ts):
"""
ws, ss, ps: word, suffix and prefix ids
cat_ts, dep_ts: gold category and dependency labels
"""
batchsize, length = ws.shape
cat_ys, dep_ys = self.forward(ws, ss, ps)
cat_ts = [F.reshape(x, (batchsize,)) for x \
in F.split_axis(F.transpose(cat_ts), length, 0)]
dep_ts = [F.reshape(x, (batchsize,)) for x \
in F.split_axis(F.transpose(dep_ts), length, 0)]
cat_loss = reduce(lambda x, y: x + y,
[F.softmax_cross_entropy(y, t) for y, t in zip(cat_ys, cat_ts)])
cat_acc = reduce(lambda x, y: x + y,
[F.accuracy(y, t, ignore_label=IGNORE) for y, t in zip(cat_ys, cat_ts)])
dep_loss = reduce(lambda x, y: x + y,
[F.softmax_cross_entropy(y, t) for y, t in zip(dep_ys, dep_ts)])
dep_acc = reduce(lambda x, y: x + y,
[F.accuracy(y, t, ignore_label=IGNORE) for y, t in zip(dep_ys, dep_ts)])
cat_acc /= length
dep_acc /= length
chainer.report({
"tagging_loss": cat_loss,
"tagging_accuracy": cat_acc,
"parsing_loss": dep_loss,
"parsing_accuracy": dep_acc
}, self)
return cat_loss + dep_loss
def forward(self, ws, ss, ps):
batchsize, length = ws.shape
xp = chainer.cuda.get_array_module(ws[0])
ws = self.emb_word(ws) # (batch, length, word_dim)
ss = F.reshape(self.emb_suf(ss), (batchsize, length, -1))
ps = F.reshape(self.emb_prf(ps), (batchsize, length, -1))
hs = F.transpose(F.concat([ws, ss, ps], 2), (1, 0, 2))
hs = F.dropout(hs, self.dropout_ratio, train=self.train)
hs = F.split_axis(hs, length, 0)
hs_f = []
hs_b = []
self._init_state()
for h_in_f, h_in_b in zip(hs, reversed(hs)):
h_f = self.lstm_f2(self.lstm_f1(F.reshape(h_in_f, (-1, self.in_dim))))
hs_f.append(h_f)
h_b = self.lstm_b2(self.lstm_b1(F.reshape(h_in_b, (-1, self.in_dim))))
hs_b.append(h_b)
hs = [F.concat([h_f, h_b]) for h_f, h_b in zip(hs_f, reversed(hs_b))]
cat_ys = [self.linear_cat2(F.dropout(
F.elu(self.linear_cat1(h)), 0.5, train=self.train)) for h in hs]
dep_ys = [self.biaffine(
F.elu(F.dropout(self.linear_dep(h), 0.32, train=self.train)),
F.elu(F.dropout(self.linear_head(h), 0.32, train=self.train))) for h in hs]
return cat_ys, dep_ys
def planar_flows(self,z):
self.z_trans = []
self.z_trans.append(z)
self.phi = []
for i in range(self.num_trans):
flow_w_name = 'flow_w_' + str(i)
flow_b_name = 'flow_b_' + str(i)
flow_u_name = 'flow_u_' + str(i)
h = self[flow_w_name](z)
h = F.sum(h,axis=(1))
h = self[flow_b_name](h)
h = F.tanh(h)
h_tanh = h
dim_latent = z.shape[1]
h = F.transpose(F.tile(h, (dim_latent,1)))
h = self[flow_u_name](h)
z += h
self.z_trans.append(z)
# Calculate and store the phi term
h_tanh_derivative = 1-(h_tanh*h_tanh)
h_tanh_derivative = F.transpose(F.tile(h_tanh_derivative, (dim_latent,1)))
phi = self[flow_w_name](h_tanh_derivative) # Equation (11)
self.phi.append(phi)
return z
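# The loop above follows the planar normalizing-flow update z' = z + u * tanh(w^T z + b), with
# the phi term (labelled "Equation (11)" in the comment) being psi(z) = (1 - tanh^2(w^T z + b)) * w,
# although here w, b and u are realized as learned linear links. A plain NumPy sketch of one
# textbook planar-flow step (all shapes hypothetical), including the log-det term the phi list
# is typically used for:
import numpy as np

def planar_flow_step(z, w, b, u):
    a = z @ w + b                                # (batch,)
    z_new = z + np.outer(np.tanh(a), u)          # z' = z + u * tanh(w^T z + b)
    psi = np.outer(1.0 - np.tanh(a) ** 2, w)     # psi(z) = h'(w^T z + b) * w
    log_det = np.log(np.abs(1.0 + psi @ u))      # per-sample log |det dz'/dz|
    return z_new, log_det

z = np.random.randn(8, 4)
z_new, log_det = planar_flow_step(z, np.random.randn(4), 0.1, np.random.randn(4))
print(z_new.shape, log_det.shape)  # (8, 4) (8,)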
def __call__(self, x):
return functions.transpose(x, self.axes)
# Noise injections
def check_forward(self, x_data):
axes = self.axes
x = chainer.Variable(x_data)
y = functions.transpose(x, axes)
self.assertEqual(y.data.dtype, self.dtype)
self.assertTrue((self.x.transpose(axes) == cuda.to_cpu(y.data)).all())
def check_backward(self, x_data):
x = chainer.Variable(x_data)
y = functions.transpose(x, self.axes)
y.grad = y.data
y.backward()
gradient_check.assert_allclose(x.data, x.grad, atol=0, rtol=0)
def angular_mc_loss(f, f_p, alpha=45, in_degree=True):
'''
Args:
f (chainer.Variable or xp.ndarray):
Anchor vectors. Each vector in f must be L2-normalized.
f_p (chainer.Variable or xp.ndarray):
Positive vectors. Each vector in f_p must be L2-normalized.
'''
xp = cuda.get_array_module(f)
if in_degree:
alpha = np.deg2rad(alpha)
sq_tan_alpha = np.tan(alpha) ** 2
n_pairs = len(f)
# first and second term of f_{a,p,n}
term1 = 4 * sq_tan_alpha + matmul(f + f_p, transpose(f_p))
term2 = 2 * (1 + sq_tan_alpha) * F.sum(f * f_p, axis=1, keepdims=True)
# term2 = 2 * (1 + sq_tan_alpha) * F.batch_matmul(f, f_p, transa=True).reshape(n_pairs, 1)
f_apn = term1 - F.broadcast_to(term2, (n_pairs, n_pairs))
# multiply zero to diagonal components of f_apn
mask = xp.ones_like(f_apn.data) - xp.eye(n_pairs, dtype=f.dtype)
f_apn = f_apn * mask
return F.average(F.logsumexp(f_apn, axis=1))
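# A minimal usage sketch for angular_mc_loss with random L2-normalized features (sizes
# hypothetical). It assumes matmul and transpose above are the chainer.functions versions
# imported at module level.
import numpy as np

n_pairs, dim = 8, 32
f = np.random.randn(n_pairs, dim).astype(np.float32)
f_p = np.random.randn(n_pairs, dim).astype(np.float32)
f /= np.linalg.norm(f, axis=1, keepdims=True)      # anchors, L2-normalized
f_p /= np.linalg.norm(f_p, axis=1, keepdims=True)  # positives, L2-normalized

loss = angular_mc_loss(f, f_p, alpha=45)
print(float(loss.data))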
def n_pair_mc_loss(f, f_p, l2_reg):
"""Multi-class N-pair loss (N-pair-mc loss) function.
Args:
f (~chainer.Variable): Feature vectors.
All examples must belong to different classes.
f_p (~chainer.Variable): Positive examples corresponding to f.
Each example must belong to the same class as the corresponding example in f.
l2_reg (~float): A weight of L2 regularization for feature vectors.
Returns:
~chainer.Variable: Loss value.
See: `Improved Deep Metric Learning with Multi-class N-pair Loss \
Objective <https://papers.nips.cc/paper/6200-improved-deep-metric-\
learning-with-multi-class-n-pair-loss-objective>`_
"""
logit = matmul(f, transpose(f_p))
N = len(logit.data)
xp = cuda.get_array_module(logit.data)
loss_sce = softmax_cross_entropy(logit, xp.arange(N))
l2_loss = sum(batch_l2_norm_squared(f) +
batch_l2_norm_squared(f_p)) / (2.0 * N)
loss = loss_sce + l2_reg * l2_loss
return loss
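# A corresponding sketch for n_pair_mc_loss; each row of f / f_p is assumed to come from a
# distinct class, and the module-level imports the snippet relies on (matmul, transpose,
# softmax_cross_entropy, batch_l2_norm_squared) are assumed available.
import numpy as np

N, dim = 8, 32
f = np.random.randn(N, dim).astype(np.float32)    # one anchor per class
f_p = np.random.randn(N, dim).astype(np.float32)  # the matching positive for each anchor
loss = n_pair_mc_loss(f, f_p, l2_reg=0.002)
print(float(loss.data))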
def sentence_block_embed(embed, x):
batch, length = x.shape
e = embed(x.reshape((batch * length, )))
# (batch * length, units)
e = F.transpose(F.stack(F.split_axis(e, batch, axis=0), axis=0), (0, 2, 1))
# (batch, units, length)
return e
def seq_linear(linear, x):
batch, units, length, _ = x.shape
h = linear(F.transpose(x, (0, 2, 1, 3)).reshape(batch * length, units))
return F.transpose(h.reshape((batch, length, units, 1)), (0, 2, 1, 3))
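# A quick shape check for the two helpers above, with a hypothetical embedding table and
# random ids: sentence_block_embed maps (batch, length) ids to a (batch, units, length) block,
# and seq_linear applies a Linear link position-wise to a (batch, units, length, 1) block.
import numpy as np
import chainer.functions as F
import chainer.links as L

batch, length, vocab, units = 2, 5, 100, 8
embed = L.EmbedID(vocab, units)
ids = np.random.randint(0, vocab, (batch, length)).astype(np.int32)

e = sentence_block_embed(embed, ids)
print(e.shape)  # (2, 8, 5)

linear = L.Linear(units, units)
h = seq_linear(linear, F.reshape(e, (batch, units, length, 1)))
print(h.shape)  # (2, 8, 5, 1)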