def ASw_transition_loss_pred(self, i, j, combined_head, combined_dep, transition_logit, SHIFT):
    # extract the relevant portions of the transition features for this (i, j) position
    rel_trans_feat_ids = self.trans_feat_ids[i*self.args.beam_size+j] if not self.train else self.trans_feat_ids[i, j]
    rel_trans_feat_size = self.trans_feat_sizes[i*self.args.beam_size+j] if not self.train else self.trans_feat_sizes[i, j]

    # core computations: split the optional SHIFT transition off from the arc transitions
    has_shift = tf.cond(tf.equal(rel_trans_feat_ids[0, 0], SHIFT), lambda: tf.constant(1), lambda: tf.constant(0))
    arc_trans_count = rel_trans_feat_size - has_shift
    arc_trans_feat_ids = tf.gather(rel_trans_feat_ids, tf.range(has_shift, rel_trans_feat_size))

    # score each candidate arc from its head and dependent representations
    rel_head = tf.reshape(tf.gather(combined_head, arc_trans_feat_ids[:, 1]), [arc_trans_count, self.args.rel_emb_dim])
    rel_dep = tf.reshape(tf.gather(combined_dep, arc_trans_feat_ids[:, 2]), [arc_trans_count, self.args.rel_emb_dim])
    rel_hid = self.rel_merge(rel_head, rel_dep)
    rel_logit = self.rel_dense(rel_hid)
    arc_logit = tf.reshape(rel_logit, [-1])

    def logaddexp(a, b):
        # numerically stable log(exp(a) + exp(b))
        mx = tf.maximum(a, b)
        return tf.log(tf.exp(a - mx) + tf.exp(b - mx)) + mx

    if self.train:
        # compute the negative log-likelihood of the gold transition and return it
        log_partition = tf.reduce_logsumexp(arc_logit)
        log_partition = tf.cond(tf.greater(has_shift, 0),
                                lambda: logaddexp(log_partition, transition_logit[rel_trans_feat_ids[0, 3]]),
                                lambda: log_partition)
        arc_logit = log_partition - arc_logit
        res = tf.cond(tf.greater(has_shift, 0),
                      lambda: tf.cond(tf.greater(self.trans_labels[i, j], 0),
                                      lambda: arc_logit[self.trans_labels[i, j]-1],
                                      lambda: log_partition - transition_logit[rel_trans_feat_ids[0, 3]]),
                      lambda: arc_logit[self.trans_labels[i, j]])
        return res
    else:
        # just return predictions: negative log-probabilities for every candidate transition
        log_partition = tf.reduce_logsumexp(arc_logit)
        log_partition = tf.cond(tf.greater(has_shift, 0),
                                lambda: logaddexp(log_partition, transition_logit[rel_trans_feat_ids[0, 3]]),
                                lambda: log_partition)
        arc_logit = log_partition - arc_logit
        arc_pred = tf.cond(tf.greater(has_shift, 0),
                           lambda: tf.concat([tf.reshape(log_partition - transition_logit[rel_trans_feat_ids[0, 3]], (-1, 1)),
                                              tf.reshape(arc_logit, (-1, 1))], 0),
                           lambda: tf.reshape(arc_logit, (-1, 1)))
        # pad with large values (i.e. near-zero probability) up to the fixed prediction size
        current_output_shape = has_shift + arc_trans_count * rel_logit.get_shape()[1]
        arc_pred = tf.concat([arc_pred, 1e20 * tf.ones((tf.subtract(self.pred_output_size, current_output_shape), 1), dtype=tf.float32)], 0)
        arc_pred = tf.reshape(arc_pred, [-1])
        return arc_pred
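The logaddexp helper above is the standard max-shift trick that also underlies tf.reduce_logsumexp: log(exp(a) + exp(b)) is evaluated as max(a, b) + log(exp(a - max) + exp(b - max)) so the exponentials never overflow, and subtracting the resulting log-partition from the logits yields normalized log-probabilities. A minimal NumPy sketch of the same identity (a standalone illustration, not part of the parser code):

import numpy as np

def logaddexp(a, b):
    # max-shift keeps the exponentials in a safe range
    mx = np.maximum(a, b)
    return np.log(np.exp(a - mx) + np.exp(b - mx)) + mx

logits = np.array([1000.0, 1001.0, 999.0])
naive = np.log(np.exp(logits).sum())                              # overflows to inf
stable = logits.max() + np.log(np.exp(logits - logits.max()).sum())
print(naive, stable)                                              # inf vs. ~1001.41
log_probs = logits - stable                                       # normalized log-probabilities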
Python tf.reduce_logsumexp() usage examples (source code)
def bow_loss_by_example(logits,
                        targets,
                        weights,
                        average_across_timesteps=False):
    """Loss for a bow of logits (per example).
    As opposed to sequence loss this is supposed to ignore the order.
    Does not seem to work yet.
    Args:
      logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols].
      targets: List of 1D batch-sized int32 Tensors of the same length as logits.
      weights: List of 1D batch-sized float-Tensors of the same length as logits.
      average_across_timesteps: If set, divide the returned cost by the total
        label weight.
    Returns:
      1D batch-sized float Tensor: The loss for each bow.
    Raises:
      ValueError: If len(logits) is different from len(targets) or len(weights).
    """
    if len(targets) != len(logits) or len(weights) != len(logits):
        raise ValueError('Lengths of logits, weights, and targets must be the same '
                         '%d, %d, %d.' % (len(logits), len(weights), len(targets)))
    batch_size = logits[0].shape[0]
    vocab_size = logits[0].shape[1]
    logitssum = tf.zeros((batch_size, vocab_size), tf.float32)
    targetset = tf.zeros((batch_size, vocab_size), tf.float32)
    # accumulate a weighted bag-of-words representation of the targets
    for target, weight in zip(targets, weights):
        targetset += (tf.one_hot(target, vocab_size) * weight[:, None])
    # accumulate the predicted token probabilities, decaying the per-example weight
    # by the probability mass placed on token id 3 at each step
    weight = tf.ones((batch_size), tf.float32)
    for logit in logits:
        softmax = tf.nn.softmax(logit)
        logitssum += (softmax * weight[:, None])
        weight = tf.maximum(0.0, weight - softmax[:, 3])
    # logitssum = tf.minimum(logitssum, 1.0)
    # targetset = tf.minimum(targetset, 1.0)
    # loss = tf.nn.sigmoid_cross_entropy_with_logits(
    #     labels=targetset, logits=logitssum)
    loss = tf.reduce_sum(tf.squared_difference(logitssum, targetset), axis=1)
    # crossent = tf.maximum(logitssum, 0.0) - (
    #     logitssum * targetset) + tf.log(1.0 + tf.exp(-1.0 * tf.abs(logitssum)))
    # log_perps = tf.reduce_logsumexp(crossent, axis=1)
    if average_across_timesteps:
        total_size = tf.add_n(weights)
        total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
        loss /= total_size
    return loss
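The two loops above build order-free representations: the target loop sums weighted one-hot vectors into a per-example bag-of-words, and the logits loop accumulates the predicted token probabilities the same way. A small NumPy sketch of the target accumulation, with a made-up toy vocabulary and padding weights, purely for illustration:

import numpy as np

vocab_size = 6
targets = [np.array([2, 4]), np.array([4, 1])]            # two timesteps, batch of 2
weights = [np.array([1.0, 1.0]), np.array([1.0, 0.0])]    # second example padded at t=1

targetset = np.zeros((2, vocab_size))
for target, weight in zip(targets, weights):
    one_hot = np.eye(vocab_size)[target]                  # shape (batch, vocab)
    targetset += one_hot * weight[:, None]
print(targetset)
# example 0 counts tokens 2 and 4 once each; example 1 only counts token 4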
def blurred_cross_entropy(output, target, filter_size=11, sampling_range=3.5, pixel_weights=None):
    """
    Apply a Gaussian smoothing filter to the target probabilities (i.e. the one-hot
    representation of target) and compute the cross entropy loss between softmax(output)
    and the blurred target probabilities.
    :param output: A rank-4 or rank-5 tensor with shape=(samples, [sequence_position,] x, y, num_classes)
        representing the network input of the output layer (not activated)
    :param target: A rank-3 or rank-4 tensor with shape=(samples, [sequence_position,] x, y) representing
        the target labels. It must contain int values in 0..num_classes-1.
    :param filter_size: A length-2 list of int specifying the size of the Gaussian filter that will be
        applied to the target probabilities.
    :param sampling_range: Sampling range of the Gaussian filter applied to the target probabilities.
    :param pixel_weights: A rank-3 or rank-4 tensor with shape=(samples, [sequence_position,] x, y)
        representing factors that will be applied to the loss of the corresponding pixel. This can be
        used, e.g., to mask out certain pixels by weighting them with 0, i.e. to suppress their
        contribution to the error.
    :return: A scalar operation representing the blurred cross entropy loss.
    """
    # convert target to one-hot
    output_shape = output.shape.as_list()
    one_hot = tf.one_hot(target, output_shape[-1], dtype=tf.float32)
    if len(output_shape) > 4:
        # fold the sequence dimension into the batch dimension for the 2D blur
        one_hot = tf.reshape(one_hot, [np.prod(output_shape[:-3])] + output_shape[-3:])
    # blur target probabilities
    # gauss_filter = weight_gauss_conv2d(filter_size + [output_shape[-1], 1])
    # blurred_target = tf.nn.depthwise_conv2d(one_hot, gauss_filter, [1, 1, 1, 1], 'SAME')
    blurred_target = gaussian_blur(one_hot, filter_size, sampling_range)
    if len(output_shape) > 4:
        blurred_target = tf.reshape(blurred_target, output_shape)
    # compute log softmax predictions and cross entropy
    log_pred = output - tf.reduce_logsumexp(output, axis=[len(output_shape) - 1], keep_dims=True)
    # apply pixel-wise weighting
    if pixel_weights is not None:
        log_pred *= pixel_weights
    cross_entropy = -tf.reduce_sum(blurred_target * log_pred, axis=[len(output_shape) - 1])
    if pixel_weights is not None:
        loss = tf.reduce_sum(cross_entropy) / tf.reduce_sum(pixel_weights)
    else:
        loss = tf.reduce_mean(cross_entropy)
    return loss
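The log-softmax here is computed directly from the identity log softmax(x) = x - logsumexp(x), which is more stable than exponentiating and taking the log of the resulting probabilities. A quick NumPy check of that identity (illustrative only):

import numpy as np

x = np.array([2.0, -1.0, 0.5])
log_softmax = x - (x.max() + np.log(np.exp(x - x.max()).sum()))
reference = np.log(np.exp(x) / np.exp(x).sum())
print(np.allclose(log_softmax, reference))  # True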
def build_elbo(self, n_samples, training=False):
    cfg = self.config
    reuse = False
    if training:
        reuse = True
    # sample z ~ q(z | x) and evaluate the variational and model log-densities
    z = self.variational.sample(self.data, n_samples=n_samples, reuse=reuse)
    log_q_z = self.variational.log_prob(z, reuse=reuse)
    self.log_q_z = log_q_z
    log_p_x_z = self.model.log_prob(self.data, z, reuse=reuse)
    if cfg['optim/deterministic_annealing'] and training:
        # deterministic annealing: scale the log q(z | x) term by an annealed magnitude (at least 1)
        self.build_magnitude()
        tf.summary.scalar('c/magnitude', self.magnitude)
        magnitude = tf.maximum(1., self.magnitude)
        elbo = log_p_x_z - magnitude * log_q_z
    else:
        elbo = log_p_x_z - log_q_z
    if training:
        self.elbo_loss = elbo
        _, variance = tf.nn.moments(elbo, [0])
        self.elbo_variance = tf.reduce_mean(variance)
        self.log_q_z_loss = log_q_z
        self.variational.build_entropy(z)
        self.q_z_sample = z
        slim.summarize_collection('variational')
        slim.summarize_collection('model')
        slim.summarize_activations('variational')
        slim.summarize_activations('model')
    else:
        self.elbo = elbo
        self.log_q_z = log_q_z
        # importance-sampling estimate of log p(x): logsumexp over samples minus log K
        self.log_p_x_hat = (tf.reduce_logsumexp(elbo, [0], keep_dims=True) -
                            tf.log(float(cfg['q/n_samples_stats'])))
        tf.summary.scalar('o/log_p_x_hat', tf.reduce_mean(self.log_p_x_hat))

        def sum_mean(x):
            return tf.reduce_sum(tf.reduce_mean(x, 0))

        self.elbo_sum = sum_mean(elbo)
        self.q_entropy = -sum_mean(log_q_z)
        self.E_log_lik = sum_mean(log_p_x_z)
        tf.summary.scalar('o/elbo_sum', sum_mean(elbo))
        tf.summary.scalar('o/elbo_mean', sum_mean(elbo) / cfg['batch_size'])
        tf.summary.scalar('o/E_log_q_z', sum_mean(log_q_z))
        tf.summary.scalar('o/E_log_p_x_z', self.E_log_lik)
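In the evaluation branch, log_p_x_hat is the usual importance-sampling estimate of the marginal likelihood: with K samples z_k ~ q(z | x), log p(x) ≈ logsumexp_k [log p(x, z_k) - log q(z_k | x)] - log K. A toy NumPy sketch of that estimator on a model where the true marginal is known (the Gaussian model and proposal below are made up for illustration):

import numpy as np

# toy model: z ~ N(0, 1), x | z ~ N(z, 1), proposal q(z) = N(0, 1) (the prior)
rng = np.random.default_rng(0)
x, K = 1.5, 100000
z = rng.normal(0.0, 1.0, size=K)

def log_normal(v, mean, var):
    return -0.5 * (np.log(2.0 * np.pi * var) + (v - mean) ** 2 / var)

# per-sample ELBO term: log p(x, z_k) - log q(z_k); here q is the prior, so those terms cancel
elbo = log_normal(x, z, 1.0) + log_normal(z, 0.0, 1.0) - log_normal(z, 0.0, 1.0)
log_p_x_hat = np.logaddexp.reduce(elbo) - np.log(K)   # logsumexp over samples minus log K
print(log_p_x_hat, log_normal(x, 0.0, 2.0))           # both are close to log N(x; 0, 2)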
def predict(self, answer, start_logits, end_logits, mask) -> Prediction:
    bound = self.bound
    f1_weight = self.f1_weight
    aggregate = self.aggregate
    masked_logits1 = exp_mask(start_logits, mask)
    masked_logits2 = exp_mask(end_logits, mask)

    # score every span of length <= bound as start_logit + end_logit
    span_logits = []
    for i in range(self.bound):
        if i == 0:
            span_logits.append(masked_logits1 + masked_logits2)
        else:
            span_logits.append(masked_logits1[:, :-i] + masked_logits2[:, i:])
    span_logits = tf.concat(span_logits, axis=1)
    l = tf.shape(start_logits)[1]

    if len(answer) == 1:
        answer = answer[0]
        if answer.dtype == tf.int32:
            if f1_weight == 0:
                # single correct span: plain cross-entropy over packed span coordinates
                answer_ix = to_packed_coordinates(answer, l, bound)
                loss = tf.reduce_mean(
                    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=span_logits, labels=answer_ix))
            else:
                # soft targets: weight each span by its F1 overlap with the answer
                f1_mask = packed_span_f1_mask(answer, l, bound)
                if f1_weight < 1:
                    f1_mask *= f1_weight
                    f1_mask += (1 - f1_weight) * tf.one_hot(to_packed_coordinates(answer, l, bound), l)
                # TODO can we stay in log space? (actually its tricky since f1_mask can have zeros...)
                probs = tf.nn.softmax(span_logits)
                loss = -tf.reduce_mean(tf.log(tf.reduce_sum(probs * f1_mask, axis=1)))
        else:
            # answer given as a mask over spans: marginalize (or take the max) over the marked spans
            log_norm = tf.reduce_logsumexp(span_logits, axis=1)
            if aggregate == "sum":
                log_score = tf.reduce_logsumexp(
                    span_logits + VERY_NEGATIVE_NUMBER * (1 - tf.cast(answer, tf.float32)),
                    axis=1)
            elif aggregate == "max":
                log_score = tf.reduce_max(span_logits + VERY_NEGATIVE_NUMBER * (1 - tf.cast(answer, tf.float32)),
                                          axis=1)
            else:
                raise NotImplementedError()
            loss = tf.reduce_mean(-(log_score - log_norm))
    else:
        raise NotImplementedError()

    tf.add_to_collection(tf.GraphKeys.LOSSES, loss)
    return PackedSpanPrediction(span_logits, l, bound)
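For the "sum" aggregate, the loss marginalizes over all correct spans: it is the negative difference between the logsumexp over the correct-span logits and the logsumexp over all span logits, i.e. the negative log of the total softmax probability assigned to any correct span. A small NumPy sketch of that computation, with toy span logits, a toy answer mask, and VERY_NEGATIVE_NUMBER taken as -1e30 for illustration:

import numpy as np

VERY_NEGATIVE_NUMBER = -1e30
span_logits = np.array([2.0, 0.5, -1.0, 1.5])        # scores for 4 candidate spans
answer = np.array([1, 0, 0, 1], dtype=np.float32)    # spans 0 and 3 are correct

def logsumexp(v):
    m = v.max()
    return m + np.log(np.exp(v - m).sum())

log_norm = logsumexp(span_logits)
log_score = logsumexp(span_logits + VERY_NEGATIVE_NUMBER * (1.0 - answer))
loss = -(log_score - log_norm)
# identical to -log(sum of softmax probabilities of the correct spans)
probs = np.exp(span_logits - log_norm)
print(loss, -np.log(probs[answer > 0].sum()))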