def softmax_loss(self, antecedent_scores, antecedent_labels):
"""
Computes the value of the loss function from antecedent_scores and antecedent_labels.
Essentially a softmax cross-entropy, marginalized over all gold antecedents.
Args:
antecedent_scores: tf.float64, [num_mentions, max_ant + 1], output of the fully-connected network that
computes antecedent scores.
antecedent_labels: True labels for the antecedents.
Returns: [num_mentions]
The value of the loss function.
"""
gold_scores = antecedent_scores + tf.log(tf.cast(antecedent_labels, tf.float64)) # [num_mentions, max_ant + 1]
marginalized_gold_scores = tf.reduce_logsumexp(gold_scores, [1]) # [num_mentions]
log_norm = tf.reduce_logsumexp(antecedent_scores, [1]) # [num_mentions]
return log_norm - marginalized_gold_scores # [num_mentions]
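# A minimal NumPy sketch (not from the original project) of the same marginal
# log-likelihood: loss_i = logsumexp_j(scores_ij) - logsumexp_{j in gold_i}(scores_ij).
# Adding log(labels) above is equivalent to masking non-gold entries with -inf.
import numpy as np
from scipy.special import logsumexp

def softmax_loss_np(antecedent_scores, antecedent_labels):
    gold_scores = np.where(antecedent_labels, antecedent_scores, -np.inf)  # keep only gold antecedents
    return logsumexp(antecedent_scores, axis=1) - logsumexp(gold_scores, axis=1)  # [num_mentions]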
Example source code for Python's reduce_logsumexp()
def log_sum_exp(x, axis=None, keep_dims=False):
"""
Deprecated: Use tf.reduce_logsumexp().
TensorFlow numerically stable log-sum-exp across the given `axis`.
:param x: A Tensor or numpy array.
:param axis: An int or list or tuple. The dimensions to reduce.
If `None` (the default), reduces all dimensions.
:param keep_dims: Bool. If True, retains reduced dimensions with length 1.
Defaults to False.
:return: A Tensor after the computation of log sum exp along given axes of
x.
"""
x = tf.cast(x, dtype=tf.float32)
x_max = tf.reduce_max(x, axis=axis, keep_dims=True)
ret = tf.log(tf.reduce_sum(tf.exp(x - x_max), axis=axis,
keep_dims=True)) + x_max
if not keep_dims:
ret = tf.reduce_sum(ret, axis=axis)  # collapses the size-1 dims kept above
return ret
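# The same max-shift trick in plain NumPy for reference (a sketch, not part of the
# original module): subtracting the per-axis max keeps every exponent <= 0, so
# exp() cannot overflow, and the max is added back in log space.
import numpy as np

def log_sum_exp_np(x, axis=None, keepdims=False):
    # axis: None or an int (kept simple for this sketch)
    x = np.asarray(x, dtype=np.float64)
    x_max = np.max(x, axis=axis, keepdims=True)
    out = np.log(np.sum(np.exp(x - x_max), axis=axis, keepdims=True)) + x_max
    return out if keepdims else np.squeeze(out, axis=axis)

# log_sum_exp_np([1000.0, 1000.0]) == 1000.0 + log(2), whereas the naive
# np.log(np.sum(np.exp(x))) overflows to inf.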
def _log_prob(self, given):
logits, temperature = self.path_param(self.logits), \
self.path_param(self.temperature)
log_given = tf.log(given)
log_temperature = tf.log(temperature)
n = tf.cast(self.n_categories, self.dtype)
if self._check_numerics:
log_given = tf.check_numerics(log_given, "log(given)")
log_temperature = tf.check_numerics(
log_temperature, "log(temperature)")
temp = logits - temperature * log_given
return tf.lgamma(n) + (n - 1) * log_temperature + \
tf.reduce_sum(temp - log_given, axis=-1) - \
n * tf.reduce_logsumexp(temp, axis=-1)
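# For reference, this matches the Concrete (Gumbel-Softmax) density of Maddison et al.
# (2016), assuming n categories, temperature lambda and logits log(alpha):
#   log p(y) = log Gamma(n) + (n - 1) * log(lambda)
#              + sum_k [log(alpha_k) - (lambda + 1) * log(y_k)]
#              - n * logsumexp_k [log(alpha_k) - lambda * log(y_k)]
# where `temp` above is exactly log(alpha_k) - lambda * log(y_k), so the final
# reduce_logsumexp supplies the normalizing term.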
def traditional_transition_loss_pred(self, i, j, combined_head, combined_dep):
rel_trans_feat_ids = self.trans_feat_ids[i*self.args.beam_size+j] if not self.train else self.trans_feat_ids[i, j]
rel_head = tf.reshape(tf.gather(combined_head, rel_trans_feat_ids[:4]), [4, self.args.rel_emb_dim])
rel_dep = tf.reshape(tf.gather(combined_dep, rel_trans_feat_ids[:4]), [4, self.args.rel_emb_dim])
mask = tf.cast(tf.reshape(tf.greater_equal(rel_trans_feat_ids[:4], 0), [4,1]), tf.float32)
rel_head = tf.multiply(mask, rel_head)
rel_dep = tf.multiply(mask, rel_dep)
rel_hid = self.rel_merge(rel_head, rel_dep)
rel_logit = self.rel_dense(tf.reshape(rel_hid, [1, -1]))
rel_logit = tf.reshape(rel_logit, [-1])
log_partition = tf.reduce_logsumexp(rel_logit)
if self.train:
res = log_partition - rel_logit[self.trans_labels[i, j]]
return res
else:
arc_pred = log_partition - rel_logit  # negative log-probabilities over candidate transitions
return arc_pred
def pos_loss_pred(self, i, pos_embeddings, pos_logit, NUM_POS, gold_pos, pos_trainables):
if self.args.no_pos:
pos_emb = tf.nn.embedding_lookup(pos_embeddings, gold_pos[i])
if self.train:
return 0, pos_emb
else:
return tf.gather(gold_pos[i], tf.range(1, self.sent_length)), pos_emb
else:
pos_logit = pos_logit[1:]
log_partition = tf.reduce_logsumexp(pos_logit, [1])
pos_pred = tf.exp(pos_logit - tf.reshape(log_partition, (-1, 1)))
pos_emb = tf.concat([tf.reshape(tf.nn.embedding_lookup(pos_embeddings, NUM_POS), (1, -1)),
tf.matmul(pos_pred, pos_trainables)], 0)
if self.train:
loss = tf.reduce_sum(tf.gather(log_partition, tf.range(self.sent_lengths[i]-1))
- tf.gather(tf.reshape(pos_logit, [-1]),
tf.range(self.sent_lengths[i]-1) * NUM_POS
+ tf.gather(gold_pos[i], tf.range(1, self.sent_lengths[i]))))
return loss, pos_emb
else:
return tf.cast(tf.argmax(pos_pred, 1), tf.int32), pos_emb
def optimized_loss(self, targets, logits):
""" Function that computes the loss of a mixture density network
in a way that it handles underflow and overflow and avoids unstable
behaviors """
# Obtain parameters
mixings, sigma, mean = self.logits_to_params(logits)
output_size = tf.cast(tf.shape(targets)[1], tf.float32)
variance = tf.square(sigma)
# Convert expressions into exponent-based terms
mixings_exp = tf.log(mixings)
# By properties of logarithm we can simplify the original expression
# log(x/y) = log(x) - log(y), log(xy) = log(x) + log(y), log(1) = 0
sqrt_exp = - output_size * (0.5 * tf.log(2*np.pi) + tf.log(sigma))
gaussian_exp = -tf.divide(tf.square(targets - mean), 2 * variance)
exponent = mixings_exp + sqrt_exp + gaussian_exp
# Use optimized logsumexp function to control underflow/overflow
return tf.reduce_logsumexp(exponent, axis=1)
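# Written out, the value returned above is the per-example log-likelihood of the mixture,
#   log p(t) = logsumexp_k [ log(pi_k) + log N(t | mu_k, sigma_k^2) ]
# with log N expanded into sqrt_exp + gaussian_exp so everything stays in log space and
# the single reduce_logsumexp replaces an unstable sum of exponentials. Note that this is
# a log-likelihood, not yet a loss; a training objective would typically negate and
# average it, e.g. -tf.reduce_mean(optimized_loss(targets, logits)).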
def weighted_sum(components, weights, scope=""):
# n: num_components
# b: batch_size
# c: component_size
with tf.name_scope(scope):
weight_is_batched = (weights.get_shape().ndims == 2)
if weight_is_batched:
set_batch_size = tf.shape(weights)[0]
else:
set_batch_size = None
components, is_batched = make_batch_consistent(components, set_batch_size=set_batch_size)
components = tf.pack(components) # [n x b x c]
weight_rank = weights.get_shape().ndims
assert_rank_1_or_2(weight_rank)
if weight_rank == 1:
weights = tf.reshape(weights, [-1,1,1]) # [n x 1 x 1]
elif weight_rank == 2:
weights = tf.expand_dims(tf.transpose(weights, [1, 0]),2) # [n x b x 1]
components += weights
# TODO: change this to tf.reduce_logsumexp when it is released
w_sum = logsumexp(components, reduction_indices=0) # [b x c]
if not is_batched: w_sum = tf.squeeze(w_sum) # [c]
return w_sum
def tf_parameterize(self, x):
# Flat logits
logits = self.logits.apply(x=x)
# Reshape logits to action shape
shape = (-1,) + self.shape + (self.num_actions,)
logits = tf.reshape(tensor=logits, shape=shape)
# State value as the log-partition function of the logits (a smooth maximum)
state_value = tf.reduce_logsumexp(input_tensor=logits, axis=-1)
# Softmax for corresponding probabilities
probabilities = tf.nn.softmax(logits=logits, dim=-1)
# Min epsilon probability for numerical stability
probabilities = tf.maximum(x=probabilities, y=util.epsilon)
# "Normalized" logits
logits = tf.log(x=probabilities)
return logits, probabilities, state_value
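# Small NumPy check (illustrative, separate from the class above): the logsumexp
# state value is a smooth upper bound on the greedy value,
#   max(q) <= logsumexp(q) <= max(q) + log(num_actions).
import numpy as np
from scipy.special import logsumexp

q = np.array([1.0, 2.5, 0.3])
assert np.max(q) <= logsumexp(q) <= np.max(q) + np.log(q.size)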
def get_probs_and_accuracy(preds,O):
"""
helper function. we have a prediction for each MC sample of each observation
in this batch. need to distill the multiple preds from each MC into a single
pred for this observation. also get accuracy. use true probs to get ROC, PR curves in sklearn
"""
all_probs = tf.exp(preds[:,1] - tf.reduce_logsumexp(preds, axis = 1)) #normalize; and drop a dim so only prob of positive case
N = tf.cast(tf.shape(preds)[0]/n_mc_smps,tf.int32) #actual number of observations in preds, collapsing MC samples
#predicted probability per observation; collapse the MC samples
probs = tf.zeros([0]) #store all samples in a list, then concat into tensor at end
#setup tf while loop (have to use this because the loop size is variable)
def cond(i,probs):
return i < N
def body(i,probs):
probs = tf.concat([probs,[tf.reduce_mean(tf.slice(all_probs,[i*n_mc_smps],[n_mc_smps]))]],0)
return i+1,probs
i = tf.constant(0)
i,probs = tf.while_loop(cond,body,loop_vars=[i,probs],shape_invariants=[i.get_shape(),tf.TensorShape([None])])
#compare to truth; just use cutoff of 0.5 for right now to get accuracy
correct_pred = tf.equal(tf.cast(tf.greater(probs,0.5),tf.int32), O)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
return probs,accuracy
def logsumexp(x, axis=None, keepdims=False):
"""Computes log(sum(exp(elements across dimensions of a tensor))).
This function is more numerically stable than log(sum(exp(x))).
It avoids overflows caused by taking the exp of large inputs and
underflows caused by taking the log of small inputs.
# Arguments
x: A tensor or variable.
axis: An integer, the axis to reduce over.
keepdims: A boolean, whether to keep the dimensions or not.
If `keepdims` is `False`, the rank of the tensor is reduced
by 1. If `keepdims` is `True`, the reduced dimension is
retained with length 1.
# Returns
The reduced tensor.
"""
axis = _normalize_axis(axis, ndim(x))
return tf.reduce_logsumexp(x, reduction_indices=axis, keep_dims=keepdims)
def predict_density(self, Xnew, Ynew, num_samples):
Fmean, Fvar = self.build_predict(Xnew, full_cov=False, S=num_samples)
S, N, D = shape_as_list(Fmean)
Ynew = tile_over_samples(Ynew, num_samples)
flat_arrays = [tf.reshape(a, [S*N, -1]) for a in [Fmean, Fvar, Ynew]]
l_flat = self.likelihood.predict_density(*flat_arrays)
l = tf.reshape(l_flat, [S, N, -1])
log_num_samples = tf.log(tf.cast(num_samples, float_type))
return tf.reduce_logsumexp(l - log_num_samples, axis=0)
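# The identity used above, written out: with l[s] = log p(Ynew | f_s) for S samples,
#   log( (1/S) * sum_s exp(l[s]) ) = logsumexp_s(l[s]) - log(S)
# and subtracting log(S) inside every term before the logsumexp (as done here) is
# the same as subtracting it once outside.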
def logsumexp(x, axis=None):
'''Returns `log(sum(exp(x), axis=axis))` with improved numerical stability.
'''
return tf.reduce_logsumexp(x, axis=[axis])
def _log_prob(self, given):
given = tf.cast(given, self.param_dtype)
given, logits = maybe_explicit_broadcast(
given, self.logits, 'given', 'logits')
normalized_logits = logits - tf.reduce_logsumexp(
logits, axis=-1, keep_dims=True)
n = tf.cast(self.n_experiments, self.param_dtype)
log_p = log_combination(n, given) + \
tf.reduce_sum(given * normalized_logits, -1)
return log_p
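# Written out, this is the multinomial log-pmf with probabilities taken from a
# log-softmax of the logits:
#   log p(x) = log( n! / (x_1! ... x_K!) ) + sum_k x_k * log_softmax(logits)_k
# `log_combination(n, given)` supplies the first term, and the
# `logits - reduce_logsumexp(logits)` expression above is the log-softmax.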
def _log_prob(self, given):
logits, temperature = self.path_param(self.logits),\
self.path_param(self.temperature)
n = tf.cast(self.n_categories, self.dtype)
log_temperature = tf.log(temperature)
if self._check_numerics:
log_temperature = tf.check_numerics(
log_temperature, "log(temperature)")
temp = logits - temperature * given
return tf.lgamma(n) + (n - 1) * log_temperature + \
tf.reduce_sum(temp, axis=-1) - \
n * tf.reduce_logsumexp(temp, axis=-1)
stochastic_discrete_mlp_q_function.py (project: rllabplusplus, author: shaneshixiang)
def init_policy(self):
output_vec = L.get_output(self._output_vec_layer, deterministic=True) / self._c
prob = tf.nn.softmax(output_vec)
max_qval = tf.reduce_logsumexp(output_vec, [1])
self._f_prob = tensor_utils.compile_function([self._obs_layer.input_var], prob)
self._f_max_qvals = tensor_utils.compile_function([self._obs_layer.input_var], max_qval)
self._dist = Categorical(self._n)
def log_prob_from_logits(logits):
return logits - tf.reduce_logsumexp(logits, axis=2, keep_dims=True)
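# Quick NumPy sanity check (illustrative) that this equals log(softmax(logits))
# along the normalized axis:
import numpy as np
from scipy.special import logsumexp, softmax

logits = np.random.randn(2, 3, 5)
log_probs = logits - logsumexp(logits, axis=2, keepdims=True)
assert np.allclose(log_probs, np.log(softmax(logits, axis=2)))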
def build_vimco_loss(cfg, l, log_q_h):
"""Builds negative VIMCO loss as in the paper.
Reference: Variational Inference for Monte Carlo Objectives, Algorithm 1
https://arxiv.org/abs/1602.06725
"""
k, b = l.get_shape().as_list() # n_samples, batch_size
kf = tf.cast(k, tf.float32)
if cfg['optim/geometric_mean']:
# implicit multi-sample objective (importance-sampled ELBO)
l_logsumexp = tf.reduce_logsumexp(l, [0], keep_dims=True)
L_hat = l_logsumexp - tf.log(kf)
else:
# standard ELBO
L_hat = tf.reduce_mean(l, [0], keep_dims=True)
s = tf.reduce_sum(l, 0, keep_dims=True)
diag_mask = tf.expand_dims(tf.diag(tf.ones([k], dtype=tf.float32)), -1)
off_diag_mask = 1. - diag_mask
diff = tf.expand_dims(s - l, 0) # expand for proper broadcasting
l_i_diag = 1. / (kf - 1.) * diff * diag_mask
l_i_off_diag = off_diag_mask * tf.stack([l] * k)
l_i = l_i_diag + l_i_off_diag
if cfg['optim/geometric_mean']:
L_hat_minus_i = tf.reduce_logsumexp(l_i, [1]) - tf.log(kf)
w = tf.stop_gradient(tf.exp((l - l_logsumexp)))
else:
L_hat_minus_i = tf.reduce_mean(l_i, [1])
w = 1.
local_l = tf.stop_gradient(L_hat - L_hat_minus_i)
if not cfg['optim/geometric_mean']:
# correction factor for multiplying by 1. / (kf - 1.) above
# to verify this, work out 2x2 matrix of samples by hand
local_l = local_l * k
loss = local_l * log_q_h + w * l
return loss / tf.to_float(b)
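# For the geometric-mean branch, the learning signal assembled above follows
# Algorithm 1 of Mnih & Rezende (2016):
#   L_hat       = logsumexp_k(l_k) - log(K)                      (multi-sample bound)
#   L_hat^{-i}  = the same bound with l_i replaced by the mean of the other l_j
#   signal_i    = stop_gradient(L_hat - L_hat^{-i})
# and the returned quantity is (signal_i * log q(h_i | x) + w_i * l_i) / batch_size,
# with w_i = exp(l_i - logsumexp(l)) the self-normalized importance weights.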
def softmax_loss(self, antecedent_scores, antecedent_labels):
gold_scores = antecedent_scores + tf.log(tf.to_float(antecedent_labels)) # [num_mentions, max_ant + 1]
marginalized_gold_scores = tf.reduce_logsumexp(gold_scores, [1]) # [num_mentions]
log_norm = tf.reduce_logsumexp(antecedent_scores, [1]) # [num_mentions]
return log_norm - marginalized_gold_scores # [num_mentions]
def segment_logsumexp(xs, segments):
""" Similar tf.segment_sum but compute logsumexp rather then sum """
# Stop gradients following the implementation of tf.reduce_logsumexp
maxs = tf.stop_gradient(tf.reduce_max(xs, axis=1))
segment_maxes = tf.segment_max(maxs, segments)
xs -= tf.expand_dims(tf.gather(segment_maxes, segments), 1)
sums = tf.reduce_sum(tf.exp(xs), axis=1)
return tf.log(tf.segment_sum(sums, segments)) + segment_maxes
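# Illustrative usage with hypothetical values: rows of `xs` are grouped by `segments`,
# and each group is reduced with one numerically stable logsumexp over all of its entries.
import numpy as np
from scipy.special import logsumexp

xs_np = np.array([[0., 1.], [2., 3.], [4., 5.]])
segments_np = np.array([0, 0, 1])
expected = np.array([logsumexp(xs_np[:2]), logsumexp(xs_np[2:])])  # what segment_logsumexp(xs, segments) returns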
def predict(self, answer, start_logits, end_logits, mask) -> Prediction:
masked_start_logits = exp_mask(start_logits, mask)
masked_end_logits = exp_mask(end_logits, mask)
if len(answer) == 1:
# answer span is encoded as a sparse int array
answer_spans = answer[0]
losses1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=masked_start_logits, labels=answer_spans[:, 0])
losses2 = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=masked_end_logits, labels=answer_spans[:, 1])
loss = tf.add_n([tf.reduce_mean(losses1), tf.reduce_mean(losses2)], name="loss")
elif len(answer) == 2 and all(x.dtype == tf.bool for x in answer):
# all correct start/end bounds are marked in a dense bool array
# In this case there might be multiple answer spans, so we need an aggregation strategy
losses = []
for answer_mask, logits in zip(answer, [masked_start_logits, masked_end_logits]):
log_norm = tf.reduce_logsumexp(logits, axis=1)
if self.aggregate == "sum":
log_score = tf.reduce_logsumexp(logits +
VERY_NEGATIVE_NUMBER * (1 - tf.cast(answer_mask, tf.float32)),
axis=1)
elif self.aggregate == "max":
log_score = tf.reduce_max(logits +
VERY_NEGATIVE_NUMBER * (1 - tf.cast(answer_mask, tf.float32)), axis=1)
else:
raise ValueError()
losses.append(tf.reduce_mean(-(log_score - log_norm)))
loss = tf.add_n(losses)
else:
raise NotImplementedError()
tf.add_to_collection(tf.GraphKeys.LOSSES, loss)
return BoundaryPrediction(tf.nn.softmax(masked_start_logits),
tf.nn.softmax(masked_end_logits),
masked_start_logits, masked_end_logits, mask)
def predict(self, answer, start_logits, end_logits, mask) -> Prediction:
masked_start_logits = exp_mask(start_logits, mask)
masked_end_logits = exp_mask(end_logits, mask)
batch_dim = tf.shape(start_logits)[0]
if len(answer) == 2 and all(x.dtype == tf.bool for x in answer):
none_logit = tf.get_variable("none-logit", initializer=self.non_init, dtype=tf.float32)
none_logit = tf.tile(tf.expand_dims(none_logit, 0), [batch_dim])
all_logits = tf.reshape(tf.expand_dims(masked_start_logits, 1) +
tf.expand_dims(masked_end_logits, 2),
(batch_dim, -1))
# (batch, (l * l) + 1) logits including the none option
all_logits = tf.concat([all_logits, tf.expand_dims(none_logit, 1)], axis=1)
log_norms = tf.reduce_logsumexp(all_logits, axis=1)
# Now build a "correctness" mask in the same format
correct_mask = tf.logical_and(tf.expand_dims(answer[0], 1), tf.expand_dims(answer[1], 2))
correct_mask = tf.reshape(correct_mask, (batch_dim, -1))
correct_mask = tf.concat([correct_mask, tf.logical_not(tf.reduce_any(answer[0], axis=1, keep_dims=True))],
axis=1)
log_correct = tf.reduce_logsumexp(
all_logits + VERY_NEGATIVE_NUMBER * (1 - tf.cast(correct_mask, tf.float32)), axis=1)
loss = tf.reduce_mean(-(log_correct - log_norms))
probs = tf.nn.softmax(all_logits)
tf.add_to_collection(tf.GraphKeys.LOSSES, loss)
return ConfidencePrediction(probs[:, :-1], masked_start_logits, masked_end_logits,
probs[:, -1], none_logit)
else:
raise NotImplementedError()
def logsumexp(v, reduction_indices=None, keep_dims=False):
if float(tf.__version__[:4]) > 0.10: # reduce_logsumexp does not exist below tfv0.11
if isinstance(reduction_indices, int): # due to a bug in tfv0.11
reduction_indices = [reduction_indices]
return handle_inf(
tf.reduce_logsumexp(v,
reduction_indices, # this is a bit fragile. reduction_indices got renamed to axis in tfv0.12
keep_dims=keep_dims)
)
else:
m = tf.reduce_max(v, reduction_indices=reduction_indices, keep_dims=keep_dims)
# Use SMALL_NUMBER to handle v = []
return m + tf.log(tf.reduce_sum(tf.exp(v - m),
reduction_indices=reduction_indices,
keep_dims=keep_dims) + SMALL_NUMBER)
def log_prob_from_logits(logits):
return logits - tf.reduce_logsumexp(logits, axis=2, keep_dims=True)
def log_prob_from_logits(logits):
return logits - tf.reduce_logsumexp(logits, axis=2, keep_dims=True)
def log_prob_from_logits(logits):
"""Softmax function."""
return logits - tf.reduce_logsumexp(logits, keep_dims=True)
def gm_log_p(params_out, x_target, dim):
""" computes log probability of target in Gaussian mixture with given parameters """
mean_x, cov_x, pi_x_logit = params_out
pi_x = tf.nn.softmax(pi_x_logit)
mean_x = tf.transpose(mean_x, perm=[1, 0, 2])
cov_x = tf.transpose(cov_x, perm=[1, 0, 2])
pi_x = tf.transpose(pi_x, perm=[1, 0])
x_diff = x_target - mean_x
x_square = tf.reduce_sum((x_diff / cov_x) * x_diff, axis=[2])
log_x_exp = -0.5 * x_square
log_cov_x_det = tf.reduce_sum(tf.log(cov_x), axis=[2])
log_x_norm = -0.5 * (dim * tf.log(2 * np.pi) + log_cov_x_det) + tf.log(pi_x)  # mixture weights enter the logsumexp in log space
log_p = tf.reduce_logsumexp(log_x_norm + log_x_exp, axis=[0])
return log_p, log_x_norm, log_x_exp, tf.abs(x_diff)
def log_prob_from_logits(logits):
return logits - tf.reduce_logsumexp(logits, axis=2, keep_dims=True)
def lookup(self, symbol):
if symbol is None:
return None
if type(symbol) == type([]):
return [self.lookup(k) for k in symbol]
if type(symbol) == type({}) or type(symbol) == hc.Config:
return hc.Config({k: self.lookup(symbol[k]) for k in symbol.keys()})
if type(symbol) != type(""):
return symbol
if symbol.startswith('function:'):
return self.lookup_function(symbol)
if symbol.startswith('class:'):
return self.lookup_class(symbol)
if symbol == 'tanh':
return tf.nn.tanh
if symbol == 'sigmoid':
return tf.nn.sigmoid
if symbol == 'batch_norm':
return layer_regularizers.batch_norm_1
if symbol == 'layer_norm':
return layer_regularizers.layer_norm_1
if symbol == "crelu":
return tf.nn.crelu
if symbol == "prelu":
return self.prelu()
if symbol == "selu":
return selu
if symbol == "lrelu":
return lrelu
if symbol == "relu":
return tf.nn.relu
if symbol == 'square':
return tf.square
if symbol == 'reduce_mean':
return tf.reduce_mean
if symbol == 'reduce_min':
return tf.reduce_min
if symbol == 'reduce_sum':
return tf.reduce_sum
if symbol == 'reduce_logsumexp':
return tf.reduce_logsumexp
if symbol == 'reduce_linear':
return self.reduce_linear()
if symbol == 'l1_distance':
return l1_distance
if symbol == 'l2_distance':
return l2_distance
return symbol