# assumes module-level imports: import numpy as np; from scipy.spatial.distance import pdist, squareform
def svgd_kernel(self, h=-1):
    sq_dist = pdist(self.theta)
    pairwise_dists = squareform(sq_dist) ** 2
    if h < 0:  # if h < 0, use the median trick to choose the bandwidth
        h = np.median(pairwise_dists)
        h = np.sqrt(0.5 * h / np.log(self.theta.shape[0] + 1))
    # compute the rbf kernel
    Kxy = np.exp(-pairwise_dists / h ** 2 / 2)
    dxkxy = -np.matmul(Kxy, self.theta)
    sumkxy = np.sum(Kxy, axis=1)
    for i in range(self.theta.shape[1]):
        dxkxy[:, i] = dxkxy[:, i] + np.multiply(self.theta[:, i], sumkxy)
    dxkxy = dxkxy / (h ** 2)
    return (Kxy, dxkxy)
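# A minimal standalone NumPy sketch (not part of the original class) of the same
# median-trick RBF kernel and its repulsive gradient term; rbf_kernel_median is
# a hypothetical helper name, theta is a particle matrix of shape (n, d).
import numpy as np
from scipy.spatial.distance import pdist, squareform

def rbf_kernel_median(theta, h=-1):
    pairwise_dists = squareform(pdist(theta)) ** 2
    if h < 0:  # median trick: h^2 = median(dist^2) / (2 log(n + 1))
        h = np.median(pairwise_dists)
        h = np.sqrt(0.5 * h / np.log(theta.shape[0] + 1))
    Kxy = np.exp(-pairwise_dists / (2 * h ** 2))
    # vectorised form of the per-column loop above:
    # sum_j grad_{x_j} k(x_j, x_i) = (x_i * sum_j k(x_j, x_i) - K x) / h^2
    dxkxy = (-Kxy @ theta + theta * Kxy.sum(axis=1, keepdims=True)) / h ** 2
    return Kxy, dxkxy

theta = np.random.randn(5, 2)          # 5 particles in 2-D
Kxy, dxkxy = rbf_kernel_median(theta)
assert Kxy.shape == (5, 5) and dxkxy.shape == (5, 2)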
def nll_loss_sharedparams(self, mus, sigmas, corxy, pis, y_true):
    mus_ex = mus[np.newaxis, :, :]
    X = y_true[:, np.newaxis, :]
    diff = X - mus_ex
    diffprod = T.prod(diff, axis=-1)
    corxy2 = corxy ** 2
    diff2 = diff ** 2
    sigmas2 = sigmas ** 2
    sigmainvs = 1.0 / sigmas
    sigmainvprods = sigmainvs[:, 0] * sigmainvs[:, 1]
    diffsigma = diff2 / sigmas2
    diffsigmanorm = T.sum(diffsigma, axis=-1)
    z = diffsigmanorm - 2 * corxy * diffprod * sigmainvprods
    oneminuscorxy2inv = 1.0 / (1.0 - corxy2)
    expterm = -0.5 * z * oneminuscorxy2inv
    new_exponent = (T.log(0.5 / np.pi) + T.log(sigmainvprods)
                    + T.log(T.sqrt(oneminuscorxy2inv)) + expterm + T.log(pis))
    max_exponent = T.max(new_exponent, axis=1, keepdims=True)
    mod_exponent = new_exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=1)
    log_gauss = max_exponent + T.log(gauss_mix)
    loss = -T.mean(log_gauss)
    return loss
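# A NumPy sketch (hypothetical mixture_nll helper, single observation) of the
# same bivariate-Gaussian mixture negative log-likelihood with the
# log-sum-exp trick, checked against scipy.stats.multivariate_normal.
import numpy as np
from scipy.stats import multivariate_normal

def mixture_nll(mus, sigmas, corxy, pis, y):
    # mus: (K, 2), sigmas: (K, 2), corxy: (K,), pis: (K,), y: (2,)
    dx, dy = y[0] - mus[:, 0], y[1] - mus[:, 1]
    sx, sy = sigmas[:, 0], sigmas[:, 1]
    one_minus_r2 = 1.0 - corxy ** 2
    z = dx ** 2 / sx ** 2 + dy ** 2 / sy ** 2 - 2 * corxy * dx * dy / (sx * sy)
    log_comp = (np.log(0.5 / np.pi) - np.log(sx * sy)
                - 0.5 * np.log(one_minus_r2) - 0.5 * z / one_minus_r2
                + np.log(pis))
    m = log_comp.max()                       # max-subtraction (log-sum-exp) trick
    return -(m + np.log(np.exp(log_comp - m).sum()))

rng = np.random.default_rng(0)
K = 3
mus = rng.normal(size=(K, 2))
sigmas = rng.uniform(0.5, 1.5, size=(K, 2))
corxy = rng.uniform(-0.5, 0.5, size=K)
pis = np.ones(K) / K
y = np.array([0.3, -0.2])

ref = 0.0
for k in range(K):
    c = corxy[k] * sigmas[k, 0] * sigmas[k, 1]
    cov = np.array([[sigmas[k, 0] ** 2, c], [c, sigmas[k, 1] ** 2]])
    ref += pis[k] * multivariate_normal(mus[k], cov).pdf(y)
assert np.isclose(mixture_nll(mus, sigmas, corxy, pis, y), -np.log(ref))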
def rbf_kernel(X0):
    XY = T.dot(X0, X0.transpose())
    x2 = T.reshape(T.sum(T.square(X0), axis=1), (X0.shape[0], 1))
    X2e = T.repeat(x2, X0.shape[0], axis=1)
    H = T.sub(T.add(X2e, X2e.transpose()), 2 * XY)
    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])
    h = T.sqrt(0.5 * h / T.log(X0.shape[0].astype('float32') + 1.0)) / 2.
    Kxy = T.exp(-H / h ** 2 / 2.0)
    neighbors = T.argsort(H, axis=1)[:, 1]
    return Kxy, neighbors, h
def rbf_kernel(X):
    XY = T.dot(X, X.T)
    x2 = T.sum(X ** 2, axis=1).dimshuffle(0, 'x')
    X2e = T.repeat(x2, X.shape[0], axis=1)
    H = X2e + X2e.T - 2. * XY
    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])
    h = T.sqrt(.5 * h / T.log(H.shape[0].astype('float32') + 1.))
    # compute the rbf kernel
    kxy = T.exp(-H / (h ** 2) / 2.0)
    dxkxy = -T.dot(kxy, X)
    sumkxy = T.sum(kxy, axis=1).dimshuffle(0, 'x')
    dxkxy = T.add(dxkxy, T.mul(X, sumkxy)) / (h ** 2)
    return kxy, dxkxy
def evaluation(self, X_test, y_test):
    # normalization
    X_test = self.normalization(X_test)
    # average over the output
    pred_y_test = np.zeros([self.M, len(y_test)])
    prob = np.zeros([self.M, len(y_test)])
    '''
    Since we have M particles, we use a Bayesian view to calculate rmse and log-likelihood
    '''
    for i in range(self.M):
        w1, b1, w2, b2, loggamma, loglambda = self.unpack_weights(self.theta[i, :])
        pred_y_test[i, :] = self.nn_predict(X_test, w1, b1, w2, b2) * self.std_y_train + self.mean_y_train
        prob[i, :] = np.sqrt(np.exp(loggamma)) / np.sqrt(2 * np.pi) * np.exp(-1 * (np.power(pred_y_test[i, :] - y_test, 2) / 2) * np.exp(loggamma))
    pred = np.mean(pred_y_test, axis=0)
    # evaluation
    svgd_rmse = np.sqrt(np.mean((pred - y_test) ** 2))
    svgd_ll = np.mean(np.log(np.mean(prob, axis=0)))
    return (svgd_rmse, svgd_ll)
def matrix2att(self, matrix):
    '''Input is a matrix of shape (batch_size, 5) in Theano terms'''
    g_hat_x = matrix[:, 0]
    g_hat_y = matrix[:, 1]
    log_delta = matrix[:, 2]
    log_sigma_sqr = matrix[:, 3]
    log_gamma = matrix[:, 4]
    g_x = (self.A + 1.0) / 2.0 * (g_hat_x + 1.0)
    g_y = (self.B + 1.0) / 2.0 * (g_hat_y + 1.0)
    delta = (max(self.A, self.B) - 1.0) / (self.N - 1) * T.exp(log_delta)
    gamma = T.exp(log_gamma).dimshuffle(0, 'x')
    sigma = T.exp(log_sigma_sqr / 2.0)
    return g_y, g_x, delta, sigma, gamma
def matrix2att_cpu(self, matrix):
    '''Input is a matrix of shape (batch_size, 5) in numpy terms'''
    g_hat_x = matrix[:, 0]
    g_hat_y = matrix[:, 1]
    log_delta = matrix[:, 2]
    log_sigma_sqr = matrix[:, 3]
    log_gamma = matrix[:, 4]
    g_x = (self.A + 1.0) / 2.0 * (g_hat_x + 1.0)
    g_y = (self.B + 1.0) / 2.0 * (g_hat_y + 1.0)
    delta = (max(self.A, self.B) - 1.0) / (self.N - 1) * np.exp(log_delta)
    gamma = np.exp(log_gamma)
    sigma = np.exp(log_sigma_sqr / 2.0)
    return g_y, g_x, delta, sigma, gamma
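# A small NumPy example with made-up values showing how the code above maps
# network outputs to DRAW-style attention parameters; A, B are an assumed
# image size and N an assumed grid size.  The filter-centre layout at the end
# is the usual DRAW convention, not part of the snippet above.
import numpy as np

A, B, N = 28, 28, 5
matrix = np.array([[0.0, 0.0, np.log(1.0), np.log(0.25), 0.0]])  # one sample

g_hat_x, g_hat_y, log_delta, log_sigma_sqr, log_gamma = matrix.T
g_x = (A + 1.0) / 2.0 * (g_hat_x + 1.0)                   # grid centre in x
g_y = (B + 1.0) / 2.0 * (g_hat_y + 1.0)                   # grid centre in y
delta = (max(A, B) - 1.0) / (N - 1) * np.exp(log_delta)   # stride between filters
sigma = np.exp(log_sigma_sqr / 2.0)                       # isotropic filter width
gamma = np.exp(log_gamma)                                 # scalar intensity

# one common layout of the N filter centres along x (DRAW-style)
mu_x = g_x[:, None] + (np.arange(N) - N / 2.0 + 0.5) * delta[:, None]
print(g_x, g_y, delta, sigma, gamma, mu_x)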
def iou_loss(p, t):
    # print "pass"
    tp, tt = p.reshape((p.shape[0], 2, 2)), t.reshape((t.shape[0], 2, 2))
    overlaps_t0 = T.maximum(tp[:, 0, :], tt[:, 0, :])
    overlaps_t1 = T.minimum(tp[:, 1, :], tt[:, 1, :])
    intersection = overlaps_t1 - overlaps_t0
    bool_overlap = T.min(intersection, axis=1) > 0
    intersection = intersection[:, 0] * intersection[:, 1]
    intersection = T.maximum(intersection, np.float32(0.))
    dims_p = tp[:, 1, :] - tp[:, 0, :]
    areas_p = dims_p[:, 0] * dims_p[:, 1]
    dims_t = tt[:, 1, :] - tt[:, 0, :]
    areas_t = dims_t[:, 0] * dims_t[:, 1]
    union = areas_p + areas_t - intersection
    loss = 1. - T.minimum(
        T.exp(T.log(T.abs_(intersection)) -
              T.log(T.abs_(union) + np.float32(1e-5))),
        np.float32(1.)
    )
    # return loss
    return T.mean(loss)
def iou_loss_val(p, t):
    tp, tt = p.reshape((p.shape[0], 2, 2)), t.reshape((t.shape[0], 2, 2))
    overlaps = np.zeros_like(tp, dtype=np.float32)
    overlaps[:, 0, :] = np.maximum(tp[:, 0, :], tt[:, 0, :])
    overlaps[:, 1, :] = np.minimum(tp[:, 1, :], tt[:, 1, :])
    intersection = overlaps[:, 1, :] - overlaps[:, 0, :]
    bool_overlap = np.min(intersection, axis=1) > 0
    intersection = intersection[:, 0] * intersection[:, 1]
    intersection = np.maximum(intersection, 0.)
    # print "bool", bool_overlap
    # print "Int", intersection
    dims_p = tp[:, 1, :] - tp[:, 0, :]
    areas_p = dims_p[:, 0] * dims_p[:, 1]
    dims_t = tt[:, 1, :] - tt[:, 0, :]
    areas_t = dims_t[:, 0] * dims_t[:, 1]
    union = areas_p + areas_t - intersection
    # print "un", union
    loss = 1. - np.minimum(
        np.exp(np.log(np.abs(intersection)) - np.log(np.abs(union) + 1e-5)),
        1.
    )
    # print loss
    return np.mean(loss)
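# A quick NumPy check of iou_loss_val on two concrete boxes, each encoded with
# the min corner first and the max corner second (as the reshape above assumes):
# a perfect match gives a loss near 0, partial overlap something in between.
import numpy as np

box_a = np.array([[0., 0., 2., 2.]], dtype=np.float32)   # area 4
box_b = np.array([[1., 1., 3., 3.]], dtype=np.float32)   # area 4, overlap 1
print(iou_loss_val(box_a, box_b))   # 1 - 1 / (4 + 4 - 1) ~= 0.857
print(iou_loss_val(box_a, box_a))   # ~= 0 (IoU ~ 1, up to the 1e-5 smoothing)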
def get_output_for(self, input, **kwargs):
    """
    Given 2d input find the probability of each input in each of num_units
    diagonal Gaussians using the formula from http://mathworld.wolfram.com/BivariateNormalDistribution.html
    """
    # make sure sigma is positive and nonzero: softplus(x) maps to (0, +inf)
    sigmas = T.nnet.softplus(self.sigmas)
    sigmainvs = 1.0 / sigmas
    sigmainvprods = sigmainvs[:, 0] * sigmainvs[:, 1]
    sigmas2 = sigmas ** 2
    mus = self.mus[np.newaxis, :, :]
    X = input[:, np.newaxis, :]
    diff = (X - mus) ** 2
    diffsigma = diff / sigmas2
    diffsigmanorm = T.sum(diffsigma, axis=-1)
    expterm = T.exp(-0.5 * diffsigmanorm)
    probs = (0.5 / np.pi) * sigmainvprods * expterm
    return probs
def log_marginal(self, y, h, py, q):
    '''Computes the approximate log marginal.
    Uses \log \sum p / q - \log N
    Args:
        y: T.tensor, target values.
        h: T.tensor, latent samples.
        py: T.tensor, conditional density p(y | h)
        q: approximate posterior q(h | y)
    Returns:
        approximate log marginal.
    '''
    log_py_h = -self.conditional.neg_log_prob(y, py)
    log_ph = -self.prior.neg_log_prob(h)
    log_qh = -self.posterior.neg_log_prob(h, q)
    assert log_py_h.ndim == log_ph.ndim == log_qh.ndim
    log_p = log_py_h + log_ph - log_qh
    log_p_max = T.max(log_p, axis=0, keepdims=True)
    w = T.exp(log_p - log_p_max)
    return (T.log(w.mean(axis=0, keepdims=True)) + log_p_max).mean()
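# A NumPy sketch (hypothetical standalone helper) of the same importance-weighted
# estimate log p(y) ~= log mean_s [ p(y|h_s) p(h_s) / q(h_s|y) ], using the
# identical max-subtraction trick as log_marginal above.
import numpy as np

def iw_log_marginal(log_py_h, log_ph, log_qh):
    # each argument: log densities of shape (n_samples, n_data)
    log_p = log_py_h + log_ph - log_qh
    log_p_max = log_p.max(axis=0, keepdims=True)
    w = np.exp(log_p - log_p_max)
    return (np.log(w.mean(axis=0, keepdims=True)) + log_p_max).mean()

rng = np.random.default_rng(1)
log_py_h = rng.normal(-2.0, 0.5, size=(64, 10))
log_ph = rng.normal(-1.0, 0.5, size=(64, 10))
log_qh = rng.normal(-1.0, 0.5, size=(64, 10))
print(iw_log_marginal(log_py_h, log_ph, log_qh))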
def step_free_energy(self, x, beta, *params):
    '''Step free energy function.
    Args:
        x (T.tensor): data sample.
        beta (float): beta value for annealing.
        *params: theano shared variables.
    Returns:
        T.tensor: free energy.
    '''
    W, v_params, h_params = self.split_params(*params)
    vis_term = beta * self.v_dist.get_energy_bias(x, *v_params)
    x = self.v_dist.scale_for_energy_model(x, *v_params)
    hid_act = beta * (T.dot(x, W) + self.h_dist.get_center(*h_params))
    fe = -vis_term - T.log(1. + T.exp(hid_act)).sum(axis=1)
    return fe
def step_free_energy_h(self, h, beta, *params):
    '''Step free energy function for hidden states.
    Args:
        h (T.tensor): hidden sample.
        beta (float): beta value for annealing.
        *params: theano shared variables.
    Returns:
        T.tensor: free energy.
    '''
    W, v_params, h_params = self.split_params(*params)
    hid_term = beta * self.h_dist.get_energy_bias(h, *h_params)
    h = self.h_dist.scale_for_energy_model(h, *h_params)
    vis_act = beta * (T.dot(h, W.T) + self.v_dist.get_center(*v_params))
    fe = -hid_term - T.log(1. + T.exp(vis_act)).sum(axis=1)
    return fe
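# The T.log(1. + T.exp(...)) term in both free-energy functions is a softplus,
# which overflows in float32 for large activations.  A numerically stable NumPy
# equivalent is np.logaddexp(0, x) = log(exp(0) + exp(x)) = log(1 + exp(x))
# (Theano's T.nnet.softplus plays the same role symbolically):
import numpy as np

x = np.array([-50.0, 0.0, 50.0, 800.0])
print(np.log1p(np.exp(x)))      # naive form: overflows to inf at 800 (with a warning)
print(np.logaddexp(0.0, x))     # stable: ~0, log 2, 50, 800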
def ctc_update_log_p(skip_idxs, zeros, active, log_p_curr, log_p_prev):
    active_skip_idxs = skip_idxs[(skip_idxs < active).nonzero()]
    active_next = T.cast(T.minimum(
        T.maximum(
            active + 1,
            T.max(T.concatenate([active_skip_idxs, [-1]])) + 2 + 1
        ), log_p_curr.shape[0]), 'int32')
    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)
    _p_prev = zeros[:active_next]
    # copy over
    _p_prev = T.set_subtensor(_p_prev[:active], p_prev)
    # previous transitions
    _p_prev = T.inc_subtensor(_p_prev[1:], _p_prev[:-1])
    # skip transitions
    _p_prev = T.inc_subtensor(_p_prev[active_skip_idxs + 2], p_prev[active_skip_idxs])
    updated_log_p_prev = T.log(_p_prev) + common_factor
    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev
    )
    return active_next, log_p_next
def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
    old_means = old_dist_info_vars["mean"]
    old_log_stds = old_dist_info_vars["log_std"]
    new_means = new_dist_info_vars["mean"]
    new_log_stds = new_dist_info_vars["log_std"]
    """
    Compute the KL divergence of two multivariate Gaussian distributions with
    diagonal covariance matrices
    """
    old_std = TT.exp(old_log_stds)
    new_std = TT.exp(new_log_stds)
    # means: (N*A)
    # std: (N*A)
    # formula:
    # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
    # ln(\sigma_2/\sigma_1)
    numerator = TT.square(old_means - new_means) + \
        TT.square(old_std) - TT.square(new_std)
    denominator = 2 * TT.square(new_std) + 1e-8
    return TT.sum(
        numerator / denominator + new_log_stds - old_log_stds, axis=-1)
def kl(self, old_dist_info, new_dist_info):
    old_means = old_dist_info["mean"]
    old_log_stds = old_dist_info["log_std"]
    new_means = new_dist_info["mean"]
    new_log_stds = new_dist_info["log_std"]
    """
    Compute the KL divergence of two multivariate Gaussian distributions with
    diagonal covariance matrices
    """
    old_std = np.exp(old_log_stds)
    new_std = np.exp(new_log_stds)
    # means: (N*A)
    # std: (N*A)
    # formula:
    # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
    # ln(\sigma_2/\sigma_1)
    numerator = np.square(old_means - new_means) + \
        np.square(old_std) - np.square(new_std)
    denominator = 2 * np.square(new_std) + 1e-8
    return np.sum(
        numerator / denominator + new_log_stds - old_log_stds, axis=-1)
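# A one-dimensional NumPy check that the formula used in kl()/kl_sym() agrees
# with the textbook KL between two Gaussians,
#   KL(N(m1, s1^2) || N(m2, s2^2)) = log(s2/s1) + (s1^2 + (m1 - m2)^2) / (2 s2^2) - 1/2.
import numpy as np

m1, log_s1 = 0.3, np.log(0.8)
m2, log_s2 = -0.1, np.log(1.2)
s1, s2 = np.exp(log_s1), np.exp(log_s2)

code_kl = ((np.square(m1 - m2) + s1 ** 2 - s2 ** 2) / (2 * s2 ** 2 + 1e-8)
           + log_s2 - log_s1)
textbook_kl = np.log(s2 / s1) + (s1 ** 2 + (m1 - m2) ** 2) / (2 * s2 ** 2) - 0.5
assert np.isclose(code_kl, textbook_kl, atol=1e-6)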
def svgd_gradient(X0):
    hidden, _, mse = discrim(X0)
    grad = -1.0 * T.grad(mse.sum(), X0)
    kxy, neighbors, h = rbf_kernel(hidden)  # TODO
    coff = T.exp(-T.sum((hidden[neighbors] - hidden) ** 2, axis=1) / h ** 2 / 2.0)
    v = coff.dimshuffle(0, 'x') * (-hidden[neighbors] + hidden) / h ** 2
    X1 = X0[neighbors]
    hidden1, _, _ = discrim(X1)
    dxkxy = T.Lop(hidden1, X1, v)
    # svgd_grad = (T.dot(kxy, T.flatten(grad, 2)).reshape(dxkxy.shape) + dxkxy) / T.sum(kxy, axis=1).dimshuffle(0, 'x', 'x', 'x')
    svgd_grad = grad + dxkxy / 2.
    return grad, svgd_grad, dxkxy
def metropolis_hastings_accept(energy_prev, energy_next, s_rng):
    """
    Performs a Metropolis-Hastings accept-reject move.
    Parameters
    ----------
    energy_prev: theano vector
        Symbolic theano tensor which contains the energy associated with the
        configuration at time-step t.
    energy_next: theano vector
        Symbolic theano tensor which contains the energy associated with the
        proposed configuration at time-step t+1.
    s_rng: theano.tensor.shared_randomstreams.RandomStreams
        Theano shared random stream object used to generate the random number
        used in proposal.
    Returns
    -------
    return: boolean
        True if move is accepted, False otherwise
    """
    ediff = energy_prev - energy_next
    return (TT.exp(ediff) - s_rng.uniform(size=energy_prev.shape)) >= 0
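# A NumPy version (hypothetical standalone helper) of the same accept rule:
# accept with probability min(1, exp(energy_prev - energy_next)), so a move to
# lower energy is always accepted.
import numpy as np

def mh_accept(energy_prev, energy_next, rng):
    ediff = energy_prev - energy_next
    return np.exp(ediff) - rng.uniform(size=energy_prev.shape) >= 0

rng = np.random.default_rng(0)
e_prev = np.array([1.0, 1.0, 1.0])
e_next = np.array([0.5, 1.0, 5.0])   # first two always accepted, third rarely
print(mh_accept(e_prev, e_next, rng))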
def apply_activation(self, lin_output, activation):
    if activation == 'SIGMOID':
        final_output = T.nnet.sigmoid(lin_output)
    elif activation == 'TANH':
        final_output = T.tanh(lin_output)
    elif activation == 'LINEAR':
        final_output = lin_output
    elif activation == 'ReLU':  ## rectifier linear unit
        final_output = T.maximum(0.0, lin_output)
    elif activation == 'ReSU':  ## rectifier smooth unit (softplus)
        final_output = T.log(1.0 + T.exp(lin_output))
    else:
        self.logger.critical('the input activation function: %s is not supported right now. Please modify layers.py to support' % (activation))
        raise ValueError('unsupported activation function: %s' % activation)
    return final_output
def EGD(cost, params, learning_rate=0.33, constraint=1.0):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    U = T.constant(constraint)
    # first half of params
    rw_pos = T.exp(-learning_rate * U * grads[0])
    rb_pos = T.exp(-learning_rate * U * grads[1])
    # second half
    rw_neg = 1 / rw_pos
    rb_neg = 1 / rb_pos
    rs = [rw_pos, rb_pos, rw_neg, rb_neg]
    partition = T.sum(params[0] * rs[0]) + T.sum(params[1] * rs[1]) + T.sum(params[2] * rs[2]) + T.sum(params[3] * rs[3])
    for param, r in zip(params, rs):
        updates[param] = U * param * r / partition
    return updates
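# A NumPy sketch of the same EG(+/-) update rule used in EGD above: the first
# two entries of `params` hold the positive weights/biases, the last two the
# negative copies, both halves share the gradients of the positive half, and
# the renormalisation keeps the total weight at the constraint U
# (hypothetical shapes, egd_step is an illustrative helper name).
import numpy as np

def egd_step(params, grads, learning_rate=0.33, U=1.0):
    rw_pos = np.exp(-learning_rate * U * grads[0])
    rb_pos = np.exp(-learning_rate * U * grads[1])
    rs = [rw_pos, rb_pos, 1.0 / rw_pos, 1.0 / rb_pos]
    partition = sum((p * r).sum() for p, r in zip(params, rs))
    return [U * p * r / partition for p, r in zip(params, rs)]

params = [np.array([0.2, 0.2]), np.array([0.1]), np.array([0.3, 0.1]), np.array([0.1])]
grads = [np.array([0.1, -0.2]), np.array([0.05])]
new_params = egd_step(params, grads)
print(sum(p.sum() for p in new_params))   # total weight stays at U = 1.0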
def stable_softmax(y_hat):
    """Calculate softmax and log softmax in numerically stable way
    Parameters
    ----------
    y_hat : tensor3 (input_seq_len, num_batch, num_classes+1)
        class energies
    Return
    ------
    softmax values in normal and log domain
    """
    y_hat_safe = y_hat - y_hat.max(axis=2, keepdims=True)
    y_hat_safe_exp = T.exp(y_hat_safe)
    y_hat_safe_normalizer = y_hat_safe_exp.sum(axis=2, keepdims=True)
    log_y_hat_safe_normalizer = T.log(y_hat_safe_normalizer)
    y_hat_softmax = y_hat_safe_exp / y_hat_safe_normalizer
    log_y_hat_softmax = y_hat_safe - log_y_hat_safe_normalizer
    return y_hat_softmax, log_y_hat_softmax
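# A NumPy check of the same max-subtraction trick against scipy.special
# (softmax/log_softmax, SciPy >= 1.5) on class energies large enough to
# overflow a naive exp.
import numpy as np
from scipy.special import softmax, log_softmax

y_hat = np.array([[[1000.0, 1001.0, 1002.0]]])        # (seq_len, batch, classes)
safe = y_hat - y_hat.max(axis=2, keepdims=True)
exp_safe = np.exp(safe)
norm = exp_safe.sum(axis=2, keepdims=True)
assert np.allclose(exp_safe / norm, softmax(y_hat, axis=2))
assert np.allclose(safe - np.log(norm), log_softmax(y_hat, axis=2))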
def get_output_for(self, input, **kwargs):
    distances = conv_pairwise_distance(input, self.V)
    similarities = T.exp(-distances / T.abs_(self.gamma))
    norm = T.sum(similarities, 1).reshape((similarities.shape[0], 1, similarities.shape[2], similarities.shape[3]))
    membership = similarities / (norm + self.eps)
    histogram = T.mean(membership, axis=(2, 3))
    if self.spatial_level == 1:
        pivot1, pivot2 = membership.shape[2] // 2, membership.shape[3] // 2
        h1 = T.mean(membership[:, :, :pivot1, :pivot2], axis=(2, 3))
        h2 = T.mean(membership[:, :, :pivot1, pivot2:], axis=(2, 3))
        h3 = T.mean(membership[:, :, pivot1:, :pivot2], axis=(2, 3))
        h4 = T.mean(membership[:, :, pivot1:, pivot2:], axis=(2, 3))
        # Pyramid is not used in the paper
        # histogram = T.horizontal_stack(h1, h2, h3, h4)
        histogram = T.horizontal_stack(histogram, h1, h2, h3, h4)
    return histogram
def theano_logsumexp(x, axis=None):
    """
    Compute log(sum(exp(x), axis=axis)) in a numerically stable
    fashion.
    Parameters
    ----------
    x : tensor_like
        A Theano tensor (any dimension will do).
    axis : int or symbolic integer scalar, or None
        Axis over which to perform the summation. `None`, the
        default, performs over all axes.
    Returns
    -------
    result : ndarray or scalar
        The result of the log(sum(exp(...))) operation.
    """
    xmax = x.max(axis=axis, keepdims=True)
    xmax_ = x.max(axis=axis)
    return xmax_ + T.log(T.exp(x - xmax).sum(axis=axis))
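# A NumPy counterpart of theano_logsumexp, checked against
# scipy.special.logsumexp on values that would overflow a naive exp-sum-log.
import numpy as np
from scipy.special import logsumexp

def np_logsumexp(x, axis=None):
    xmax = x.max(axis=axis, keepdims=True)
    return x.max(axis=axis) + np.log(np.exp(x - xmax).sum(axis=axis))

x = np.array([[1000.0, 1001.0], [-1000.0, 3.0]])
assert np.allclose(np_logsumexp(x, axis=1), logsumexp(x, axis=1))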