def svgd_kernel(self, h=-1):
    sq_dist = pdist(self.theta)
    pairwise_dists = squareform(sq_dist)**2
    if h < 0:  # if h < 0, using median trick
        h = np.median(pairwise_dists)
        h = np.sqrt(0.5 * h / np.log(self.theta.shape[0]+1))

    # compute the rbf kernel
    Kxy = np.exp(-pairwise_dists / h**2 / 2)

    dxkxy = -np.matmul(Kxy, self.theta)
    sumkxy = np.sum(Kxy, axis=1)
    for i in range(self.theta.shape[1]):
        dxkxy[:, i] = dxkxy[:, i] + np.multiply(self.theta[:, i], sumkxy)
    dxkxy = dxkxy / (h**2)
    return (Kxy, dxkxy)
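As a rough usage sketch (not part of the snippet above): the kernel matrix and its gradient typically feed a Stein variational gradient descent update. The method name `svgd_update`, the score function `lnprob_grad`, and the step size below are assumptions for illustration.

# Hypothetical companion method, assuming self.theta has shape (n, d) and
# lnprob_grad(theta) returns the (n, d) array of d/d(theta) log p(theta).
def svgd_update(self, lnprob_grad, stepsize=1e-2):
    Kxy, dxkxy = self.svgd_kernel(h=-1)
    # phi(x_i) = (1/n) sum_j [ k(x_j, x_i) * grad log p(x_j) + grad_{x_j} k(x_j, x_i) ]
    grad_theta = (np.matmul(Kxy, lnprob_grad(self.theta)) + dxkxy) / self.theta.shape[0]
    self.theta = self.theta + stepsize * grad_theta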
def l2normalize(layer, train_scale=True):
    W_param = layer.W
    s = W_param.get_value().shape
    if len(s) == 4:
        axes_to_sum = (1, 2, 3)
        dimshuffle_args = [0, 'x', 'x', 'x']
        k = s[0]
    else:
        axes_to_sum = 0
        dimshuffle_args = ['x', 0]
        k = s[1]
    layer.W_scale = layer.add_param(lasagne.init.Constant(1.),
                                    (k,), name="W_scale", trainable=train_scale, regularizable=False)
    layer.W = W_param * (layer.W_scale / T.sqrt(1e-6 + T.sum(T.square(W_param), axis=axes_to_sum))).dimshuffle(*dimshuffle_args)
    return layer

# fully connected layer with weight normalization
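The stray comment above heads a weight-normalized dense-layer class that is cut off in this dump. As a hedged usage sketch of l2normalize itself (layer names and sizes below are made up): the else-branch handles exactly this 2-D weight case, rescaling each column of W to unit norm times a trainable per-unit scale.

import theano.tensor as T
import lasagne

# Hypothetical 784 -> 256 dense layer whose weights get the norm constraint.
l_in = lasagne.layers.InputLayer((None, 784))
l_fc = lasagne.layers.DenseLayer(l_in, num_units=256,
                                 nonlinearity=lasagne.nonlinearities.rectify)
l_fc = l2normalize(l_fc, train_scale=True)  # l_fc.W is now W_scale * W / ||W||_2 per unit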
def nll_loss_sharedparams(self, mus, sigmas, corxy, pis, y_true):
    mus_ex = mus[np.newaxis, :, :]
    X = y_true[:, np.newaxis, :]
    diff = X - mus_ex
    diffprod = T.prod(diff, axis=-1)
    corxy2 = corxy ** 2
    diff2 = diff ** 2
    sigmas2 = sigmas ** 2
    sigmainvs = 1.0 / sigmas
    sigmainvprods = sigmainvs[:, 0] * sigmainvs[:, 1]
    diffsigma = diff2 / sigmas2
    diffsigmanorm = T.sum(diffsigma, axis=-1)
    z = diffsigmanorm - 2 * corxy * diffprod * sigmainvprods
    oneminuscorxy2inv = 1.0 / (1.0 - corxy2)
    expterm = -0.5 * z * oneminuscorxy2inv
    new_exponent = T.log(0.5 / np.pi) + T.log(sigmainvprods) + T.log(np.sqrt(oneminuscorxy2inv)) + expterm + T.log(pis)
    max_exponent = T.max(new_exponent, axis=1, keepdims=True)
    mod_exponent = new_exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=1)
    log_gauss = max_exponent + T.log(gauss_mix)
    loss = -T.mean(log_gauss)
    return loss
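For reference, the quantity assembled above is the negative log-likelihood of a mixture of bivariate Gaussians with per-component correlation, computed with the usual max-shift (log-sum-exp) for numerical stability:

\[
\log p(\mathbf{y}) = \log \sum_k \pi_k\, \frac{1}{2\pi\,\sigma_{k,1}\sigma_{k,2}\sqrt{1-\rho_k^{2}}}\,
\exp\!\left(-\frac{z_k}{2\,(1-\rho_k^{2})}\right),
\]
\[
z_k = \frac{(y_1-\mu_{k,1})^2}{\sigma_{k,1}^2} + \frac{(y_2-\mu_{k,2})^2}{\sigma_{k,2}^2}
      - \frac{2\,\rho_k\,(y_1-\mu_{k,1})(y_2-\mu_{k,2})}{\sigma_{k,1}\,\sigma_{k,2}},
\]

and the loss is the negative mean of \(\log p(\mathbf{y})\) over the batch.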
def rbf_kernel(X0):
    XY = T.dot(X0, X0.transpose())
    x2 = T.reshape(T.sum(T.square(X0), axis=1), (X0.shape[0], 1))
    X2e = T.repeat(x2, X0.shape[0], axis=1)
    H = T.sub(T.add(X2e, X2e.transpose()), 2 * XY)

    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])

    h = T.sqrt(0.5 * h / T.log(X0.shape[0].astype('float32') + 1.0)) / 2.

    Kxy = T.exp(-H / h ** 2 / 2.0)
    neighbors = T.argsort(H, axis=1)[:, 1]
    return Kxy, neighbors, h
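With n samples and H the matrix of pairwise squared distances, the bandwidth and kernel above are

\[
h = \frac{1}{2}\sqrt{\frac{\operatorname{median}(H)}{2\,\log(n+1)}},\qquad
K_{ij} = \exp\!\left(-\frac{\lVert x_i - x_j\rVert^2}{2\,h^2}\right),
\]

i.e. the median heuristic for the RBF bandwidth; `neighbors[i]` is the index of the closest other sample to \(x_i\).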
def __call__(self, X, w_temp, m_temp):
    # input dimensions
    # X:      (nb_samples, input_dim)
    # w_temp: (nb_samples, memory_dim)
    # m_temp: (nb_samples, memory_dim, memory_width) ::tensor_memory
    key = dot(X, self.W_key, self.b_key)                             # (nb_samples, memory_width)
    shift = self.softmax(
        dot(X, self.W_shift, self.b_shift))                          # (nb_samples, shift_width)
    beta = self.softplus(dot(X, self.W_beta, self.b_beta))[:, None]  # (nb_samples, x)
    gamma = self.softplus(dot(X, self.W_gama, self.b_gama)) + 1.     # (nb_samples,)
    gamma = gamma[:, None]                                           # (nb_samples, x)
    g = self.sigmoid(dot(X, self.W_g, self.b_g))[:, None]            # (nb_samples, x)

    signal = [key, shift, beta, gamma, g]

    w_c = self.softmax(
        beta * cosine_sim2d(key, m_temp))                # (nb_samples, memory_dim) // content-based addressing
    w_g = g * w_c + (1 - g) * w_temp                     # (nb_samples, memory_dim) // history interpolation
    w_s = shift_convolve2d(w_g, shift, self.shift_conv)  # (nb_samples, memory_dim) // convolutional shift
    w_p = w_s ** gamma                                   # (nb_samples, memory_dim) // sharpening
    w_t = w_p / T.sum(w_p, axis=1)[:, None]              # (nb_samples, memory_dim)
    return w_t
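Written out, the five steps correspond to the usual NTM-style addressing pipeline:

\[
w^{c} = \operatorname{softmax}\!\big(\beta\,\cos(k, M)\big),\qquad
w^{g} = g\,w^{c} + (1-g)\,w_{t-1},\qquad
w^{s} = w^{g} \ast s,\qquad
w_{t} = \frac{(w^{s})^{\gamma}}{\sum_j (w^{s}_j)^{\gamma}},
\]

i.e. content-based addressing, interpolation with the previous weights, a circular shift by the distribution \(s\), and sharpening.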
def dot(inp, matrix, bias=None):
    """
    Decide the right type of dot product depending on the input
    arguments.
    """
    if 'int' in inp.dtype and inp.ndim == 2:
        return matrix[inp.flatten()]
    elif 'int' in inp.dtype:
        return matrix[inp]
    elif 'float' in inp.dtype and inp.ndim == 3:
        shape0 = inp.shape[0]
        shape1 = inp.shape[1]
        shape2 = inp.shape[2]
        if bias is not None:  # explicit None check; avoid truth-testing a symbolic variable
            return (T.dot(inp.reshape((shape0 * shape1, shape2)), matrix) + bias).reshape((shape0, shape1, matrix.shape[1]))
        else:
            return T.dot(inp.reshape((shape0 * shape1, shape2)), matrix).reshape((shape0, shape1, matrix.shape[1]))
    else:
        if bias is not None:
            return T.dot(inp, matrix) + bias
        else:
            return T.dot(inp, matrix)

# Numerically stable log(sum(exp(A))). Can also be used in softmax function.
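The function that comment introduces is truncated in this dump; a minimal sketch of the max-shift trick it describes (the name and signature below are assumed) would be:

def logsumexp(A, axis=None):
    # Numerically stable log(sum(exp(A))): subtract the max before exponentiating.
    A_max = T.max(A, axis=axis, keepdims=True)
    return T.log(T.sum(T.exp(A - A_max), axis=axis, keepdims=True)) + A_max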
def dot_2d(k, M, b=None, g=None):
    # k: (nb_samples, memory_width)
    # M: (nb_samples, memory_dim, memory_width)

    # norms of keys and memories
    # k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5   # (nb_samples,)
    # M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5   # (nb_samples, memory_dim,)

    k = k[:, None, :]  # (nb_samples, 1, memory_width)
    value = k * M
    if b is not None:
        b = b[:, None, :]
        value *= b     # (nb_samples, memory_dim,)
    if g is not None:
        g = g[None, None, :]
        value *= g
    sim = T.sum(value, axis=2)
    return sim
def crossentropy(y_pred, y_true, void_labels, one_hot=False):
    # Clip predictions
    y_pred = T.clip(y_pred, _EPSILON, 1.0 - _EPSILON)

    if one_hot:
        y_true = T.argmax(y_true, axis=1)

    # Create mask
    mask = T.ones_like(y_true, dtype=_FLOATX)
    for el in void_labels:
        mask = T.set_subtensor(mask[T.eq(y_true, el).nonzero()], 0.)

    # Modify y_true temporarily
    y_true_tmp = y_true * mask
    y_true_tmp = y_true_tmp.astype('int32')

    # Compute cross-entropy
    loss = T.nnet.categorical_crossentropy(y_pred, y_true_tmp)

    # Compute masked mean loss
    loss *= mask
    loss = T.sum(loss) / T.sum(mask)
    return loss
def dice_loss(y_pred, y_true, void_class, class_for_dice=1):
    '''
    Dice loss -- works only for binary classes.
    y_pred is a softmax output
    y_true is one hot
    '''
    smooth = 1
    y_true_f = T.flatten(y_true[:, class_for_dice, :, :])
    y_true_f = T.cast(y_true_f, 'int32')
    y_pred_f = T.flatten(y_pred[:, class_for_dice, :, :])
    # remove void classes from dice
    if len(void_class):
        for i in range(len(void_class)):
            # get idx of non void classes and remove void classes
            # from y_true and y_pred
            idxs = T.neq(y_true_f, void_class[i]).nonzero()
            y_pred_f = y_pred_f[idxs]
            y_true_f = y_true_f[idxs]

    intersection = T.sum(y_true_f * y_pred_f)
    return -(2. * intersection + smooth) / (T.sum(y_true_f) + T.sum(y_pred_f) + smooth)
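The value returned is the negative smoothed Dice coefficient of the selected class (here with smoothing s = 1):

\[
\text{loss} = -\,\frac{2\sum_i y_i\,\hat{y}_i + s}{\sum_i y_i + \sum_i \hat{y}_i + s},
\]

where \(y\) is the one-hot ground truth for `class_for_dice`, \(\hat{y}\) the corresponding softmax channel, and void pixels have been dropped beforehand.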
def build_objective(model, deterministic=False, epsilon=1e-12):
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(nn.layers.get_output(model.l_target), 'int32')
    enable_targets = nn.layers.get_output(model.l_enable_target)
    predictions = T.clip(predictions, epsilon, 1. - epsilon)

    sum_of_objectives = 0
    unit_ptr = 0
    for obj_idx, obj_name in enumerate(order_objectives):
        n_classes = len(property_bin_borders[obj_name])
        v_obj = objective(obj_idx, (unit_ptr, unit_ptr + n_classes), predictions, targets)
        # take the mean of the objectives where it matters (enabled targets)
        obj_scalar = T.sum(enable_targets[:, obj_idx] * v_obj) / (0.00001 + T.sum(enable_targets[:, obj_idx]))
        if deterministic:
            d_objectives_deterministic[obj_name] = obj_scalar
        else:
            d_objectives[obj_name] = obj_scalar
        sum_of_objectives += T.mean(v_obj)
        unit_ptr = unit_ptr + n_classes

    return sum_of_objectives
def build_objective(model, deterministic=False, epsilon=1e-12):
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(nn.layers.get_output(model.l_target), 'int32')
    enable_targets = nn.layers.get_output(model.l_enable_target)
    predictions = T.clip(predictions, epsilon, 1. - epsilon)

    sum_of_objectives = 0
    unit_ptr = 0
    for obj_idx, obj_name in enumerate(order_objectives):
        n_classes = len(property_bin_borders[obj_name])
        v_obj = objective(obj_idx, (unit_ptr, unit_ptr + n_classes), predictions, targets)
        # take the mean of the objectives where it matters (enabled targets)
        obj_scalar = T.sum(enable_targets[:, obj_idx] * v_obj) / (0.00001 + T.sum(enable_targets[:, obj_idx]))
        if deterministic:
            d_objectives_deterministic[obj_name] = obj_scalar
        else:
            d_objectives[obj_name] = obj_scalar
        sum_of_objectives += obj_scalar
        unit_ptr = unit_ptr + n_classes

    return sum_of_objectives
def op_cosine_c(
        s_xr_, s_xi_, s_yr_, s_yi_, axis_=-1, keepdims_=True, eps_=1e-7):
    '''
    cosine between two complex vectors, uses standard complex inner product

    Args:
        s_xr_: real part of x
        s_xi_: imag part of x
        s_yr_: real part of y
        s_yi_: imag part of y
        eps_: small number to prevent divide by zero
    '''
    s_nrm = s_xr_ * s_yr_ + s_xi_ * s_yi_
    s_nx = T.sum(T.sqr(s_xr_) + T.sqr(s_xi_), axis=axis_, keepdims=keepdims_)
    s_ny = T.sum(T.sqr(s_yr_) + T.sqr(s_yi_), axis=axis_, keepdims=keepdims_)
    return T.sum(s_nrm, axis=axis_, keepdims=keepdims_) / T.sqrt(s_nx * s_ny + eps_)
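In terms of the complex inner product \(\langle x, y\rangle = \sum_d x_d\,\overline{y_d}\), the returned quantity is

\[
\frac{\operatorname{Re}\langle x, y\rangle}{\sqrt{\lVert x\rVert^2\,\lVert y\rVert^2 + \epsilon}}
= \frac{\sum_d\big(x^{r}_d y^{r}_d + x^{i}_d y^{i}_d\big)}
       {\sqrt{\sum_d\big((x^{r}_d)^2 + (x^{i}_d)^2\big)\,\sum_d\big((y^{r}_d)^2 + (y^{i}_d)^2\big) + \epsilon}}.
\]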
def sample(self, x, K):
    if x.ndim == 1:
        x = x.reshape(1, x.shape[0])
    hn = self.encode(x)
    W = self.params[0]
    ww = T.dot(W.T, W)
    samples = []
    for _ in range(K):
        s = hn * (1. - hn)
        jj = ww * s.dimshuffle(0, 'x', 1) * s.dimshuffle(0, 1, 'x')
        alpha = self.srng.normal(size=hn.shape,
                                 avg=0.,
                                 std=self.sigma,
                                 dtype=theano.config.floatX)
        delta = (alpha.dimshuffle(0, 1, 'x') * jj).sum(1)
        zn = self.decode(hn + delta)
        hn = self.encode(zn)
        # zn2 = self.decode(hn)
        samples.append(zn.eval())
    return samples
def get_cost(aes, l, eye=True):
    """Get the sum of all the reconstruction costs of the AEs.
    Input:
        aes: list. List of all the AEs.
        l: shared variable or a list of shared variables for the importance
           weights.
    """
    costs = []
    for ae, i in zip(aes, range(len(aes))):
        if isinstance(ae, ConvolutionalAutoencoder):
            costs.append(l[i] * ae.get_train_cost()[0])
        else:
            costs.append(l[i] * ae.get_train_cost(face=eye)[0])
    cost = None
    if costs not in [[], None]:
        cost = reduce(lambda x, y: x + y, costs)
    return cost
def get_eval_fn(model, in3D=False, use_dice=False):
    """Compile the evaluation function of the model."""
    if use_dice:
        insec = T.sum(model.trg * model.output, axis=1)
        tmp = 1 - 2.0 * insec / (T.sum(model.trg, axis=1) + T.sum(model.output,
                                                                  axis=1))
        error = T.mean(tmp)
    else:
        error = T.mean(T.mean(T.power(model.output - model.trg, 2), axis=1))
    if in3D:
        x = T.tensor4('x')
    else:
        x = T.fmatrix("x")
    y = T.fmatrix("y")

    theano_arg_vl = [x, y]
    output_fn_vl = [error, model.output]

    eval_fn = theano.function(
        theano_arg_vl, output_fn_vl,
        givens={model.x: x,
                model.trg: y})

    return eval_fn
def __init__(self, input_size, output_size, hidden_sizes, activation=T.nnet.sigmoid):
    self.hidden_layers = []
    self.params = []
    self.input = T.matrix('x')
    self.target = T.matrix('y')

    for i, layer_size in enumerate(hidden_sizes):
        if i == 0:
            layer_input_size = input_size
            layer_input = self.input
        else:
            layer_input_size = hidden_sizes[i - 1]
            layer_input = self.hidden_layers[-1].output
        layer = Layer(layer_input, layer_input_size, layer_size, activation=activation)
        self.hidden_layers.append(layer)
        self.params.extend(layer.params)

    self.output_layer = Layer(self.hidden_layers[-1].output, hidden_sizes[-1], output_size)
    self.params.extend(self.output_layer.params)

    self.output = self.output_layer.output
    self.cost = T.sum((self.output - self.target)**2)
def get_output_for(self, input, **kwargs):
    """
    Given 2d input, find the probability of each input under each of the
    num_units diagonal Gaussians, using the formula from
    http://mathworld.wolfram.com/BivariateNormalDistribution.html
    """
    # make sure sigma is positive and nonzero: softplus(x) lies in (0, +inf)
    sigmas = T.nnet.softplus(self.sigmas)
    sigmainvs = 1.0 / sigmas
    sigmainvprods = sigmainvs[:, 0] * sigmainvs[:, 1]
    sigmas2 = sigmas ** 2
    mus = self.mus[np.newaxis, :, :]
    X = input[:, np.newaxis, :]
    diff = (X - mus) ** 2
    diffsigma = diff / sigmas2
    diffsigmanorm = T.sum(diffsigma, axis=-1)
    expterm = T.exp(-0.5 * diffsigmanorm)
    probs = (0.5 / np.pi) * sigmainvprods * expterm
    return probs
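Each of the num_units components therefore evaluates an axis-aligned (zero-correlation) bivariate Gaussian density at the input:

\[
p_k(\mathbf{x}) = \frac{1}{2\pi\,\sigma_{k,1}\sigma_{k,2}}
\exp\!\left(-\frac{1}{2}\left[\frac{(x_1-\mu_{k,1})^2}{\sigma_{k,1}^2}
+ \frac{(x_2-\mu_{k,2})^2}{\sigma_{k,2}^2}\right]\right),
\]

with the softplus keeping every \(\sigma > 0\).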
def errors(self, y):
    """Return a float representing the number of errors in the minibatch
    over the total number of examples of the minibatch; zero-one
    loss over the size of the minibatch.

    :type y: theano.tensor.TensorType
    :param y: corresponds to a vector that gives for each example the
              correct label
    """
    # check if y has same dimension of y_pred
    if y.ndim != self.y_pred.ndim:
        raise TypeError(
            'y should have the same shape as self.y_pred',
            ('y', y.type, 'y_pred', self.y_pred.type)
        )
    # check if y is of the correct datatype
    if y.dtype.startswith('int'):
        # the T.neq operator returns a vector of 0s and 1s, where 1
        # represents a mistake in prediction
        return T.mean(T.neq(self.y_pred, y))
    else:
        return T.sum((y - self.y_pred) ** 2)
def _generate_train_model_function(self, scores):
    u = T.lvector('u')
    i = T.lvector('i')
    j = T.lvector('j')

    self.W = theano.shared(numpy.zeros((self._dim)).astype('float32'), name='W')
    self.S = theano.shared(scores, name='S')

    x_ui = T.dot(self.W, self.S[u, i, :].T)
    x_uj = T.dot(self.W, self.S[u, j, :].T)
    x_uij = x_ui - x_uj

    obj = T.sum(
        T.log(T.nnet.sigmoid(x_uij)).sum() -
        self._lambda_w * 0.5 * (self.W ** 2).sum()
    )
    cost = -obj

    g_cost_W = T.grad(cost=cost, wrt=self.W)

    updates = [
        (self.W, self.W - self._learning_rate * g_cost_W)
    ]
    self.train_model = theano.function(inputs=[u, i, j], outputs=cost, updates=updates)
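The cost being minimized is the negative of a BPR-style pairwise ranking objective with L2 regularization on W:

\[
\max_{W}\;\sum_{(u,i,j)} \ln \sigma\big(x_{ui} - x_{uj}\big) \;-\; \frac{\lambda_w}{2}\,\lVert W\rVert^2,
\qquad x_{ui} = W^{\top} S_{u,i,\cdot},
\]

optimized here by plain gradient descent on W.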
def update_opt(self, f, target, inputs, reg_coeff):
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)

    constraint_grads = theano.grad(
        f, wrt=params, disconnected_inputs='warn')
    xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])

    def Hx_plain():
        Hx_plain_splits = TT.grad(
            TT.sum([TT.sum(g * x)
                    for g, x in zip(constraint_grads, xs)]),
            wrt=params,
            disconnected_inputs='warn'
        )
        return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

    self.opt_fun = ext.lazydict(
        f_Hx_plain=lambda: ext.compile_function(
            inputs=inputs + xs,
            outputs=Hx_plain(),
            log_name="f_Hx_plain",
        ),
    )
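Hx_plain never forms the Hessian explicitly: it uses the standard trick of differentiating the scalar \(g^{\top}x\) a second time,

\[
Hx = \nabla_{\theta}\big(\nabla_{\theta} f(\theta)^{\top} x\big),
\]

so a Hessian-vector product costs only two gradient passes; the compiled `f_Hx_plain` exposes this product, typically for a conjugate-gradient solver downstream.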
def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
    old_means = old_dist_info_vars["mean"]
    old_log_stds = old_dist_info_vars["log_std"]
    new_means = new_dist_info_vars["mean"]
    new_log_stds = new_dist_info_vars["log_std"]
    """
    Compute the KL divergence of two multivariate Gaussian distributions with
    diagonal covariance matrices
    """
    old_std = TT.exp(old_log_stds)
    new_std = TT.exp(new_log_stds)
    # means: (N*A)
    # std: (N*A)
    # formula:
    # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
    # ln(\sigma_2/\sigma_1)
    numerator = TT.square(old_means - new_means) + \
        TT.square(old_std) - TT.square(new_std)
    denominator = 2 * TT.square(new_std) + 1e-8
    return TT.sum(
        numerator / denominator + new_log_stds - old_log_stds, axis=-1)
def kl(self, old_dist_info, new_dist_info):
    old_means = old_dist_info["mean"]
    old_log_stds = old_dist_info["log_std"]
    new_means = new_dist_info["mean"]
    new_log_stds = new_dist_info["log_std"]
    """
    Compute the KL divergence of two multivariate Gaussian distributions with
    diagonal covariance matrices
    """
    old_std = np.exp(old_log_stds)
    new_std = np.exp(new_log_stds)
    # means: (N*A)
    # std: (N*A)
    # formula:
    # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
    # ln(\sigma_2/\sigma_1)
    numerator = np.square(old_means - new_means) + \
        np.square(old_std) - np.square(new_std)
    denominator = 2 * np.square(new_std) + 1e-8
    return np.sum(
        numerator / denominator + new_log_stds - old_log_stds, axis=-1)
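Both the symbolic and the NumPy version implement, per dimension, the closed-form KL between univariate Gaussians (summed over dimensions for a diagonal covariance):

\[
\mathrm{KL}\!\left(\mathcal{N}(\mu_1,\sigma_1^2)\,\Vert\,\mathcal{N}(\mu_2,\sigma_2^2)\right)
= \frac{(\mu_1-\mu_2)^2 + \sigma_1^2 - \sigma_2^2}{2\,\sigma_2^2} + \ln\frac{\sigma_2}{\sigma_1},
\]

which matches the formula quoted in the comments.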
def discrim(X):
    current_input = dropout(X, 0.3)
    ### encoder ###
    cv1 = relu(dnn_conv(current_input, aew1, subsample=(1, 1), border_mode=(1, 1)))
    cv2 = relu(batchnorm(dnn_conv(cv1, aew2, subsample=(4, 4), border_mode=(2, 2)), g=aeg2, b=aeb2))
    cv3 = relu(batchnorm(dnn_conv(cv2, aew3, subsample=(1, 1), border_mode=(1, 1)), g=aeg3, b=aeb3))
    cv4 = relu(batchnorm(dnn_conv(cv3, aew4, subsample=(4, 4), border_mode=(2, 2)), g=aeg4, b=aeb4))
    cv5 = relu(batchnorm(dnn_conv(cv4, aew5, subsample=(1, 1), border_mode=(1, 1)), g=aeg5, b=aeb5))
    cv6 = relu(batchnorm(dnn_conv(cv5, aew6, subsample=(4, 4), border_mode=(0, 0)), g=aeg6, b=aeb6))

    ### decoder ###
    dv6 = relu(batchnorm(deconv(cv6, aew6, subsample=(4, 4), border_mode=(0, 0)), g=aeg6t, b=aeb6t))
    dv5 = relu(batchnorm(deconv(dv6, aew5, subsample=(1, 1), border_mode=(1, 1)), g=aeg5t, b=aeb5t))
    dv4 = relu(batchnorm(deconv(dv5, aew4, subsample=(4, 4), border_mode=(2, 2)), g=aeg4t, b=aeb4t))
    dv3 = relu(batchnorm(deconv(dv4, aew3, subsample=(1, 1), border_mode=(1, 1)), g=aeg3t, b=aeb3t))
    dv2 = relu(batchnorm(deconv(dv3, aew2, subsample=(4, 4), border_mode=(2, 2)), g=aeg2t, b=aeb2t))
    dv1 = tanh(deconv(dv2, aew1, subsample=(1, 1), border_mode=(1, 1)))

    rX = dv1

    mse = T.sqrt(T.sum(T.abs_(T.flatten(X - rX, 2)), axis=1)) + T.sqrt(T.sum(T.flatten((X - rX)**2, 2), axis=1))  # L1 and L2 loss
    return T.flatten(cv6, 2), rX, mse
def svgd_gradient(X0):
    hidden, _, mse = discrim(X0)
    grad = -1.0 * T.grad(mse.sum(), X0)

    kxy, neighbors, h = rbf_kernel(hidden)  # TODO

    coff = T.exp(-T.sum((hidden[neighbors] - hidden)**2, axis=1) / h**2 / 2.0)
    v = coff.dimshuffle(0, 'x') * (-hidden[neighbors] + hidden) / h**2

    X1 = X0[neighbors]
    hidden1, _, _ = discrim(X1)
    dxkxy = T.Lop(hidden1, X1, v)

    # svgd_grad = (T.dot(kxy, T.flatten(grad, 2)).reshape(dxkxy.shape) + dxkxy) / T.sum(kxy, axis=1).dimshuffle(0, 'x', 'x', 'x')
    svgd_grad = grad + dxkxy / 2.
    return grad, svgd_grad, dxkxy
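Roughly, the returned svgd_grad approximates the SVGD transport direction with the kernel taken in the encoder's feature space and the kernel-gradient (repulsion) term evaluated only at each sample's nearest neighbor:

\[
\phi(x) \;\approx\; \nabla_x \log p(x) \;+\; \tfrac{1}{2}\,\nabla_{x'}\,k\!\big(f(x'), f(x)\big)\Big|_{x' = \mathrm{nn}(x)},
\]

where \(f\) is the encoder, \(\nabla_x \log p\) is taken to be the negative gradient of the reconstruction error, and the vector-Jacobian product is computed with T.Lop.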
def get_recon_loss(self, idxs, sent_output):
    len_sent, len_doc_batch, n_d = sent_output.shape
    recon_layer = self.recon_layer
    padding_id = self.padding_id
    dropout = self.dropout

    # (len(sent)*len(doc)*batch) * n_e
    input_flat = idxs.ravel()
    true_recon = self.embedding_layer.recon_forward(input_flat)
    sent_output = apply_dropout(sent_output, dropout)
    pred_recon = recon_layer.forward(sent_output.reshape((len_sent * len_doc_batch, n_d)))

    # (len(sent)*len(doc)*batch)
    mask = T.cast(T.neq(input_flat, padding_id), theano.config.floatX)
    n = T.sum(mask)
    loss = T.sum((true_recon - pred_recon) ** 2, axis=1) * mask
    loss = T.sum(loss) / n
    return loss
def sample_weights(sizeX, sizeY, sparsity, scale, rng):
    """
    Initialization that fixes the largest singular value.
    """
    sizeX = int(sizeX)
    sizeY = int(sizeY)
    sparsity = numpy.minimum(sizeY, sparsity)
    values = numpy.zeros((sizeX, sizeY), dtype=theano.config.floatX)
    for dx in xrange(sizeX):
        perm = rng.permutation(sizeY)
        new_vals = rng.uniform(low=-scale, high=scale, size=(sparsity,))
        vals_norm = numpy.sqrt((new_vals**2).sum())
        new_vals = scale * new_vals / vals_norm
        values[dx, perm[:sparsity]] = new_vals
    _, v, _ = numpy.linalg.svd(values)
    values = scale * values / v[0]
    return values.astype(theano.config.floatX)