def decov_loss(xs, name='decov_loss'):
"""Decov loss as described in https://arxiv.org/pdf/1511.06068.pdf
'Reducing Overfitting in Deep Networks by Decorrelating Representations'
Args:
xs: 4-D `tensor` [batch_size, height, width, channels], input
Returns:
a `float` decov loss
"""
with tf.name_scope(name):
x = tf.reshape(xs, [int(xs.get_shape()[0]), -1])
m = tf.reduce_mean(x, 0, True)
z = tf.expand_dims(x - m, 2)
corr = tf.reduce_mean(tf.matmul(z, tf.transpose(z, perm=[0, 2, 1])), 0)
corr_frob_sqr = tf.reduce_sum(tf.square(corr))
corr_diag_sqr = tf.reduce_sum(tf.square(tf.diag_part(corr)))
loss = 0.5 * (corr_frob_sqr - corr_diag_sqr)
return loss
Python examples of tf.diag_part()
def _mix_rbf_kernel(X, Y, sigmas, wts=None):
if wts is None:
wts = [1] * len(sigmas)
XX = tf.matmul(X, X, transpose_b=True)
XY = tf.matmul(X, Y, transpose_b=True)
YY = tf.matmul(Y, Y, transpose_b=True)
X_sqnorms = tf.diag_part(XX)
Y_sqnorms = tf.diag_part(YY)
r = lambda x: tf.expand_dims(x, 0)
c = lambda x: tf.expand_dims(x, 1)
K_XX, K_XY, K_YY = 0, 0, 0
for sigma, wt in zip(sigmas, wts):
gamma = 1 / (2 * sigma**2)
K_XX += wt * tf.exp(-gamma * (-2 * XX + c(X_sqnorms) + r(X_sqnorms)))
K_XY += wt * tf.exp(-gamma * (-2 * XY + c(X_sqnorms) + r(Y_sqnorms)))
K_YY += wt * tf.exp(-gamma * (-2 * YY + c(Y_sqnorms) + r(Y_sqnorms)))
return K_XX, K_XY, K_YY, tf.reduce_sum(wts)
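A hedged usage sketch, not taken from the source project: build the mixture-of-RBF kernel blocks for two small batches and evaluate them in a session; the bandwidths in sigmas are illustrative.

# Sketch: evaluate the mixed-RBF kernel blocks (TF 1.x).
import numpy as np
import tensorflow as tf

X = tf.placeholder(tf.float32, [16, 8])
Y = tf.placeholder(tf.float32, [16, 8])
K_XX, K_XY, K_YY, wt_sum = _mix_rbf_kernel(X, Y, sigmas=[1.0, 2.0, 4.0])

with tf.Session() as sess:
    rng = np.random.RandomState(0)
    kxx, kxy, kyy = sess.run(
        [K_XX, K_XY, K_YY],
        feed_dict={X: rng.randn(16, 8), Y: rng.randn(16, 8)})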
def decov_loss(xs):
"""Decov loss as described in https://arxiv.org/pdf/1511.06068.pdf
'Reducing Overfitting in Deep Networks by Decorrelating Representations'
"""
x = tf.reshape(xs, [int(xs.get_shape()[0]), -1])
m = tf.reduce_mean(x, 0, True)
z = tf.expand_dims(x-m, 2)
corr = tf.reduce_mean(tf.matmul(z, tf.transpose(z, perm=[0,2,1])), 0)
corr_frob_sqr = tf.reduce_sum(tf.square(corr))
corr_diag_sqr = tf.reduce_sum(tf.square(tf.diag_part(corr)))
loss = 0.5*(corr_frob_sqr - corr_diag_sqr)
return loss
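A minimal usage sketch (not from any of the projects on this page): add the DeCov penalty to a task loss, weighted by a hypothetical scalar decov_weight; the embeddings, labels, logits and decov_weight names are illustrative assumptions. Note that some copies below use tf.batch_matmul, the pre-1.0 spelling of batched tf.matmul.

# Hedged sketch: wire decov_loss into a training objective (TF 1.x).
# embeddings, labels, logits and decov_weight are illustrative names,
# not taken from the snippets on this page.
import tensorflow as tf

embeddings = tf.placeholder(tf.float32, shape=[32, 128])   # batch of activations
labels = tf.placeholder(tf.int32, shape=[32])
logits = tf.layers.dense(embeddings, 10)
task_loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
decov_weight = 0.1                                          # assumed hyperparameter
total_loss = task_loss + decov_weight * decov_loss(embeddings)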
def build_graph(self, goal, critic):
self.ph_stc_diff_st =\
graph.Placeholder(np.float32, shape=(None, cfg.d), name="ph_stc_diff_st")
s_diff_normalized = tf.nn.l2_normalize(self.ph_stc_diff_st.node, dim=1)
cosine_similarity = tf.matmul(s_diff_normalized, goal.node, transpose_b=True)
cosine_similarity = tf.diag_part(cosine_similarity)
# manager's advantage (R-V): R = ri + cfg.wGAMMA * R; AdvM = R - ViM
self.ph_discounted_reward =\
graph.Placeholder(np.float32, shape=(None,), name="ph_m_discounted_reward")
advantage = self.ph_discounted_reward.node - critic.node
manager_loss = tf.reduce_sum(advantage * cosine_similarity)
return manager_loss
def test(self):
with self.test_context() as session:
for k in self.kernels:
k.initialize(session=session, force=True)
X = tf.placeholder(tf.float64, [30, self.dim])
rng = np.random.RandomState(1)
X_data = rng.randn(30, self.dim)
k1 = k.Kdiag(X)
k2 = tf.diag_part(k.K(X))
k1, k2 = session.run([k1, k2], feed_dict={X: X_data})
self.assertTrue(np.allclose(k1, k2))
def compute_upper_bound(self):
num_data = tf.cast(tf.shape(self.Y)[0], settings.float_type)
Kdiag = self.kern.Kdiag(self.X)
Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
Kuf = self.feature.Kuf(self.kern, self.X)
L = tf.cholesky(Kuu)
LB = tf.cholesky(Kuu + self.likelihood.variance ** -1.0 * tf.matmul(Kuf, Kuf, transpose_b=True))
LinvKuf = tf.matrix_triangular_solve(L, Kuf, lower=True)
# Using the Trace bound, from Titsias' presentation
c = tf.reduce_sum(Kdiag) - tf.reduce_sum(LinvKuf ** 2.0)
# Kff = self.kern.K(self.X)
# Qff = tf.matmul(Kuf, LinvKuf, transpose_a=True)
# Alternative bound on max eigenval:
# c = tf.reduce_max(tf.reduce_sum(tf.abs(Kff - Qff), 0))
corrected_noise = self.likelihood.variance + c
const = -0.5 * num_data * tf.log(2 * np.pi * self.likelihood.variance)
logdet = tf.reduce_sum(tf.log(tf.diag_part(L))) - tf.reduce_sum(tf.log(tf.diag_part(LB)))
LC = tf.cholesky(Kuu + corrected_noise ** -1.0 * tf.matmul(Kuf, Kuf, transpose_b=True))
v = tf.matrix_triangular_solve(LC, corrected_noise ** -1.0 * tf.matmul(Kuf, self.Y), lower=True)
quad = -0.5 * corrected_noise ** -1.0 * tf.reduce_sum(self.Y ** 2.0) + 0.5 * tf.reduce_sum(v ** 2.0)
return const + logdet + quad
def _create_variables(self, data, initial_means=None):
"""Initializes GMM algorithm.
Args:
data: a list of Tensors with data, each row is a new example.
initial_means: a Tensor with a matrix of means.
"""
first_shard = data[0]
# Initialize means: num_classes X 1 X dimensions.
if initial_means is not None:
self._means = tf.Variable(tf.expand_dims(initial_means, 1),
name=self.CLUSTERS_VARIABLE,
validate_shape=False, dtype=tf.float32)
else:
# Sample data randomly
self._means = tf.Variable(tf.expand_dims(
_init_clusters_random(data, self._num_classes, self._random_seed), 1),
name=self.CLUSTERS_VARIABLE,
validate_shape=False)
# Initialize covariances.
if self._covariance_type == FULL_COVARIANCE:
cov = _covariance(first_shard, False) + self._min_var
# A matrix per class, num_classes X dimensions X dimensions
covs = tf.tile(
tf.expand_dims(cov, 0), [self._num_classes, 1, 1])
elif self._covariance_type == DIAG_COVARIANCE:
cov = _covariance(first_shard, True) + self._min_var
# A diagonal per row, num_classes X dimensions.
covs = tf.tile(tf.expand_dims(tf.diag_part(cov), 0),
[self._num_classes, 1])
self._covs = tf.Variable(covs, name='clusters_covs', validate_shape=False)
# Mixture weights, representing the probability that a randomly
# selected unobservable data (in EM terms) was generated by component k.
self._alpha = tf.Variable(tf.tile([1.0 / self._num_classes],
[self._num_classes]))
def dot(self, X, Y, name='dot_op'):
with tf.name_scope(name) as scope:
dot_op = tf.diag_part(tf.matmul(X, Y, transpose_b=True))
return dot_op
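Worth noting (an editorial addition, not part of the snippet): tf.diag_part(tf.matmul(X, Y, transpose_b=True)) materializes the full N x N product only to keep its diagonal. The same row-wise dot products can be computed with a reduce_sum, avoiding the quadratic intermediate; the pattern also applies to the cosine-similarity and margin-ranking snippets on this page.

# Sketch: row-wise dot products without forming the N x N matrix (TF 1.x).
import numpy as np
import tensorflow as tf

X = tf.placeholder(tf.float32, [64, 16])
Y = tf.placeholder(tf.float32, [64, 16])
dot_via_diag = tf.diag_part(tf.matmul(X, Y, transpose_b=True))  # O(N^2) intermediate
dot_via_sum = tf.reduce_sum(X * Y, axis=1)                      # O(N) intermediate

with tf.Session() as sess:
    rng = np.random.RandomState(0)
    a, b = rng.randn(64, 16), rng.randn(64, 16)
    d1, d2 = sess.run([dot_via_diag, dot_via_sum], feed_dict={X: a, Y: b})
    assert np.allclose(d1, d2, atol=1e-4)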
def _assemble_graph(self):
self._create_placeholders()
tf.set_random_seed(self._random_seed + 1)
A_var = tf.Variable(
initial_value=tf.random_uniform(
shape=[self._emb_dim, self._vocab_dim],
minval=-1, maxval=1, seed=(self._random_seed + 2)
)
)
B_var = tf.Variable(
initial_value=tf.random_uniform(
shape=[self._emb_dim, self._vocab_dim],
minval=-1, maxval=1, seed=(self._random_seed + 3)
)
)
self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
cont_mult = tf.transpose(tf.matmul(A_var, tf.transpose(self.context_batch)))
resp_mult = tf.matmul(B_var, tf.transpose(self.response_batch))
neg_resp_mult = tf.matmul(B_var, tf.transpose(self.neg_response_batch))
pos_raw_f = tf.diag_part(tf.matmul(cont_mult, resp_mult))
neg_raw_f = tf.diag_part(tf.matmul(cont_mult, neg_resp_mult))
self.f_pos = pos_raw_f
self.f_neg = neg_raw_f
self.loss = tf.reduce_sum(tf.nn.relu(self.f_neg - self.f_pos + self._margin))
def decov_loss(xs):
"""Decov loss as described in https://arxiv.org/pdf/1511.06068.pdf
'Reducing Overfitting in Deep Networks by Decorrelating Representations'
"""
x = tf.reshape(xs, [int(xs.get_shape()[0]), -1])
m = tf.reduce_mean(x, 0, True)
z = tf.expand_dims(x-m, 2)
corr = tf.reduce_mean(tf.batch_matmul(z, tf.transpose(z, perm=[0,2,1])), 0)
corr_frob_sqr = tf.reduce_sum(tf.square(corr))
corr_diag_sqr = tf.reduce_sum(tf.square(tf.diag_part(corr)))
loss = 0.5*(corr_frob_sqr - corr_diag_sqr)
return loss
def decov_loss(xs):
"""Decov loss as described in https://arxiv.org/pdf/1511.06068.pdf
'Reducing Overfitting in Deep Networks by Decorrelating Representations'
"""
x = tf.reshape(xs, [int(xs.get_shape()[0]), -1])
m = tf.reduce_mean(x, 0, True)
z = tf.expand_dims(x - m, 2)
corr = tf.reduce_mean(tf.batch_matmul(z, tf.transpose(z, perm=[0, 2, 1])), 0)
corr_frob_sqr = tf.reduce_sum(tf.square(corr))
corr_diag_sqr = tf.reduce_sum(tf.square(tf.diag_part(corr)))
loss = 0.5 * (corr_frob_sqr - corr_diag_sqr)
return loss
def decov_loss(xs):
"""Decov loss as described in https://arxiv.org/pdf/1511.06068.pdf
'Reducing Overfitting in Deep Networks by Decorrelating Representations'
"""
x = tf.reshape(xs, [int(xs.get_shape()[0]), -1])
m = tf.reduce_mean(x, 0, True)
z = tf.expand_dims(x - m, 2)
corr = tf.reduce_mean(tf.matmul(z, tf.transpose(z, perm=[0, 2, 1])), 0)
corr_frob_sqr = tf.reduce_sum(tf.square(corr))
corr_diag_sqr = tf.reduce_sum(tf.square(tf.diag_part(corr)))
loss = 0.5 * (corr_frob_sqr - corr_diag_sqr)
return loss
def log_cholesky_det(chol):
return 2 * tf.reduce_sum(tf.log(tf.diag_part(chol)))
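A quick sanity check (an addition, not from the original repo): for a positive-definite K, 2 * sum(log(diag(chol(K)))) equals log det K, which is the identity the log-determinant terms in the GP snippets on this page rely on.

# Sketch: verify log|K| == 2 * sum(log(diag(cholesky(K)))) numerically (TF 1.x).
import numpy as np
import tensorflow as tf

rng = np.random.RandomState(0)
A = rng.randn(5, 5)
K = A.dot(A.T) + 5 * np.eye(5)           # positive definite by construction

chol = tf.cholesky(tf.constant(K))
logdet = log_cholesky_det(chol)

with tf.Session() as sess:
    assert np.allclose(sess.run(logdet), np.linalg.slogdet(K)[1])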
def test_DiagPart(self):
t = tf.diag_part(self.random(3, 3))
self.check(t)
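For reference (an editorial addition, not part of the scraped test): tf.diag_part extracts the main diagonal of a square matrix; in later TensorFlow releases the same op is exposed as tf.linalg.diag_part.

# Sketch: basic behaviour of tf.diag_part (TF 1.x).
import numpy as np
import tensorflow as tf

m = tf.constant([[1.0, 2.0],
                 [3.0, 4.0]])
d = tf.diag_part(m)                      # main diagonal: [1.0, 4.0]

with tf.Session() as sess:
    assert np.allclose(sess.run(d), [1.0, 4.0])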
def _mmd2_and_variance(K_XX, K_XY, K_YY, const_diagonal=False, biased=False):
m = tf.cast(K_XX.get_shape()[0], tf.float32) # Assumes X, Y are same shape
### Get the various sums of kernels that we'll use
# Kts drop the diagonal, but we don't need to compute them explicitly
if const_diagonal is not False:
const_diagonal = tf.cast(const_diagonal, tf.float32)
diag_X = diag_Y = const_diagonal
sum_diag_X = sum_diag_Y = m * const_diagonal
sum_diag2_X = sum_diag2_Y = m * const_diagonal**2
else:
diag_X = tf.diag_part(K_XX)
diag_Y = tf.diag_part(K_YY)
sum_diag_X = tf.reduce_sum(diag_X)
sum_diag_Y = tf.reduce_sum(diag_Y)
sum_diag2_X = sq_sum(diag_X)
sum_diag2_Y = sq_sum(diag_Y)
Kt_XX_sums = tf.reduce_sum(K_XX, 1) - diag_X
Kt_YY_sums = tf.reduce_sum(K_YY, 1) - diag_Y
K_XY_sums_0 = tf.reduce_sum(K_XY, 0)
K_XY_sums_1 = tf.reduce_sum(K_XY, 1)
Kt_XX_sum = tf.reduce_sum(Kt_XX_sums)
Kt_YY_sum = tf.reduce_sum(Kt_YY_sums)
K_XY_sum = tf.reduce_sum(K_XY_sums_0)
Kt_XX_2_sum = sq_sum(K_XX) - sum_diag2_X
Kt_YY_2_sum = sq_sum(K_YY) - sum_diag2_Y
K_XY_2_sum = sq_sum(K_XY)
if biased:
mmd2 = ((Kt_XX_sum + sum_diag_X) / (m * m)
+ (Kt_YY_sum + sum_diag_Y) / (m * m)
- 2 * K_XY_sum / (m * m))
else:
mmd2 = ((Kt_XX_sum + sum_diag_X) / (m * (m-1))
+ (Kt_YY_sum + sum_diag_Y) / (m * (m-1))
- 2 * K_XY_sum / (m * m))
var_est = (
2 / (m**2 * (m-1)**2) * (
2 * sq_sum(Kt_XX_sums) - Kt_XX_2_sum
+ 2 * sq_sum(Kt_YY_sums) - Kt_YY_2_sum)
- (4*m-6) / (m**3 * (m-1)**3) * (Kt_XX_sum**2 + Kt_YY_sum**2)
+ 4*(m-2) / (m**3 * (m-1)**2) * (
sq_sum(K_XY_sums_1) + sq_sum(K_XY_sums_0))
- 4 * (m-3) / (m**3 * (m-1)**2) * K_XY_2_sum
- (8*m - 12) / (m**5 * (m-1)) * K_XY_sum**2
+ 8 / (m**3 * (m-1)) * (
1/m * (Kt_XX_sum + Kt_YY_sum) * K_XY_sum
- dot(Kt_XX_sums, K_XY_sums_1)
- dot(Kt_YY_sums, K_XY_sums_0))
)
return mmd2, var_est
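A hedged end-to-end sketch, not from the source repo: sq_sum and dot are helpers the function above assumes (sum of squared entries and a vector dot product); with those defined, the kernel blocks from _mix_rbf_kernel can be fed straight into _mmd2_and_variance. The estimator reads the batch size from the static shape, so the placeholders use a fixed batch.

# Sketch: plug the mixed-RBF kernel blocks into the MMD^2 / variance estimator (TF 1.x).
# sq_sum and dot mirror the helpers the original repo is assumed to provide.
import numpy as np
import tensorflow as tf

def sq_sum(t):
    return tf.reduce_sum(tf.square(t))

def dot(a, b):
    return tf.reduce_sum(a * b)

X = tf.placeholder(tf.float32, [25, 8])
Y = tf.placeholder(tf.float32, [25, 8])
K_XX, K_XY, K_YY, _ = _mix_rbf_kernel(X, Y, sigmas=[1.0, 2.0, 4.0])
mmd2, var_est = _mmd2_and_variance(K_XX, K_XY, K_YY, biased=True)

with tf.Session() as sess:
    rng = np.random.RandomState(0)
    m2, v = sess.run([mmd2, var_est],
                     feed_dict={X: rng.randn(25, 8), Y: rng.randn(25, 8)})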
def _define_maximization_operation(self, num_batches):
"""Maximization operations."""
# TODO(xavigonzalvo): some of these operations could be moved to C++.
# Compute the effective number of data points assigned to component k.
with tf.control_dependencies(self._w):
points_in_k = tf.squeeze(tf.add_n(self._points_in_k), squeeze_dims=[0])
# Update alpha.
if 'w' in self._params:
final_points_in_k = points_in_k / num_batches
num_examples = tf.to_float(tf.reduce_sum(final_points_in_k))
self._alpha_op = self._alpha.assign(
final_points_in_k / (num_examples + MEPS))
else:
self._alpha_op = tf.no_op()
self._train_ops = [self._alpha_op]
# Update means.
points_in_k_expanded = tf.reshape(points_in_k,
[self._num_classes, 1, 1])
if 'm' in self._params:
self._means_op = self._means.assign(
tf.div(tf.add_n(self._w_mul_x), points_in_k_expanded + MEPS))
else:
self._means_op = tf.no_op()
# means are (num_classes x 1 x dims)
# Update covariances.
with tf.control_dependencies([self._means_op]):
b = tf.add_n(self._w_mul_x2) / (points_in_k_expanded + MEPS)
new_covs = []
for k in range(self._num_classes):
mean = self._means.ref()[k, :, :]
square_mean = tf.matmul(mean, mean, transpose_a=True)
new_cov = b[k, :, :] - square_mean + self._min_var
if self._covariance_type == FULL_COVARIANCE:
new_covs.append(tf.expand_dims(new_cov, 0))
elif self._covariance_type == DIAG_COVARIANCE:
new_covs.append(tf.expand_dims(tf.diag_part(new_cov), 0))
new_covs = tf.concat(0, new_covs)
if 'c' in self._params:
# Train operations don't need to take care of the means
# because covariances already depend on it.
with tf.control_dependencies([self._means_op, new_covs]):
self._train_ops.append(
tf.assign(self._covs, new_covs, validate_shape=False))
def _define_maximization_operation(self, num_batches):
"""Maximization operations."""
# TODO(xavigonzalvo): some of these operations could be moved to C++.
# Compute the effective number of data points assigned to component k.
with tf.control_dependencies(self._w):
points_in_k = tf.squeeze(tf.add_n(self._points_in_k), squeeze_dims=[0])
# Update alpha.
if 'w' in self._params:
final_points_in_k = points_in_k / num_batches
num_examples = tf.to_float(tf.reduce_sum(final_points_in_k))
self._alpha_op = self._alpha.assign(
final_points_in_k / (num_examples + MEPS))
else:
self._alpha_op = tf.no_op()
self._train_ops = [self._alpha_op]
# Update means.
points_in_k_expanded = tf.reshape(points_in_k,
[self._num_classes, 1, 1])
if 'm' in self._params:
self._means_op = self._means.assign(
tf.div(tf.add_n(self._w_mul_x), points_in_k_expanded + MEPS))
else:
self._means_op = tf.no_op()
# means are (num_classes x 1 x dims)
# Update covariances.
with tf.control_dependencies([self._means_op]):
b = tf.add_n(self._w_mul_x2) / (points_in_k_expanded + MEPS)
new_covs = []
for k in range(self._num_classes):
mean = self._means.value()[k, :, :]
square_mean = tf.matmul(mean, mean, transpose_a=True)
new_cov = b[k, :, :] - square_mean + self._min_var
if self._covariance_type == FULL_COVARIANCE:
new_covs.append(tf.expand_dims(new_cov, 0))
elif self._covariance_type == DIAG_COVARIANCE:
new_covs.append(tf.expand_dims(tf.diag_part(new_cov), 0))
new_covs = tf.concat(0, new_covs)
if 'c' in self._params:
# Train operations don't need to take care of the means
# because covariances already depend on it.
with tf.control_dependencies([self._means_op, new_covs]):
self._train_ops.append(
tf.assign(self._covs, new_covs, validate_shape=False))
def likelihood(self, hyp, X_batch, y_batch, monitor=False):
M = self.M
Z = self.Z
m = self.m
S = self.S
jitter = self.jitter
jitter_cov = self.jitter_cov
N = tf.shape(X_batch)[0]
logsigma_n = hyp[-1]
sigma_n = tf.exp(logsigma_n)
# Compute K_u_inv
K_u = kernel_tf(Z, Z, hyp[:-1])
L = tf.cholesky(K_u + np.eye(M)*jitter_cov)
K_u_inv = tf.matrix_triangular_solve(tf.transpose(L), tf.matrix_triangular_solve(L, np.eye(M), lower=True), lower=False)
K_u_inv_op = self.K_u_inv.assign(K_u_inv)
# Compute mu
psi = kernel_tf(Z, X_batch, hyp[:-1])
K_u_inv_m = tf.matmul(K_u_inv, m)
MU = tf.matmul(tf.transpose(psi), K_u_inv_m)
# Compute cov
Alpha = tf.matmul(K_u_inv, psi)
COV = kernel_tf(X_batch, X_batch, hyp[:-1]) - tf.matmul(tf.transpose(psi), tf.matmul(K_u_inv,psi)) + \
tf.matmul(tf.transpose(Alpha), tf.matmul(S,Alpha))
# Compute COV_inv
LL = tf.cholesky(COV + tf.eye(N, dtype=tf.float64)*sigma_n + tf.eye(N, dtype=tf.float64)*jitter)
COV_inv = tf.matrix_triangular_solve(tf.transpose(LL), tf.matrix_triangular_solve(LL, tf.eye(N, dtype=tf.float64), lower=True), lower=False)
# Compute cov(Z, X)
cov_ZX = tf.matmul(S,Alpha)
# Update m and S
alpha = tf.matmul(COV_inv, tf.transpose(cov_ZX))
m_new = m + tf.matmul(cov_ZX, tf.matmul(COV_inv, y_batch-MU))
S_new = S - tf.matmul(cov_ZX, alpha)
if monitor == False:
m_op = self.m.assign(m_new)
S_op = self.S.assign(S_new)
# Compute NLML
K_u_inv_m = tf.matmul(K_u_inv, m_new)
NLML = 0.5*tf.matmul(tf.transpose(m_new), K_u_inv_m) + tf.reduce_sum(tf.log(tf.diag_part(L))) + 0.5*np.log(2.*np.pi)*tf.cast(M, tf.float64)
train = self.optimizer.minimize(NLML)
nlml_op = self.nlml.assign(NLML[0,0])
return tf.group(*[train, m_op, S_op, nlml_op, K_u_inv_op])
def block_Lanczos(Sigma_func,B_,n_mc_smps):
"""
block Lanczos method to approx Sigma^1/2 * B, with B matrix of N(0,1)'s.
Used to generate multiple approximate large normal draws.
"""
n = tf.shape(B_)[0] #dim of the multivariate normal
s = n_mc_smps #number of samples to draw
k = tf.div(n,500) + 3 #number of Lanczos iterations
betas = tf.zeros([1,s])
alphas = tf.zeros([0,s])
D = tf.zeros([s,n,1])
B_norms = tf.norm(B_,axis=0)
D = tf.concat([D,tf.expand_dims(tf.transpose(B_/B_norms),2)],2)
def cond(j,alphas,betas,D):
return j < k+1
#TODO: use block-CG in place of Sigma
def body(j,alphas,betas,D):
d_j = tf.squeeze(tf.slice(D,[0,0,j],[-1,-1,1]))
d = Sigma_func(tf.transpose(d_j)) - (tf.slice(betas,[j-1,0],[1,-1])*
tf.transpose(tf.squeeze(tf.slice(D,[0,0,j-1],[-1,-1,1]))))
alphas = tf.concat([alphas,[tf.diag_part(tf.matmul(d_j,d))]],0)
d = d - tf.slice(alphas,[j-1,0],[1,-1])*tf.transpose(d_j)
betas = tf.concat([betas,[tf.norm(d,axis=0)]],0)
D = tf.concat([D,tf.expand_dims(tf.transpose(d/tf.slice(betas,[j,0],[1,-1])),2)],2)
return j+1,alphas,betas,D
j = tf.constant(1)
j,alphas,betas,D = tf.while_loop(cond,body,loop_vars=[j,alphas,betas,D],
shape_invariants=[j.get_shape(),tf.TensorShape([None,None]),
tf.TensorShape([None,None]),tf.TensorShape([None,None,None])])
D_ = tf.slice(D,[0,0,1],[-1,-1,k])
##TODO: replace loop
H = tf.zeros([0,k,k])
for ss in range(s):
this_beta = tf.diag(tf.squeeze(tf.slice(betas,[1,ss],[k-1,1])))
#build out tridiagonal H: alphas_1:k on main, betas_2:k on off
this_H = (tf.diag(tf.squeeze(tf.slice(alphas,[0,ss],[-1,1]))) +
tf.pad(this_beta,[[1,0],[0,1]]) +
tf.pad(this_beta,[[0,1],[1,0]]))
H = tf.concat([H,tf.expand_dims(this_H,0)],0)
E,V = tf.self_adjoint_eig(H)
E_sqrt = tf.zeros([0,k,k])
#TODO: replace loop
for ss in range(s):
#ensure positive definite
E_sqrt = tf.concat([E_sqrt,tf.expand_dims(tf.diag(tf.squeeze(tf.sqrt(tf.maximum(tf.slice(E,[ss,0],[1,-1]),1e-6)))),0)],0)
sq_H = tf.matmul(V,tf.matmul(E_sqrt,tf.transpose(V,perm=[0,2,1])))
e1 = tf.expand_dims(tf.transpose(tf.tile(tf.slice(tf.eye(k),[0,0],[-1,1]),[1,s])),2)
out = B_norms*tf.transpose(tf.squeeze(tf.matmul(D_,tf.matmul(sq_H,e1))))
return out
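For context (an editorial addition, heavily hedged): block_Lanczos approximates Sigma^{1/2} * B without ever forming the matrix square root. For small n the exact quantity it approximates can be written directly with an eigendecomposition, which makes a convenient reference when testing the routine.

# Sketch: exact Sigma^{1/2} * B via eigendecomposition, the quantity the
# block Lanczos routine above approximates for large n (TF 1.x).
import numpy as np
import tensorflow as tf

n, s = 50, 4
rng = np.random.RandomState(0)
A = rng.randn(n, n)
Sigma = tf.constant(A.dot(A.T) + n * np.eye(n))     # positive definite
B = tf.constant(rng.randn(n, s))

eigvals, eigvecs = tf.self_adjoint_eig(Sigma)
sqrt_Sigma = tf.matmul(eigvecs * tf.sqrt(eigvals),  # V * diag(sqrt(e))
                       eigvecs, transpose_b=True)   # ... * V^T
samples = tf.matmul(sqrt_Sigma, B)                  # exact Sigma^{1/2} B, shape (n, s)

with tf.Session() as sess:
    out = sess.run(samples)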