def gauss_KL(mu1, logstd1, mu2, logstd2):
    """ Returns the KL divergence between two multivariate Gaussians, one value per minibatch row.

    It assumes the covariance matrices are diagonal. All inputs have shape (n,a).
    The number of actions does not need to be known explicitly: reduce_sum sums
    over that dimension, which also produces the constant `d` offset. The trace
    term of the formula is merged with the squared mean difference because both
    share the denominator var2_na. The formula generalizes to an arbitrary
    number of actions. mu2 and logstd2 are presumably the policy before the update.

    Returns the KL divergence for each of the n rows in the minibatch; a
    reduce_mean is applied outside this function.
    """
    var1_na = tf.exp(2.*logstd1)
    var2_na = tf.exp(2.*logstd2)
    tmp_matrix = 2.*(logstd2 - logstd1) + (var1_na + tf.square(mu1-mu2))/var2_na - 1
    kl_n = tf.reduce_sum(0.5 * tmp_matrix, axis=[1])  # Don't forget the 1/2 !!
    assert_op = tf.Assert(tf.reduce_all(kl_n >= -0.0000001), [kl_n])
    with tf.control_dependencies([assert_op]):
        kl_n = tf.identity(kl_n)
    return kl_n
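For reference, the expression built above is the standard closed-form KL divergence between two diagonal-covariance Gaussians. Per minibatch row, with sigma = exp(logstd) and the sum taken over the a action dimensions, it reads:

\[
D_{\mathrm{KL}}\big(\mathcal{N}(\mu_1, \sigma_1^2 I)\,\|\,\mathcal{N}(\mu_2, \sigma_2^2 I)\big)
  = \sum_{i=1}^{a} \left[ \log\frac{\sigma_{2,i}}{\sigma_{1,i}}
    + \frac{\sigma_{1,i}^2 + (\mu_{1,i} - \mu_{2,i})^2}{2\,\sigma_{2,i}^2}
    - \frac{1}{2} \right],
\]

which is exactly 0.5 * tmp_matrix summed over axis 1; the summed constant term -a/2 is where the `d` offset mentioned in the docstring comes from.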
Python tf.control_dependencies() example source code
def tune(self, acceptance_rate, fresh_start):
    def adapt_stepsize():
        new_step = tf.assign(self.step, (1 - fresh_start) * self.step + 1)
        rate1 = tf.div(1.0, new_step + self.t0)
        new_h_bar = tf.assign(
            self.h_bar, (1 - fresh_start) * (1 - rate1) * self.h_bar +
            rate1 * (self.delta - acceptance_rate))
        log_epsilon = self.mu - tf.sqrt(new_step) / self.gamma * new_h_bar
        rate = tf.pow(new_step, -self.kappa)
        new_log_epsilon_bar = tf.assign(
            self.log_epsilon_bar,
            rate * log_epsilon + (1 - fresh_start) * (1 - rate) *
            self.log_epsilon_bar)
        with tf.control_dependencies([new_log_epsilon_bar]):
            new_log_epsilon = tf.identity(log_epsilon)
        return tf.exp(new_log_epsilon)

    c = tf.cond(self.adapt_step_size,
                adapt_stepsize,
                lambda: tf.exp(self.log_epsilon_bar))
    return c
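The branch above appears to implement the dual-averaging step-size adaptation used in NUTS-style HMC samplers (Hoffman & Gelman, 2014). Ignoring the fresh_start reset (the (1 - fresh_start) factors simply zero step, h_bar, and the smoothed log step size when a fresh start is requested), the recursion the assignments mirror is roughly:

\[
\bar{H}_t = \Big(1 - \tfrac{1}{t + t_0}\Big)\bar{H}_{t-1} + \tfrac{1}{t + t_0}\,(\delta - \alpha_t), \qquad
\log\varepsilon_t = \mu - \tfrac{\sqrt{t}}{\gamma}\,\bar{H}_t, \qquad
\log\bar{\varepsilon}_t = t^{-\kappa}\log\varepsilon_t + (1 - t^{-\kappa})\log\bar{\varepsilon}_{t-1},
\]

where alpha_t is the observed acceptance_rate and delta the target. The function returns exp(log epsilon_t) while adapting and exp(log epsilon_bar) otherwise; the control_dependencies makes sure the smoothed value is stored before the current step size is read out.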
def _adapt_mass(self, t, num_chain_dims):
    ewmv = ExponentialWeightedMovingVariance(
        self.mass_decay, self.data_shapes, num_chain_dims)
    new_mass = tf.cond(self.adapt_mass,
                       lambda: ewmv.get_updated_precision(self.q),
                       lambda: ewmv.precision())
    if not isinstance(new_mass, list):
        new_mass = [new_mass]

    # print('New mass is = {}'.format(new_mass))
    # TODO incorrect shape?
    # print('New mass={}'.format(new_mass))
    # print('q={}, NMS={}'.format(self.q[0].get_shape(),
    #                             new_mass[0].get_shape()))
    with tf.control_dependencies(new_mass):
        current_mass = tf.cond(
            tf.less(tf.to_int32(t), self.mass_collect_iters),
            lambda: [tf.ones(shape) for shape in self.data_shapes],
            lambda: new_mass)
    if not isinstance(current_mass, list):
        current_mass = [current_mass]
    return current_mass
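In HMC terms this picks the diagonal mass matrix from a running precision estimate of the position variables q, using the identity for the first mass_collect_iters iterations. Assuming precision() returns the elementwise inverse of the exponentially weighted running variance, schematically:

\[
M_t = \operatorname{diag}\big(\widehat{\mathrm{Var}}[q]\big)^{-1} \quad \text{for } t \ge \texttt{mass\_collect\_iters}, \qquad M_t = I \text{ otherwise.}
\]

The control_dependencies on new_mass forces the variance update to run even on iterations where the identity mass is returned.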
def assert_rank_at_least(tensor, k, name):
    """
    Whether the rank of `tensor` is at least k.
    :param tensor: A tensor to be checked.
    :param k: The least rank allowed.
    :param name: The name of `tensor` for error message.
    :return: The checked tensor.
    """
    static_shape = tensor.get_shape()
    shape_err_msg = '{} should have rank >= {}.'.format(name, k)
    if static_shape and (static_shape.ndims < k):
        raise ValueError(shape_err_msg)
    if not static_shape:
        _assert_shape_op = tf.assert_rank_at_least(
            tensor, k, message=shape_err_msg)
        with tf.control_dependencies([_assert_shape_op]):
            tensor = tf.identity(tensor)
    return tensor
def assert_scalar(tensor, name):
    """
    Whether the `tensor` is a scalar (0-D tensor).
    :param tensor: A tensor to be checked.
    :param name: The name of `tensor` for error message.
    :return: The checked tensor.
    """
    static_shape = tensor.get_shape()
    shape_err_msg = name + " should be a scalar (0-D tensor)."
    if static_shape and (static_shape.ndims >= 1):
        raise ValueError(shape_err_msg)
    else:
        _assert_shape_op = tf.assert_rank(tensor, 0, message=shape_err_msg)
        with tf.control_dependencies([_assert_shape_op]):
            tensor = tf.identity(tensor)
    return tensor
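A minimal usage sketch for the two helpers above, assuming TensorFlow 1.x graph mode (the placeholder names are illustrative):

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, 3], name='x')
x = assert_rank_at_least(x, 2, name='x')   # rank known statically: checked at graph-build time

y = tf.placeholder(tf.float32, shape=None, name='y')
y = assert_rank_at_least(y, 1, name='y')   # rank unknown: a runtime assert op is attached via
                                           # control_dependencies and fires when y is evaluated

lr = tf.placeholder(tf.float32, shape=[], name='lr')
lr = assert_scalar(lr, name='lr')

In both helpers the tf.identity wrapped in control_dependencies is what ties the assert to the returned tensor, so the check runs whenever the checked tensor is actually used.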
def batch_norm(x, n_out, phase_train, scope='bn', decay=0.9, eps=1e-5):
    """
    Code taken from http://stackoverflow.com/a/34634291/2267819
    """
    with tf.variable_scope(scope):
        beta = tf.get_variable(name='beta', shape=[n_out],
                               initializer=tf.constant_initializer(0.0), trainable=True)
        gamma = tf.get_variable(name='gamma', shape=[n_out],
                                initializer=tf.random_normal_initializer(1.0, 0.02), trainable=True)
        batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name='moments')
        ema = tf.train.ExponentialMovingAverage(decay=decay)

        def mean_var_with_update():
            ema_apply_op = ema.apply([batch_mean, batch_var])
            with tf.control_dependencies([ema_apply_op]):
                return tf.identity(batch_mean), tf.identity(batch_var)

        mean, var = tf.cond(phase_train,
                            mean_var_with_update,
                            lambda: (ema.average(batch_mean), ema.average(batch_var)))
        normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, eps)
    return normed
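A short usage sketch for the batch_norm helper above (TensorFlow 1.x; tensor names are illustrative):

import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 32, 32, 16])
phase_train = tf.placeholder(tf.bool, name='phase_train')
h = batch_norm(images, n_out=16, phase_train=phase_train, scope='bn1')
# Feed phase_train=True while training: the control_dependencies inside
# mean_var_with_update forces ema.apply to run, so the moving averages are
# refreshed as a side effect of using the batch statistics.
# Feed phase_train=False at test time to normalize with the accumulated averages.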
def batch_normalization(x, scope, decay=0.999, eps=1e-6, training=True):
    ndim = len(x.get_shape().as_list())
    fdim = x.get_shape().as_list()[-1]
    with tf.variable_scope(scope):
        gamma = tf.get_variable("scale", [fdim], tf.float32, tf.constant_initializer(1.0))
        beta = tf.get_variable("offset", [fdim], tf.float32, tf.constant_initializer(0.0))
        mean = tf.get_variable("mean", [fdim], tf.float32, tf.constant_initializer(0.0), trainable=False)
        var = tf.get_variable("variance", [fdim], tf.float32, tf.constant_initializer(1.0), trainable=False)
        if training:
            x_mean, x_var = tf.nn.moments(x, range(ndim - 1))
            avg_mean = tf.assign(mean, mean * decay + x_mean * (1.0 - decay))
            avg_var = tf.assign(var, var * decay + x_var * (1.0 - decay))
            with tf.control_dependencies([avg_mean, avg_var]):
                return tf.nn.batch_normalization(x, x_mean, x_var, beta, gamma, eps)
        else:
            return tf.nn.batch_normalization(x, mean, var, beta, gamma, eps)
def batch_normalization_with_mask(x, mask, scope, decay=0.999, eps=1e-6, training=True):
    ndim = len(x.get_shape().as_list())
    fdim = x.get_shape().as_list()[-1]
    with tf.variable_scope(scope):
        gamma = tf.get_variable("scale", [fdim], tf.float32, tf.constant_initializer(1.0))
        beta = tf.get_variable("offset", [fdim], tf.float32, tf.constant_initializer(0.0))
        mean = tf.get_variable("mean", [fdim], tf.float32, tf.constant_initializer(0.0), trainable=False)
        var = tf.get_variable("variance", [fdim], tf.float32, tf.constant_initializer(1.0), trainable=False)
        if training:
            x_mean, x_var = tf.nn.weighted_moments(x, range(ndim - 1), mask)
            avg_mean = tf.assign(mean, mean * decay + x_mean * (1.0 - decay))
            avg_var = tf.assign(var, var * decay + x_var * (1.0 - decay))
            with tf.control_dependencies([avg_mean, avg_var]):
                return tf.nn.batch_normalization(x, x_mean, x_var, beta, gamma, eps)
        else:
            return tf.nn.batch_normalization(x, mean, var, beta, gamma, eps)
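Note that in these two functions `training` is a plain Python bool resolved when the graph is built (unlike the `phase_train` tensor used with tf.cond earlier), so the training and inference paths are separate subgraphs. A sketch of the intended use under that assumption, with illustrative names:

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 64])
h_train = batch_normalization(x, scope='bn', training=True)      # batch stats; tf.assign updates run
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    h_test = batch_normalization(x, scope='bn', training=False)  # stored running statistics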
text_classification_train.py (project: kaggle_redefining_cancer_treatment, author: jorgemf)
def model(self, input_text_begin, input_text_end, gene, variation, batch_size,
          vocabulary_size=VOCABULARY_SIZE, embeddings_size=EMBEDDINGS_SIZE, output_classes=9):
    # embeddings
    embeddings = _load_embeddings(vocabulary_size, embeddings_size)
    # global step
    self.global_step = training_util.get_or_create_global_step()
    self.global_step = tf.assign_add(self.global_step, 1)
    # model
    with tf.control_dependencies([self.global_step]):
        with slim.arg_scope(self.text_classification_model.model_arg_scope()):
            self.outputs = self.text_classification_model.model(input_text_begin, input_text_end,
                                                                 gene, variation, output_classes,
                                                                 embeddings=embeddings,
                                                                 batch_size=batch_size,
                                                                 training=False)
    # restore only the trainable variables
    self.saver = tf.train.Saver(var_list=tf_variables.trainable_variables())
    return self.outputs
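The pattern of interest here is wrapping model construction in control_dependencies([assign_add]) so that every evaluation of the outputs also advances the step counter; only ops created inside the block pick up that dependency. A stripped-down sketch of the same idiom (TensorFlow 1.x, illustrative names):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 4])
w = tf.get_variable('w', shape=[4, 2])
step = tf.train.get_or_create_global_step()
increment = tf.assign_add(step, 1)
with tf.control_dependencies([increment]):
    # The matmul is created inside the block, so fetching `logits` also runs
    # the assign_add and bumps the global step as a side effect.
    logits = tf.matmul(x, w)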
def __call__(self, x):
    shape = x.get_shape()
    shp = self.in_dim or shape[-1]
    with tf.variable_scope(self.name) as scope:
        self.gamma = tf.get_variable("gamma", [shp],
                                     initializer=tf.random_normal_initializer(1., 0.02))
        self.beta = tf.get_variable("beta", [shp],
                                    initializer=tf.constant_initializer(0.))
        self.mean, self.variance = tf.nn.moments(x, [0, 1, 2])
        self.mean.set_shape((shp,))
        self.variance.set_shape((shp,))
        self.ema_apply_op = self.ema.apply([self.mean, self.variance])
        if self.train:
            # with tf.control_dependencies([self.ema_apply_op]):
            normalized_x = tf.nn.batch_norm_with_global_normalization(
                x, self.mean, self.variance, self.beta, self.gamma, self.epsilon,
                scale_after_normalization=True)
        else:
            normalized_x = tf.nn.batch_norm_with_global_normalization(
                x, self.ema.average(self.mean), self.ema.average(self.variance), self.beta,
                self.gamma, self.epsilon,
                scale_after_normalization=True)
        return normalized_x
def __init__(self, config, model):
    assert isinstance(model, Model)
    self.config = config
    self.model = model
    self.opt = tf.train.AdagradOptimizer(config.init_lr)
    self.loss = model.get_loss()
    self.var_list = model.get_var_list()
    self.global_step = model.get_global_step()
    self.ema_op = model.ema_op
    self.summary = model.summary
    self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
    opt_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)

    # Define train op
    with tf.control_dependencies([opt_op]):
        self.train_op = tf.group(self.ema_op)
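A detail worth noting with this idiom: tf.control_dependencies only attaches dependencies to ops created inside its block, so here it is the tf.group no-op that waits for opt_op, while ema_op itself (created elsewhere in the model) is merely grouped with it and runs in no particular order relative to the gradient step. Whether that matters depends on whether the averages should reflect the pre- or post-update weights; when the EMA must see the post-update values, the update op can be created inside the block, as in this minimal self-contained sketch (illustrative names):

import tensorflow as tf

w = tf.get_variable('w', shape=[3], initializer=tf.zeros_initializer())
loss = tf.reduce_sum(tf.square(w - 1.0))
opt_op = tf.train.AdagradOptimizer(0.1).minimize(loss)
ema = tf.train.ExponentialMovingAverage(decay=0.999)
with tf.control_dependencies([opt_op]):
    # ema.apply is created inside the block, so the shadow-variable update is
    # guaranteed to run only after the gradient step has been applied.
    train_op = ema.apply([w])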
def grad_variance(self):
    grad_var_ops = []
    tensor_to_avg = []
    for t, g in zip(self._tvars, self._grads):
        if isinstance(g, ops.IndexedSlices):
            tensor_to_avg.append(
                tf.reshape(tf.unsorted_segment_sum(
                    g.values, g.indices, g.dense_shape[0]),
                    shape=t.get_shape()))
        else:
            tensor_to_avg.append(g)
    avg_op = self._moving_averager.apply(tensor_to_avg)
    grad_var_ops.append(avg_op)
    with tf.control_dependencies([avg_op]):
        self._grad_avg = [
            self._moving_averager.average(val) for val in tensor_to_avg]
        self._grad_avg_squared = [tf.square(val) for val in self._grad_avg]
    self._grad_var = tf.maximum(
        tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype),
        self._grad_norm_squared_avg
        - tf.add_n([tf.reduce_sum(val) for val in self._grad_avg_squared]))
    if self._sparsity_debias:
        self._grad_var *= self._sparsity_avg
    return grad_var_ops
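This method (and the two that follow) appear to come from a YellowFin-style tuner. In that notation the gradient variance C is estimated as the running average of the squared gradient norm minus the squared norm of the running average gradient, with both norms taken over the concatenation of all variables:

\[
C \approx \mathbb{E}\big[\lVert g \rVert^2\big] - \big\lVert \mathbb{E}[g] \big\rVert^2
  = \texttt{\_grad\_norm\_squared\_avg} - \sum_j \operatorname{sum}\big(\texttt{\_grad\_avg}[j]^2\big),
\]

clamped below by EPS; both expectations are exponential moving averages maintained by _moving_averager.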
def dist_to_opt(self):
    dist_to_opt_ops = []
    # running average of the norm of the gradient
    self._grad_norm = tf.sqrt(self._grad_norm_squared)
    avg_op = self._moving_averager.apply([self._grad_norm, ])
    dist_to_opt_ops.append(avg_op)
    with tf.control_dependencies([avg_op]):
        self._grad_norm_avg = self._moving_averager.average(
            self._grad_norm)
        # single iteration distance estimation
        # note that self._grad_norm_avg is per variable
        self._dist_to_opt = (self._grad_norm_avg
                             / (self._grad_norm_squared_avg + EPS))
    # running average of distance
    avg_op = self._moving_averager.apply([self._dist_to_opt])
    dist_to_opt_ops.append(avg_op)
    with tf.control_dependencies([avg_op]):
        self._dist_to_opt_avg = tf.identity(
            self._moving_averager.average(self._dist_to_opt))
        if self._sparsity_debias:
            self._dist_to_opt_avg /= (tf.sqrt(self._sparsity_avg) + EPS)
    return dist_to_opt_ops
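The single-iteration distance-to-optimum estimate here is the ratio of the averaged gradient norm to the averaged squared norm, which is then smoothed again:

\[
D_t \approx \frac{\mathbb{E}\big[\lVert g \rVert\big]}{\mathbb{E}\big[\lVert g \rVert^2\big] + \epsilon},
\qquad \bar{D}_t = \mathrm{EMA}(D_t),
\]

with the control_dependencies guaranteeing that each moving-average update has run before its averaged value is read.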
def get_cubic_root(self):
    # We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
    # where x = sqrt(mu).
    # We substitute x, which is sqrt(mu), with x = y + 1.
    # It gives y^3 + py = q
    # where p = (D^2 h_min^2)/(2*C) and q = -p.
    # We use Vieta's substitution to compute the root.
    # There is only one real solution y (which is in [0, 1]).
    # http://mathworld.wolfram.com/VietasSubstitution.html
    # assert_array = \
    #     [tf.Assert(tf.logical_not(tf.is_nan(self._dist_to_opt_avg)), [self._dist_to_opt_avg, ]),
    #      tf.Assert(tf.logical_not(tf.is_nan(self._h_min)), [self._h_min, ]),
    #      tf.Assert(tf.logical_not(tf.is_nan(self._grad_var)), [self._grad_var, ]),
    #      tf.Assert(tf.logical_not(tf.is_inf(self._dist_to_opt_avg)), [self._dist_to_opt_avg, ]),
    #      tf.Assert(tf.logical_not(tf.is_inf(self._h_min)), [self._h_min, ]),
    #      tf.Assert(tf.logical_not(tf.is_inf(self._grad_var)), [self._grad_var, ])]
    # with tf.control_dependencies(assert_array):
    # EPS in the numerator prevents the momentum from being exactly one when the gradient is 0.
    p = (self._dist_to_opt_avg + EPS)**2 * (self._h_min + EPS)**2 / 2 / (self._grad_var + EPS)
    w3 = (-tf.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
    w = tf.sign(w3) * tf.pow(tf.abs(w3), 1.0/3.0)
    y = w - p / 3.0 / (w + EPS)
    x = y + 1
    return x
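The cubic being solved looks like the first-order condition of the expression in the leading comment: minimizing f(x) = x^2 D^2 + (1 - x)^4 C / h_min^2 over x = sqrt(mu) gives

\[
f'(x) = 2 x D^2 - \frac{4 (1-x)^3 C}{h_{\min}^2} = 0
\;\Longleftrightarrow\; p\,x + (x-1)^3 = 0, \qquad p = \frac{D^2 h_{\min}^2}{2C},
\]

and with y = x - 1 this is y^3 + p y + p = 0, whose single real root is recovered by Cardano/Vieta exactly as in the code: w^3 = (-p - sqrt(p^2 + 4p^3/27)) / 2, y = w - p/(3w), x = y + 1.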
def update_hyper_param(self):
    assign_hyper_ops = []
    self._mu = tf.identity(tf.cond(
        self._do_tune, lambda: self.get_mu_tensor(),
        lambda: self._mu_var))
    with tf.control_dependencies([self._mu]):
        self._lr = tf.identity(tf.cond(
            self._do_tune, lambda: self.get_lr_tensor(),
            lambda: self._lr_var))
    with tf.control_dependencies([self._mu, self._lr]):
        if self._use_unsmoothed_lr_mu:
            assign_hyper_ops.append(tf.assign(self._mu_var, self._mu))
            assign_hyper_ops.append(tf.assign(self._lr_var, self._lr))
        else:
            self._mu = self._beta * self._mu_var + (1 - self._beta) * self._mu
            self._lr = self._beta * self._lr_var + (1 - self._beta) * self._lr
            with tf.control_dependencies([self._mu, self._lr]):
                assign_hyper_ops.append(tf.assign(self._mu_var, self._mu))
                assign_hyper_ops.append(tf.assign(self._lr_var, self._lr))
    assign_hyper_op = tf.group(*assign_hyper_ops)
    return assign_hyper_op
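Unless _use_unsmoothed_lr_mu is set, the freshly tuned values are blended into the stored hyper-parameters with an exponential moving average before being assigned, i.e. roughly:

\[
\mu_{\text{var}} \leftarrow \beta\,\mu_{\text{var}} + (1-\beta)\,\mu^{*}, \qquad
\eta_{\text{var}} \leftarrow \beta\,\eta_{\text{var}} + (1-\beta)\,\eta^{*},
\]

where mu* and eta* are the values returned by get_mu_tensor / get_lr_tensor and beta is the smoothing coefficient; the nested control_dependencies enforce that tuning, smoothing, and assignment happen in that order.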
def get_output_for(self, input, phase='train', **kwargs):
    if phase == 'train':
        # Calculate the moments based on the individual batch.
        mean, variance = tf.nn.moments(input, self.axis, shift=self.moving_mean)
        # Update the moving_mean and moving_variance moments.
        update_moving_mean = moving_averages.assign_moving_average(
            self.moving_mean, mean, self.decay)
        update_moving_variance = moving_averages.assign_moving_average(
            self.moving_variance, variance, self.decay)
        # Make sure the updates are computed here.
        with tf.control_dependencies([update_moving_mean,
                                      update_moving_variance]):
            output = tf.nn.batch_normalization(
                input, mean, variance, self.beta, self.gamma, self.epsilon)
    else:
        output = tf.nn.batch_normalization(
            input, self.moving_mean, self.moving_variance, self.beta, self.gamma, self.epsilon)
    output.set_shape(self.input_shape)
    return output
def get_function_init_state(self, function_tokens):
    next_state = tf.gather(self.function_states,
                           function_tokens - (self.num_begin_tokens + self.num_control_tokens))
    assert2 = tf.Assert(tf.reduce_all(next_state >= 0), [function_tokens])
    with tf.control_dependencies([assert2]):
        return tf.identity(next_state)