def ae(x):
if nonlinearity_name == 'relu':
f = tf.nn.relu
elif nonlinearity_name == 'elu':
f = tf.nn.elu
elif nonlinearity_name == 'gelu':
# def gelu(x):
# return tf.mul(x, tf.erfc(-x / tf.sqrt(2.)) / 2.)
# f = gelu
def gelu_fast(_x):
return 0.5 * _x * (1 + tf.tanh(tf.sqrt(2 / np.pi) * (_x + 0.044715 * tf.pow(_x, 3))))
f = gelu_fast
elif nonlinearity_name == 'silu':
def silu(_x):
return _x * tf.sigmoid(_x)
f = silu
# elif nonlinearity_name == 'soi':
# def soi_map(x):
# u = tf.random_uniform(tf.shape(x))
# mask = tf.to_float(tf.less(u, (1 + tf.erf(x / tf.sqrt(2.))) / 2.))
# return tf.cond(is_training, lambda: tf.mul(mask, x),
# lambda: tf.mul(x, tf.erfc(-x / tf.sqrt(2.)) / 2.))
# f = soi_map
else:
raise NameError("Need 'relu', 'elu', 'gelu', or 'silu' for nonlinearity_name")
h1 = f(tf.matmul(x, W['1']) + b['1'])
h2 = f(tf.matmul(h1, W['2']) + b['2'])
h3 = f(tf.matmul(h2, W['3']) + b['3'])
h4 = f(tf.matmul(h3, W['4']) + b['4'])
h5 = f(tf.matmul(h4, W['5']) + b['5'])
h6 = f(tf.matmul(h5, W['6']) + b['6'])
h7 = f(tf.matmul(h6, W['7']) + b['7'])
return tf.matmul(h7, W['8']) + b['8']
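# A standalone NumPy sketch (illustrative, not part of the model above): the exact
# GELU from the commented-out definition, x * erfc(-x / sqrt(2)) / 2, versus the
# tanh approximation used by gelu_fast; the printed maximum gap is tiny (about 1e-3 or less).
import math
import numpy as np

def gelu_exact_np(x):
    return x * np.vectorize(math.erfc)(-x / math.sqrt(2.0)) / 2.0

def gelu_fast_np(x):
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x ** 3)))

xs = np.linspace(-4.0, 4.0, 101)
print(np.max(np.abs(gelu_exact_np(xs) - gelu_fast_np(xs))))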
def _smooth_l1_loss(self, bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights, sigma=1.0, dim=[1]):
sigma_2 = sigma ** 2
box_diff = bbox_pred - bbox_targets
in_box_diff = bbox_inside_weights * box_diff
abs_in_box_diff = tf.abs(in_box_diff)
smoothL1_sign = tf.stop_gradient(tf.to_float(tf.less(abs_in_box_diff, 1. / sigma_2)))
in_loss_box = tf.pow(in_box_diff, 2) * (sigma_2 / 2.) * smoothL1_sign \
+ (abs_in_box_diff - (0.5 / sigma_2)) * (1. - smoothL1_sign)
out_loss_box = bbox_outside_weights * in_loss_box
loss_box = tf.reduce_mean(tf.reduce_sum(
out_loss_box,
axis=dim
))
return loss_box
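# A minimal NumPy sketch (illustrative only) of the piecewise function the expression
# above implements: 0.5 * sigma^2 * d^2 where |d| < 1 / sigma^2, and |d| - 0.5 / sigma^2
# elsewhere.
import numpy as np

def smooth_l1_np(d, sigma=1.0):
    sigma_2 = sigma ** 2
    small = np.abs(d) < 1.0 / sigma_2
    return np.where(small, 0.5 * sigma_2 * d ** 2, np.abs(d) - 0.5 / sigma_2)

print(smooth_l1_np(np.array([-2.0, -0.5, 0.0, 0.5, 2.0])))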
def _anneal_weight(init_val, final_val, anneal_type, global_step, anneal_steps, hold_for=0., steps_div=1.,
dtype=tf.float64):
val, final, step, hold_for, anneal_steps, steps_div = (tf.cast(i, dtype) for i in
(init_val, final_val, global_step, hold_for, anneal_steps, steps_div))
step = tf.maximum(step - hold_for, 0.)
if anneal_type == 'exp':
decay_rate = tf.pow(final / val, steps_div / anneal_steps)
val = tf.train.exponential_decay(val, step, steps_div, decay_rate)
elif anneal_type == 'linear':
val = final + (val - final) * (1. - step / anneal_steps)
else:
raise NotImplementedError
anneal_weight = tf.maximum(final, val)
return anneal_weight
def _embed_sentences(self):
"""Tensorflow implementation of Simple but Tough-to-Beat Baseline"""
# Get word features
word_embeddings = self._get_embedding()
word_feats = tf.nn.embedding_lookup(word_embeddings, self.input)
# Get marginal estimates and scaling term
batch_size = tf.shape(word_feats)[0]
a = tf.pow(10.0, self._get_a_exp())
p = tf.constant(self.marginals, dtype=tf.float32, name='marginals')
q = tf.reshape(
a / (a + tf.nn.embedding_lookup(p, self.input)),
(batch_size, self.mx_len, 1)
)
# Compute initial sentence embedding
z = tf.reshape(1.0 / tf.to_float(self.input_lengths), (batch_size, 1))
S = z * tf.reduce_sum(q * word_feats, axis=1)
# Compute common component
S_centered = S - tf.reduce_mean(S, axis=0)
_, _, V = tf.svd(S_centered, full_matrices=False, compute_uv=True)
self.tf_ccx = tf.stop_gradient(tf.gather(tf.transpose(V), 0))
# Common component removal
ccx = tf.reshape(self._get_common_component(), (1, self.d))
sv = {'embeddings': word_embeddings, 'a': a, 'p': p, 'ccx': ccx}
return S - tf.matmul(S, ccx * tf.transpose(ccx)), sv
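# A standalone NumPy sketch of the same recipe (illustrative; the method above learns
# the exponent of a and estimates the common component during training): SIF-weighted
# averaging with weights a / (a + p(w)), then removal of the first principal component.
import numpy as np

def sif_embed(word_vecs, word_probs, a=1e-3):
    # word_vecs[i]: (len_i, d) word vectors; word_probs[i]: (len_i,) unigram probabilities
    S = np.stack([np.mean((a / (a + p))[:, None] * v, axis=0)
                  for v, p in zip(word_vecs, word_probs)])
    _, _, Vt = np.linalg.svd(S - S.mean(axis=0), full_matrices=False)
    u = Vt[0]                      # first principal direction (common component)
    return S - S @ np.outer(u, u)  # remove its projection from every sentence

emb = sif_embed([np.random.randn(4, 8), np.random.randn(6, 8)],
                [np.random.rand(4), np.random.rand(6)])
print(emb.shape)  # (2, 8)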
def adam_updates(params, cost_or_grads, lr=0.001, mom1=0.9, mom2=0.999):
''' Adam optimizer '''
updates = []
if type(cost_or_grads) is not list:
grads = tf.gradients(cost_or_grads, params)
else:
grads = cost_or_grads
t = tf.Variable(1., 'adam_t')
for p, g in zip(params, grads):
mg = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_mg')
if mom1 > 0:
v = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_v')
v_t = mom1 * v + (1. - mom1) * g
v_hat = v_t / (1. - tf.pow(mom1, t))
updates.append(v.assign(v_t))
else:
v_hat = g
mg_t = mom2 * mg + (1. - mom2) * tf.square(g)
mg_hat = mg_t / (1. - tf.pow(mom2, t))
g_t = v_hat / tf.sqrt(mg_hat + 1e-8)
p_t = p - lr * g_t
updates.append(mg.assign(mg_t))
updates.append(p.assign(p_t))
updates.append(t.assign_add(1))
return tf.group(*updates)
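# A hedged usage sketch (assumes TensorFlow 1.x graph mode; the toy least-squares
# problem and variable names are made up for illustration): build a loss, get the
# grouped Adam update op from the function above, and step it in a session.
import numpy as np
import tensorflow as tf

x_data = tf.constant(np.random.randn(64, 3).astype(np.float32))
y_data = tf.constant(np.random.randn(64, 1).astype(np.float32))
w = tf.Variable(tf.zeros([3, 1]))
loss = tf.reduce_mean(tf.square(tf.matmul(x_data, w) - y_data))
train_op = adam_updates([w], loss, lr=1e-2)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(100):
        sess.run(train_op)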
def tune(self, acceptance_rate, fresh_start):
def adapt_stepsize():
new_step = tf.assign(self.step, (1 - fresh_start) * self.step + 1)
rate1 = tf.div(1.0, new_step + self.t0)
new_h_bar = tf.assign(
self.h_bar, (1 - fresh_start) * (1 - rate1) * self.h_bar +
rate1 * (self.delta - acceptance_rate))
log_epsilon = self.mu - tf.sqrt(new_step) / self.gamma * new_h_bar
rate = tf.pow(new_step, -self.kappa)
new_log_epsilon_bar = tf.assign(
self.log_epsilon_bar,
rate * log_epsilon + (1 - fresh_start) * (1 - rate) *
self.log_epsilon_bar)
with tf.control_dependencies([new_log_epsilon_bar]):
new_log_epsilon = tf.identity(log_epsilon)
return tf.exp(new_log_epsilon)
c = tf.cond(self.adapt_step_size,
adapt_stepsize,
lambda: tf.exp(self.log_epsilon_bar))
return c
def update(self, x):
# x: (chain_dims data_dims)
new_t = tf.assign(self.t, self.t + 1)
weight = (1 - self.decay) / (1 - tf.pow(self.decay, new_t))
# incr: (chain_dims data_dims)
incr = [weight * (q - mean) for q, mean in zip(x, self.mean)]
# mean: (1,...,1 data_dims)
update_mean = [mean.assign_add(
tf.reduce_mean(i, axis=self.chain_axes, keep_dims=True))
for mean, i in zip(self.mean, incr)]
# var: (1,...,1 data_dims)
new_var = [
(1 - weight) * var +
tf.reduce_mean(i * (q - mean), axis=self.chain_axes,
keep_dims=True)
for var, i, q, mean in zip(self.var, incr, x, update_mean)]
update_var = [tf.assign(var, n_var)
for var, n_var in zip(self.var, new_var)]
return update_var
def __init__(self, n_features, lenscale=1.0, p=1, variational=False,
lenscale_posterior=None):
"""Create an instance of an arc cosine kernel layer."""
# Setup random weights
if variational:
kern = RBFVariational(lenscale=lenscale,
lenscale_posterior=lenscale_posterior)
else:
kern = RBF(lenscale=lenscale)
super().__init__(n_features=n_features, kernel=kern)
# Kernel order
assert isinstance(p, int) and p >= 0
if p == 0:
self.pfunc = tf.sign
elif p == 1:
self.pfunc = lambda x: x
else:
self.pfunc = lambda x: tf.pow(x, p)
# gradient_moment.py, from the probabilistic_line_search project (author: ProbabilisticNumerics)
def _MatMulGradMom(op, W, out_grad, batch_size, mom=2):
"""Computes gradient moment for a weight matrix through a MatMul operation.
Assumes ``Z=tf.matmul(A, W)``, where ``W`` is a d1xd2 weight matrix, ``A``
are the nxd1 activations of the previous layer (n being the batch size).
``out_grad`` is the gradient w.r.t. ``Z``, as computed by ``tf.gradients()``.
No transposes in the MatMul operation allowed.
Inputs:
:op: The MatMul operation
:W: The weight matrix (the tensor, not the variable)
:out_grad: The tensor of gradient w.r.t. to the output of the op
:batch_size: Batch size n (constant integer or scalar int tf.Tensor)
:mom: Integer moment desired (defaults to 2)"""
assert op.type == "MatMul"
t_a, t_b = op.get_attr("transpose_a"), op.get_attr("transpose_b")
assert W is op.inputs[1] and not t_a and not t_b
A = op.inputs[0]
out_grad_pow = tf.pow(out_grad, mom)
A_pow = tf.pow(A, mom)
return tf.multiply(batch_size, tf.matmul(A_pow, out_grad_pow, transpose_a=True))
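# A hedged usage sketch (TF1 graph mode assumed; shapes are made up): take the MatMul
# op of a toy layer, get the gradient w.r.t. its output with tf.gradients, and request
# the second gradient moment. Per the docstring, W must be the weight tensor itself.
import tensorflow as tf

A = tf.random_normal([32, 5])     # activations, n x d1
W_t = tf.random_normal([5, 4])    # weight tensor, d1 x d2
Z = tf.matmul(A, W_t)
loss = tf.reduce_sum(tf.square(Z))
(dZ,) = tf.gradients(loss, [Z])
grad_mom2 = _MatMulGradMom(Z.op, W_t, dZ, batch_size=32, mom=2)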
def get_cubic_root(self):
# We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
# where x = sqrt(mu).
# We substitute x, which is sqrt(mu), with x = y + 1.
# It gives y^3 + py = q
# where p = (D^2 h_min^2)/(2*C) and q = -p.
# We use Vieta's substitution to compute the root.
# There is only one real solution y, and x = y + 1 lies in [0, 1].
# http://mathworld.wolfram.com/VietasSubstitution.html
# assert_array = \
# [tf.Assert(tf.logical_not(tf.is_nan(self._dist_to_opt_avg) ), [self._dist_to_opt_avg,]),
# tf.Assert(tf.logical_not(tf.is_nan(self._h_min) ), [self._h_min,]),
# tf.Assert(tf.logical_not(tf.is_nan(self._grad_var) ), [self._grad_var,]),
# tf.Assert(tf.logical_not(tf.is_inf(self._dist_to_opt_avg) ), [self._dist_to_opt_avg,]),
# tf.Assert(tf.logical_not(tf.is_inf(self._h_min) ), [self._h_min,]),
# tf.Assert(tf.logical_not(tf.is_inf(self._grad_var) ), [self._grad_var,])]
# with tf.control_dependencies(assert_array):
# EPS in the numerator to prevent momentum being exactly one in case of 0 gradient
p = (self._dist_to_opt_avg + EPS)**2 * (self._h_min + EPS)**2 / 2 / (self._grad_var + EPS)
w3 = (-tf.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
w = tf.sign(w3) * tf.pow(tf.abs(w3), 1.0/3.0)
y = w - p / 3.0 / (w + EPS)
x = y + 1
return x
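# A quick NumPy check (illustrative) that the closed form above solves the cubic from
# the comments, y^3 + p*y = q with q = -p, for a positive p.
import numpy as np

p = 0.37  # any positive value; in the code p is a ratio of squared (positive) terms
w3 = (-np.sqrt(p ** 2 + 4.0 / 27.0 * p ** 3) - p) / 2.0
w = np.sign(w3) * np.abs(w3) ** (1.0 / 3.0)
y = w - p / 3.0 / w
print(y ** 3 + p * y + p)  # ~0, up to floating-point error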
def lppool(inpOp, pnorm, kH, kW, dH, dW, padding, name):
with tf.variable_scope(name):
if pnorm == 2:
pwr = tf.square(inpOp)
else:
pwr = tf.pow(inpOp, pnorm)
subsamp = tf.nn.avg_pool(pwr,
ksize=[1, kH, kW, 1],
strides=[1, dH, dW, 1],
padding=padding)
subsamp_sum = tf.multiply(subsamp, kH*kW)
if pnorm == 2:
out = tf.sqrt(subsamp_sum)
else:
out = tf.pow(subsamp_sum, 1/pnorm)
return out
def update_target_network(source_network, target_network, update_rate):
target_network_update = []
for v_source, v_target in zip(source_network.variables(), target_network.variables()):
# this is equivalent to target = (1-alpha) * target + alpha * source
update_op = v_target.assign_sub(update_rate * (v_target - v_source))
target_network_update.append(update_op)
return tf.group(*target_network_update)
# def concat_nn_input(self, input1, input2):
# return tf.concat(1, [input1, input2])
# def add_pow_values(self, values):
# return self.concat_nn_input(values, 0.01 * tf.pow(values, [2 for i in range(self.action_size)]))
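# A one-line check (illustrative) of the comment above: subtracting
# update_rate * (target - source) leaves (1 - update_rate) * target + update_rate * source.
t, s, rate = 2.0, 10.0, 0.1
assert abs((t - rate * (t - s)) - ((1 - rate) * t + rate * s)) < 1e-12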
def loss_with_spring(self):
margin = 5.0
labels_t = self.y_
labels_f = tf.subtract(1.0, self.y_, name="1-yi") # labels_ = !labels;
eucd2 = tf.pow(tf.subtract(self.o1, self.o2), 2)
eucd2 = tf.reduce_sum(eucd2, 1)
eucd = tf.sqrt(eucd2+1e-6, name="eucd")
C = tf.constant(margin, name="C")
# yi*||CNN(p1i)-CNN(p2i)||^2 + (1-yi)*max(0, C-||CNN(p1i)-CNN(p2i)||^2)
pos = tf.multiply(labels_t, eucd2, name="yi_x_eucd2")
# neg = tf.multiply(labels_f, tf.subtract(0.0,eucd2), name="yi_x_eucd2")
# neg = tf.multiply(labels_f, tf.maximum(0.0, tf.subtract(C,eucd2)), name="Nyi_x_C-eucd_xx_2")
neg = tf.multiply(labels_f, tf.pow(tf.maximum(tf.subtract(C, eucd), 0), 2), name="Nyi_x_C-eucd_xx_2")
losses = tf.add(pos, neg, name="losses")
loss = tf.reduce_mean(losses, name="loss")
return loss
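# A NumPy sketch of the commented formula, following the active line above (which puts
# the distance, not its square, inside the max): y*d^2 + (1-y)*max(0, C - d)^2.
import numpy as np

def contrastive_loss_np(y, o1, o2, margin=5.0):
    d = np.sqrt(np.sum((o1 - o2) ** 2, axis=1) + 1e-6)
    return np.mean(y * d ** 2 + (1 - y) * np.maximum(margin - d, 0.0) ** 2)

print(contrastive_loss_np(np.array([1.0, 0.0]),
                          np.random.randn(2, 4), np.random.randn(2, 4)))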
def lppool(inpOp, pnorm, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
with tf.name_scope('lppool'):
if pnorm == 2:
pwr = tf.square(inpOp)
else:
pwr = tf.pow(inpOp, pnorm)
subsamp = tf.nn.avg_pool(pwr,
ksize=[1, kH, kW, 1],
strides=[1, dH, dW, 1],
padding=padding,
name=name)
subsamp_sum = tf.multiply(subsamp, kH*kW)
if pnorm == 2:
out = tf.sqrt(subsamp_sum)
else:
out = tf.pow(subsamp_sum, 1/pnorm)
return out
def adam_updates(params, cost_or_grads, lr=0.001, mom1=0.9, mom2=0.999):
''' Adam optimizer '''
updates = []
if type(cost_or_grads) is not list:
grads = tf.gradients(cost_or_grads, params)
else:
grads = cost_or_grads
t = tf.Variable(1., 'adam_t')
for p, g in zip(params, grads):
mg = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_mg')
if mom1>0:
v = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_v')
v_t = mom1*v + (1. - mom1)*g
v_hat = v_t / (1. - tf.pow(mom1,t))
updates.append(v.assign(v_t))
else:
v_hat = g
mg_t = mom2*mg + (1. - mom2)*tf.square(g)
mg_hat = mg_t / (1. - tf.pow(mom2,t))
g_t = v_hat / tf.sqrt(mg_hat + 1e-8)
p_t = p - lr * g_t
updates.append(mg.assign(mg_t))
updates.append(p.assign(p_t))
updates.append(t.assign_add(1))
return tf.group(*updates)
def weighted_loss(y_true, y_softmax_conv, weight):
"""Compute weighted loss function per pixel.
Loss = 1/2 * (1 - p)^2 * targets * weight + 1/2 * p^2 * (1 - targets) * weight,
where p = y_softmax_conv[..., 1] is the predicted foreground probability.
Arguments:
y_true: [batch_size, depth, height, width, 1]
weight: [batch_size, depth, height, width, 1]
y_softmax_conv: [batch_size, depth, height, width, 2]
"""
y_true = tf.to_float(tf.reshape(y_true[..., 0], [-1]))
weight = tf.to_float(tf.reshape(weight[..., 0], [-1]))
y_conv = tf.to_float(tf.reshape(y_softmax_conv[..., 1], [-1]))
loss_pos = 1 / 2 * tf.pow((1 - y_conv), 2) * y_true * weight
loss_neg = 1 / 2 * tf.pow(y_conv, 2) * (1 - y_true) * weight
return tf.reduce_mean(loss_pos + loss_neg)
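# A minimal NumPy sketch of the per-pixel loss in the docstring (illustrative):
# 0.5 * (1 - p)^2 on foreground pixels and 0.5 * p^2 on background pixels, scaled by
# the weight map.
import numpy as np

def weighted_loss_np(y_true, p_fg, weight):
    loss_pos = 0.5 * (1.0 - p_fg) ** 2 * y_true * weight
    loss_neg = 0.5 * p_fg ** 2 * (1.0 - y_true) * weight
    return np.mean(loss_pos + loss_neg)

print(weighted_loss_np(np.array([1.0, 0.0]), np.array([0.8, 0.3]), np.array([1.0, 2.0])))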
def apply_update(self, optimizer, grads_and_vars):
(grads, vars) = zip(*grads_and_vars)
# Gradient clipping
if CustomTrainer.GRADIENT_CLIP in self.train_hypers:
grads, global_norm = clip_ops.clip_by_global_norm(grads,
self.train_hypers[CustomTrainer.GRADIENT_CLIP])
# Gradient noise
if CustomTrainer.GRADIENT_NOISE in self.train_hypers:
sigma_sqr = self.train_hypers[CustomTrainer.GRADIENT_NOISE]
if CustomTrainer.GRADIENT_NOISE_DECAY in self.train_hypers:
sigma_sqr /= tf.pow(1.0 + tf.to_float(self.global_step),
self.train_hypers[CustomTrainer.GRADIENT_NOISE_DECAY])
grads_tmp = []
for g in grads:
if g is not None:
noisy_grad = g + tf.sqrt(sigma_sqr)*tf.random_normal(tf.shape(g))
grads_tmp.append(noisy_grad)
else:
grads_tmp.append(g)
grads = grads_tmp
train_op = optimizer.apply_gradients(zip(grads, vars), global_step=self.global_step)
return train_op
def __init__(self, lin, lout, iniRange, graph= None):
if graph is not None:
with graph.as_default():
self.v = tf.Variable(tf.random_uniform([lin, lout], iniRange[0], iniRange[1]))
self.g = tf.Variable(tf.random_uniform([lout], -1.0,1.0))
self.pow2 = tf.fill([lin, lout],2.0)
self.v_norm = tf.sqrt(tf.reduce_sum(tf.pow(self.v, self.pow2),0))
self.tile_div = tf.tile(tf.expand_dims(tf.div(self.g, self.v_norm),0),[lin, 1])
self.w = tf.multiply(self.tile_div, self.v)
else:
self.v = tf.Variable(tf.random_uniform([lin, lout], -1/math.sqrt(lin), 1/math.sqrt(lin)))
self.g = tf.Variable(tf.random_uniform([lout], -1.0,1.0))
self.pow2 = tf.fill([lin, lout],2.0)
self.v_norm = tf.sqrt(tf.reduce_sum(tf.pow(self.v, self.pow2),0))
self.tile_div = tf.tile(tf.expand_dims(tf.div(self.g, self.v_norm),0),[lin, 1])
self.w = tf.multiply(self.tile_div, self.v)
def gauss(mean, stddev, ksize):
"""Use Tensorflow to compute a Gaussian Kernel.
Parameters
----------
mean : float
Mean of the Gaussian (e.g. 0.0).
stddev : float
Standard Deviation of the Gaussian (e.g. 1.0).
ksize : int
Size of kernel (e.g. 16).
Returns
-------
kernel : np.ndarray
Computed Gaussian Kernel using Tensorflow.
"""
g = tf.Graph()
with tf.Session(graph=g):
x = tf.linspace(-3.0, 3.0, ksize)
z = (tf.exp(tf.negative(tf.pow(x - mean, 2.0) /
(2.0 * tf.pow(stddev, 2.0)))) *
(1.0 / (stddev * tf.sqrt(2.0 * 3.1415))))
return z.eval()
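# A hedged usage sketch: evaluate a 1-D kernel with the function above and take an
# outer product to form a 2-D Gaussian kernel (a common follow-up step, not shown in
# the original snippet).
import numpy as np

kernel_1d = gauss(0.0, 1.0, ksize=16)
kernel_2d = np.outer(kernel_1d, kernel_1d)
print(kernel_1d.shape, kernel_2d.shape)  # (16,) (16, 16)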
def adam_updates(params, cost_or_grads, lr=0.001, B1=0.9, B2=0.999):
''' Adam optimizer '''
updates = []
if type(cost_or_grads) is not list:
grads = tf.gradients(cost_or_grads, params)
else:
grads = cost_or_grads
t = tf.Variable(1., 'adam_t')
for p, g in zip(params, grads):
v = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_v')
if B1>0:
m = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_m')
m_t = B1*m + (1. - B1)*g
m_hat = m_t / (1. - tf.pow(B1,t))
updates.append(m.assign(m_t))
else:
m_hat = g
v_t = B2*v + (1. - B2)*tf.square(g)
v_hat = v_t / (1. - tf.pow(B2,t))
g_t = m_hat / tf.sqrt(v_hat + 1e-8)
p_t = p - lr * g_t
updates.append(v.assign(v_t))
updates.append(p.assign(p_t))
updates.append(t.assign_add(1))
return tf.group(*updates)
def address(M0, w0, head):
# Content focusing
# Compute cosine similarity
key = tf.expand_dims(head["key"], 1)
key_matches = tf.matmul(key, tf.transpose(M0, [0, 2, 1]))
key_matches = tf.squeeze(key_matches)
key_mag = tf.expand_dims(NTMCell.magnitude(head["key"], 1), 1)
M_col_mag = NTMCell.magnitude(M0, 2)
cosine_sim = key_matches / (key_mag * M_col_mag)
# Compute content weights
wc = tf.nn.softmax(head["key_str"] * cosine_sim)
# Location focusing
wg = head["interp"] * wc + (1 - head["interp"]) * w0
ws = rotate.ntm_rotate(wg, head["shift"])
ws_pow = tf.pow(ws, head["sharp"])
w1 = ws_pow / tf.reduce_sum(ws_pow, 1, keep_dims=True)
return w1
def build_model(self):
self.input_y = tf.placeholder(tf.float32, [None,self.num_class], name="input_y") # 1*1, 1doc
self.one_hot = tf.reshape(tf.cast(tf.one_hot(tf.cast(self.input_y, tf.int32), 2,0,1), tf.float32), [-1,2])
self.recon_loss = -tf.reduce_sum(tf.log(0.0001 + tf.gather(self.p_xi_h, self.x_id)))
self.KL = -0.5 * tf.reduce_sum(1.0 + self.hlogvar - tf.pow(self.hmean, 2)\
- tf.exp(self.hlogvar), reduction_indices = 1)
self.loss = tf.reduce_mean(0.0001 * self.KL + self.recon_loss)
self.optimizer = tf.train.AdamOptimizer(self.learning_rate,0.9)
self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
self.capped_gvs = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in self.grads_and_vars]
self.train_op = self.optimizer.apply_gradients(self.capped_gvs)
#self.optimizer = tf.train.AdamOptimizer(self.learning_rate,beta1=0.9).minimize(self.loss)
self.init = tf.global_variables_initializer()
self.sess.run(self.init)
def get_total_variation(x, shape):
with tf.name_scope('get_total_variation'):
# Get the dimensions of the variable image
height = shape[1]
width = shape[2]
size = reduce(lambda a, b: a * b, shape) ** 2
# Disjoin the variable image and evaluate the total variation
x_cropped = x[:, :height - 1, :width - 1, :]
left_term = tf.square(x[:, 1:, :width - 1, :] - x_cropped)
right_term = tf.square(x[:, :height - 1, 1:, :] - x_cropped)
smoothed_terms = tf.pow(left_term + right_term, TOTAL_VARIATION_SMOOTHING / 2.)
return tf.reduce_sum(smoothed_terms) / size
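# A NumPy sketch of the same quantity (illustrative; TOTAL_VARIATION_SMOOTHING is a
# module-level constant in the original file, exposed here as a parameter).
import numpy as np

def total_variation_np(x, smoothing=2.0):
    # x: (batch, height, width, channels)
    dx = x[:, 1:, :-1, :] - x[:, :-1, :-1, :]
    dy = x[:, :-1, 1:, :] - x[:, :-1, :-1, :]
    size = float(x.size) ** 2
    return np.sum((dx ** 2 + dy ** 2) ** (smoothing / 2.0)) / size

print(total_variation_np(np.random.rand(1, 8, 8, 3)))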
# Parse arguments and assign them to their respective global variables
def lppool(inpOp, pnorm, kH, kW, dH, dW, padding, name):
with tf.variable_scope(name):
if pnorm == 2:
pwr = tf.square(inpOp)
else:
pwr = tf.pow(inpOp, pnorm)
subsamp = tf.nn.avg_pool(pwr,
ksize=[1, kH, kW, 1],
strides=[1, dH, dW, 1],
padding=padding)
subsamp_sum = tf.multiply(subsamp, kH*kW)
if pnorm == 2:
out = tf.sqrt(subsamp_sum)
else:
out = tf.pow(subsamp_sum, 1/pnorm)
return out
def chi2(exp, obs):
"""
Compute the chi-squared statistic over elements with non-zero expected values.
"""
zero = tf.constant(0, dtype=tf.float32)
mask = tf.not_equal(exp, zero)
def masking(tensor, mask):
return tf.boolean_mask(tensor, mask)
stat = tf.reduce_sum(
tf.div(
tf.pow(
tf.subtract(masking(obs, mask), masking(exp, mask)),
2),
masking(exp, mask)),
name="chi2_statistics")
return stat
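# A hedged usage sketch (TF1 session API assumed): chi-squared statistic of observed
# vs. expected counts, with the zero-expectation bin masked out.
import tensorflow as tf

exp = tf.constant([10.0, 20.0, 0.0, 30.0])
obs = tf.constant([12.0, 18.0, 5.0, 30.0])
stat = chi2(exp, obs)
with tf.Session() as sess:
    print(sess.run(stat))  # (12-10)^2/10 + (18-20)^2/20 + (30-30)^2/30 = 0.6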
def _apply_dense(self, grad, var):
lr = (self._lr_t *
math_ops.sqrt(1 - self._beta2_power)
/ (1 - self._beta1_power))
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, "m")
m_scaled_g_values = grad * (1 - self._beta1_t)
m_t = m * self._beta1_t
m_t = m_t + m_scaled_g_values
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, "v")
v_scaled_g_values = tf.pow(grad, 2) * (1 - self._beta2_t)
v_t = v * self._beta2_t
v_t = v_t + v_scaled_g_values
v_sqrt = tf.pow(v_t, self._pow_t)
var_update = state_ops.assign_sub(var,
lr * m_t / (v_sqrt + self._epsilon_t),
use_locking=self._use_locking)
# regularization
var_update = state_ops.assign_sub(var_update,
self._dense_regularization * var,
use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t])
def scaled_dot_product_attention_simple(q, k, v, bias, name=None):
"""scaled dot-product attention. One head. One spatial dimension.
Args:
q: a Tensor with shape [batch, length_q, depth_k]
k: a Tensor with shape [batch, length_kv, depth_k]
v: a Tensor with shape [batch, length_kv, depth_v]
bias: optional Tensor broadcastable to [batch, length_q, length_kv]
name: an optional string
Returns:
A Tensor.
"""
with tf.variable_scope(
name, default_name="scaled_dot_product_attention_simple"):
scalar = tf.rsqrt(tf.to_float(common_layers.shape_list(q)[2]))
logits = tf.matmul(q * scalar, k, transpose_b=True)
if bias is not None:
logits += bias
weights = tf.nn.softmax(logits, name="attention_weights")
tf.summary.image(
"attention", tf.expand_dims(tf.pow(weights, 0.2), 3), max_outputs=1)
return tf.matmul(weights, v)
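# A standalone NumPy sketch of the same computation, softmax(Q K^T / sqrt(d_k)) V,
# for illustration; the function above additionally logs an attention image summary.
import numpy as np

def sdpa_np(q, k, v):
    logits = q @ k.transpose(0, 2, 1) / np.sqrt(q.shape[-1])
    weights = np.exp(logits - logits.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ v

out = sdpa_np(np.random.randn(2, 5, 8), np.random.randn(2, 7, 8), np.random.randn(2, 7, 4))
print(out.shape)  # (2, 5, 4)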