def layer_normalization(self,x):
"""
x should be:[batch_size,sequence_length,d_model]
:return:
"""
filter=x.get_shape()[-1] #last dimension of x. e.g. 512
print("layer_normalization:==================>variable_scope:","layer_normalization"+str(self.layer_index)+self.type)
with tf.variable_scope("layer_normalization"+str(self.layer_index)+self.type):
# 1. normalize input by using mean and variance according to last dimension
mean=tf.reduce_mean(x,axis=-1,keep_dims=True) #[batch_size,sequence_length,1]
variance=tf.reduce_mean(tf.square(x-mean),axis=-1,keep_dims=True) #[batch_size,sequence_length,1]
norm_x=(x-mean)*tf.rsqrt(variance+1e-6) #[batch_size,sequence_length,d_model]
# 2. re-scale normalized input back
scale=tf.get_variable("layer_norm_scale",[filter],initializer=tf.ones_initializer) #[filter]
bias=tf.get_variable("layer_norm_bias",[filter],initializer=tf.zeros_initializer) #[filter]
output=norm_x*scale+bias #[batch_size,sequence_length,d_model]
return output #[batch_size,sequence_length,d_model]
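For reference, this is a minimal NumPy sketch of the same normalize-then-rescale computation (illustrative only, not part of the original file; the helper name and shapes are made up):

import numpy as np

def layer_norm_np(x, scale, bias, eps=1e-6):
    # x: [batch_size, sequence_length, d_model]; scale/bias: [d_model]
    mean = x.mean(axis=-1, keepdims=True)
    variance = ((x - mean) ** 2).mean(axis=-1, keepdims=True)
    norm_x = (x - mean) / np.sqrt(variance + eps)  # equivalent to (x - mean) * rsqrt(var + eps)
    return norm_x * scale + bias

x = np.random.randn(2, 5, 512).astype(np.float32)
y = layer_norm_np(x, scale=np.ones(512), bias=np.zeros(512))
print(y.mean(axis=-1).round(4), y.std(axis=-1).round(4))  # roughly 0 and 1 at every position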
Python tf.rsqrt() usage examples
Source: a2_layer_norm_residual_conn.py (project: text_classification, author: brightmart)
Source: p1_HierarchicalAttention_model_transformer.py (project: text_classification, author: brightmart)
def layer_normalization(self,x,scope):
"""
x should be:[batch_size,sequence_length,d_model]
:return:[batch_size,sequence_length,d_model]
"""
filter=x.get_shape()[-1] #last dimension of x. e.g. 512
with tf.variable_scope("layer_normalization"+scope):
# 1. normalize input by using mean and variance according to last dimension
mean=tf.reduce_mean(x,axis=-1,keep_dims=True) #[batch_size,sequence_length,1]
variance=tf.reduce_mean(tf.square(x-mean),axis=-1,keep_dims=True) #[batch_size,sequence_length,1]
norm_x=(x-mean)*tf.rsqrt(variance+1e-6) #[batch_size,sequence_length,d_model]
# 2. re-scale normalized input back
scale=tf.get_variable("layer_norm_scale",[filter],initializer=tf.ones_initializer) #[filter]
bias=tf.get_variable("layer_norm_bias",[filter],initializer=tf.zeros_initializer) #[filter]
output=norm_x*scale+bias #[batch_size,sequence_length,d_model]
return output #[batch_size,sequence_length,d_model]
def layer_norm_all(h, base, num_units, scope):
# Layer Norm (faster version)
#
# Performs layer norm on multiple bases at once (i.e., i, g, j, o for an LSTM)
#
# Reshapes h to perform layer norm in parallel
with tf.variable_scope(scope):
h_reshape = tf.reshape(h, [-1, base, num_units])
mean = tf.reduce_mean(h_reshape, [2], keep_dims=True)
var = tf.reduce_mean(tf.square(h_reshape - mean), [2], keep_dims=True)
epsilon = tf.constant(1e-3)
rstd = tf.rsqrt(var + epsilon)
h_reshape = (h_reshape - mean) * rstd
# reshape back to original
h = tf.reshape(h_reshape, [-1, base * num_units])
alpha = tf.get_variable('layer_norm_alpha', [4 * num_units],
initializer=tf.constant_initializer(1.0), dtype=tf.float32)
bias = tf.get_variable('layer_norm_bias', [4 * num_units],
initializer=tf.constant_initializer(0.0), dtype=tf.float32)
return (h * alpha) + bias
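The reshape trick used above can be checked outside the graph; this NumPy sketch (illustrative only, sizes are made up) normalizes the four LSTM gate blocks of a concatenated pre-activation vector in parallel:

import numpy as np

batch, base, num_units = 3, 4, 8
h = np.random.randn(batch, base * num_units)         # concatenated i, g, j, o pre-activations
h_reshape = h.reshape(-1, base, num_units)           # one row per gate block
mean = h_reshape.mean(axis=2, keepdims=True)
var = ((h_reshape - mean) ** 2).mean(axis=2, keepdims=True)
h_reshape = (h_reshape - mean) / np.sqrt(var + 1e-3)
h_out = h_reshape.reshape(-1, base * num_units)      # alpha and bias would be applied here
print(h_out.shape)                                   # (3, 32)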
def diet_expert(x, hidden_size, params):
"""A two-layer feed-forward network with relu activation on hidden layer.
Uses diet variables.
Recomputes the hidden layer on backprop to save activation memory.
Args:
x: a Tensor with shape [batch, io_size]
hidden_size: an integer
params: a diet variable HParams object.
Returns:
a Tensor with shape [batch, io_size]
"""
@fn_with_diet_vars(params)
def diet_expert_internal(x):
dim = x.get_shape().as_list()[-1]
h = tf.layers.dense(
x, hidden_size, activation=tf.nn.relu, use_bias=False)
y = tf.layers.dense(h, dim, use_bias=False)
y *= tf.rsqrt(tf.to_float(dim * hidden_size))
return y
return diet_expert_internal(x)
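Setting the diet-variable machinery aside, the core computation is a two-layer feed-forward block whose output is scaled by rsqrt(dim * hidden_size); a rough NumPy equivalent (illustrative only, with hypothetical weight arguments w1 and w2):

import numpy as np

def diet_expert_sketch(x, w1, w2):
    # x: [batch, io_size], w1: [io_size, hidden_size], w2: [hidden_size, io_size]
    h = np.maximum(x @ w1, 0.0)            # dense + relu, no bias
    y = h @ w2                             # dense back to io_size, no bias
    dim, hidden_size = w2.shape[1], w1.shape[1]
    return y / np.sqrt(dim * hidden_size)  # same scaling as tf.rsqrt(tf.to_float(dim * hidden_size))

x = np.random.randn(2, 16)
w1, w2 = np.random.randn(16, 32), np.random.randn(32, 16)
print(diet_expert_sketch(x, w1, w2).shape)  # (2, 16)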
Source: one_shot_learning_network.py (project: MatchingNetworks, author: AntreasAntoniou)
def __call__(self, support_set, input_image, name, training=False):
"""
This module calculates the cosine distance between each of the support set embeddings and the target
image embeddings.
:param support_set: The embeddings of the support set images, tensor of shape [sequence_length, batch_size, 64]
:param input_image: The embedding of the target image, tensor of shape [batch_size, 64]
:param name: Name of the op to appear on the graph
:param training: Flag indicating training or evaluation (True/False)
:return: A tensor of cosine similarities with shape [batch_size, sequence_length]
"""
with tf.name_scope('distance-module' + name), tf.variable_scope('distance-module', reuse=self.reuse):
eps = 1e-10
similarities = []
for support_image in tf.unstack(support_set, axis=0):
sum_support = tf.reduce_sum(tf.square(support_image), 1, keep_dims=True)
support_magnitude = tf.rsqrt(tf.clip_by_value(sum_support, eps, float("inf")))
dot_product = tf.matmul(tf.expand_dims(input_image, 1), tf.expand_dims(support_image, 2))
dot_product = tf.squeeze(dot_product, [1, ])
cosine_similarity = dot_product * support_magnitude
similarities.append(cosine_similarity)
similarities = tf.concat(axis=1, values=similarities)
self.variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='distance-module')
return similarities
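Per support image, the loop computes a dot product between the target and support embeddings, scaled by the reciprocal norm of the support vector only; a NumPy sketch of one loop iteration (illustrative only, shapes made up):

import numpy as np

eps = 1e-10
support_image = np.random.randn(4, 64)   # [batch_size, 64]
input_image = np.random.randn(4, 64)     # [batch_size, 64]
sum_support = np.sum(support_image ** 2, axis=1, keepdims=True)
support_magnitude = 1.0 / np.sqrt(np.clip(sum_support, eps, np.inf))     # rsqrt with clipping
dot_product = np.sum(input_image * support_image, axis=1, keepdims=True)
cosine_similarity = dot_product * support_magnitude                      # note: the target norm is not divided out here
print(cosine_similarity.shape)                                           # (4, 1)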
def instance_norm(x,
shift=True,
scale=True,
eps=1e-3,
scope=None,
reuse=None):
# Expect a 4-D Tensor
C = x._shape_as_list()[-1]
with tf.variable_scope(scope, 'instance_norm', reuse=reuse):
# Get mean and variance, normalize input
m, v = tf.nn.moments(x, [1, 2], keep_dims=True)
output = (x - m) * tf.rsqrt(v + eps)
if scale:
output *= tf.get_variable('gamma', C, initializer=tf.ones_initializer)
if shift:
output += tf.get_variable('beta', C, initializer=tf.zeros_initializer)
return output
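Instance norm computes statistics per example and per channel over the spatial axes; an illustrative NumPy version of the normalization step above (gamma and beta omitted):

import numpy as np

x = np.random.randn(2, 16, 16, 3)                     # NHWC
m = x.mean(axis=(1, 2), keepdims=True)                # per-example, per-channel mean
v = ((x - m) ** 2).mean(axis=(1, 2), keepdims=True)   # per-example, per-channel variance
out = (x - m) / np.sqrt(v + 1e-3)                     # gamma/beta would rescale and shift this
print(out.shape)                                      # (2, 16, 16, 3)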
def scaled_dot_product_attention_simple(q, k, v, bias, name=None):
"""scaled dot-product attention. One head. One spatial dimension.
Args:
q: a Tensor with shape [batch, length_q, depth_k]
k: a Tensor with shape [batch, length_kv, depth_k]
v: a Tensor with shape [batch, length_kv, depth_v]
bias: optional Tensor broadcastable to [batch, length_q, length_kv]
name: an optional string
Returns:
A Tensor.
"""
with tf.variable_scope(
name, default_name="scaled_dot_product_attention_simple"):
scalar = tf.rsqrt(tf.to_float(common_layers.shape_list(q)[2]))
logits = tf.matmul(q * scalar, k, transpose_b=True)
if bias is not None:
logits += bias
weights = tf.nn.softmax(logits, name="attention_weights")
tf.summary.image(
"attention", tf.expand_dims(tf.pow(weights, 0.2), 3), max_outputs=1)
return tf.matmul(weights, v)
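The same scaled dot-product attention written out in NumPy for a single head (illustrative sketch; the softmax helper and sizes are made up):

import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

batch, length_q, length_kv, depth = 2, 3, 5, 4
q = np.random.randn(batch, length_q, depth)
k = np.random.randn(batch, length_kv, depth)
v = np.random.randn(batch, length_kv, depth)
logits = (q / np.sqrt(depth)) @ np.transpose(k, (0, 2, 1))  # q * rsqrt(depth_k), then matmul with k transposed
weights = softmax(logits)                                   # [batch, length_q, length_kv]
out = weights @ v                                           # [batch, length_q, depth]
print(out.shape)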
def norm(x):
return x * tf.rsqrt(tf.reduce_mean(tf.square(x), keep_dims=True))
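With no axis argument, tf.reduce_mean reduces over every dimension, so this is a global RMS normalization; an illustrative NumPy check:

import numpy as np

x = np.random.randn(4, 8)
y = x / np.sqrt(np.mean(x ** 2, keepdims=True))  # divide by the RMS of all elements
print(np.sqrt(np.mean(y ** 2)))                  # ~1.0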
def call(self,inputs):
"""
inputs is a list containing the support-set embeddings, with the target embedding as the second-to-last element
and the one-hot support-set labels as the last element.
"""
similarities = []
targetembedding = inputs[-2] # embedding of the query image
numsupportset = len(inputs)-2
for ii in range(numsupportset):
supportembedding = inputs[ii] # embedding for i^{th} member in the support set
sum_support = tf.reduce_sum(tf.square(supportembedding), 1, keep_dims=True)
supportmagnitude = tf.rsqrt(tf.clip_by_value(sum_support, self.eps, float("inf"))) #reciprocal of the magnitude of the member of the support
sum_query = tf.reduce_sum(tf.square(targetembedding), 1, keep_dims=True)
querymagnitude = tf.rsqrt(tf.clip_by_value(sum_query, self.eps, float("inf"))) #reciprocal of the magnitude of the query image
dot_product = tf.matmul(tf.expand_dims(targetembedding,1),tf.expand_dims(supportembedding,2))
dot_product = tf.squeeze(dot_product,[1])
cosine_similarity = dot_product*supportmagnitude*querymagnitude
similarities.append(cosine_similarity)
similarities = tf.concat(axis=1,values=similarities)
softmax_similarities = tf.nn.softmax(similarities)
preds = tf.squeeze(tf.matmul(tf.expand_dims(softmax_similarities,1),inputs[-1]))
preds.set_shape((inputs[0].shape[0],self.nway))
return preds
def l2_batch_normalize(x, epsilon=1e-12, scope=None):
"""
Helper function to normalize a batch of vectors.
:param x: the input placeholder
:param epsilon: stabilizes division
:return: the batch of l2 normalized vector
"""
with tf.name_scope(scope, "l2_batch_normalize") as scope:
x_shape = tf.shape(x)
x = tf.contrib.layers.flatten(x)
x /= (epsilon + tf.reduce_max(tf.abs(x), 1, keep_dims=True))
square_sum = tf.reduce_sum(tf.square(x), 1, keep_dims=True)
x_inv_norm = tf.rsqrt(np.sqrt(epsilon) + square_sum)
x_norm = tf.multiply(x, x_inv_norm)
return tf.reshape(x_norm, x_shape, scope)
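Numerically, each flattened example is first scaled by its maximum absolute value and then by the reciprocal of (roughly) its L2 norm; an illustrative NumPy sketch:

import numpy as np

epsilon = 1e-12
x = np.random.randn(4, 3, 3)                                   # any batch of tensors
flat = x.reshape(x.shape[0], -1)
flat = flat / (epsilon + np.max(np.abs(flat), axis=1, keepdims=True))
square_sum = np.sum(flat ** 2, axis=1, keepdims=True)
flat = flat / np.sqrt(np.sqrt(epsilon) + square_sum)           # rsqrt(sqrt(eps) + sum of squares)
x_norm = flat.reshape(x.shape)
print(np.linalg.norm(x_norm.reshape(4, -1), axis=1))           # each row norm is close to 1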
def layer_norm_all(h,
batch_size,
base,
num_units,
scope='layer_norm',
reuse=False,
gamma_start=1.0,
epsilon=1e-3,
use_bias=True):
"""Layer Norm (faster version, but not using defun)."""
# Performs layer norm on multiple bases at once (i.e., i, g, j, o for an LSTM)
# Reshapes h to perform layer norm in parallel
h_reshape = tf.reshape(h, [batch_size, base, num_units])
mean = tf.reduce_mean(h_reshape, [2], keep_dims=True)
var = tf.reduce_mean(tf.square(h_reshape - mean), [2], keep_dims=True)
epsilon = tf.constant(epsilon)
rstd = tf.rsqrt(var + epsilon)
h_reshape = (h_reshape - mean) * rstd
# reshape back to original
h = tf.reshape(h_reshape, [batch_size, base * num_units])
with tf.variable_scope(scope):
if reuse:
tf.get_variable_scope().reuse_variables()
gamma = tf.get_variable(
'ln_gamma', [4 * num_units],
initializer=tf.constant_initializer(gamma_start))
if use_bias:
beta = tf.get_variable(
'ln_beta', [4 * num_units], initializer=tf.constant_initializer(0.0))
if use_bias:
return gamma * h + beta
return gamma * h
def layer_norm(x,
num_units,
scope='layer_norm',
reuse=False,
gamma_start=1.0,
epsilon=1e-3,
use_bias=True):
"""Calculate layer norm."""
axes = [1]
mean = tf.reduce_mean(x, axes, keep_dims=True)
x_shifted = x - mean
var = tf.reduce_mean(tf.square(x_shifted), axes, keep_dims=True)
inv_std = tf.rsqrt(var + epsilon)
with tf.variable_scope(scope):
if reuse is True:
tf.get_variable_scope().reuse_variables()
gamma = tf.get_variable(
'ln_gamma', [num_units],
initializer=tf.constant_initializer(gamma_start))
if use_bias:
beta = tf.get_variable(
'ln_beta', [num_units], initializer=tf.constant_initializer(0.0))
output = gamma * (x_shifted) * inv_std
if use_bias:
output += beta
return output
def l2_normalize(incoming, dim, epsilon=1e-12, name="l2_normalize"):
""" L2 Normalization.
Normalizes along dimension `dim` using an L2 norm.
For a 1-D tensor with `dim = 0`, computes
output = x / sqrt(max(sum(x**2), epsilon))
For `x` with more dimensions, independently normalizes each 1-D slice along
dimension `dim`.
Arguments:
incoming: `Tensor`. Incoming Tensor.
dim: `int`. Dimension along which to normalize.
epsilon: `float`. A lower bound value for the norm. Will use
`sqrt(epsilon)` as the divisor if `norm < sqrt(epsilon)`.
name: `str`. A name for this layer (optional).
Returns:
A `Tensor` with the same shape as `x`.
"""
with tf.name_scope(name) as name:
x = tf.convert_to_tensor(incoming, name="x")
square_sum = tf.reduce_sum(tf.square(x), [dim], keep_dims=True)
x_inv_norm = tf.rsqrt(tf.maximum(square_sum, epsilon))
return tf.multiply(x, x_inv_norm, name=name)
def batch_norm(x, name="batch_norm"):
eps = 1e-6
with tf.variable_scope(name):
nchannels = x.get_shape()[3]
scale = tf.get_variable("scale", [nchannels], initializer=tf.random_normal_initializer(1.0, 0.02, dtype=tf.float32))
center = tf.get_variable("center", [nchannels], initializer=tf.constant_initializer(0.0, dtype = tf.float32))
ave, dev = tf.nn.moments(x, axes=[1,2], keep_dims=True)
inv_dev = tf.rsqrt(dev + eps)
normalized = (x-ave)*inv_dev * scale + center
return normalized
def _instance_norm(input):
""" Instance Normalization
"""
with tf.variable_scope("instance_norm"):
depth = input.get_shape()[3]
scale = _weights("scale", [depth], mean=1.0)
offset = _biases("offset", [depth])
mean, variance = tf.nn.moments(input, axes=[1,2], keep_dims=True)
epsilon = 1e-5
inv = tf.rsqrt(variance + epsilon)
normalized = (input-mean)*inv
return scale*normalized + offset
def layer_norm(inputs, epsilon=1e-6, dtype=None, scope=None):
""" Layer Normalization
Args:
inputs: A Tensor of shape [..., channel_size]
epsilon: A floating number
dtype: An optional instance of tf.DType
scope: An optional string
Returns:
A Tensor with the same shape as inputs
"""
with tf.variable_scope(scope, default_name="layer_norm", values=[inputs],
dtype=dtype):
channel_size = inputs.get_shape().as_list()[-1]
scale = tf.get_variable("scale", shape=[channel_size],
initializer=tf.ones_initializer())
offset = tf.get_variable("offset", shape=[channel_size],
initializer=tf.zeros_initializer())
mean = tf.reduce_mean(inputs, axis=-1, keep_dims=True)
variance = tf.reduce_mean(tf.square(inputs - mean), axis=-1,
keep_dims=True)
norm_inputs = (inputs - mean) * tf.rsqrt(variance + epsilon)
return norm_inputs * scale + offset
def spatial_batch_norm(input_layer, name='spatial_batch_norm'):
"""
Batch-normalizes the layer as in http://arxiv.org/abs/1502.03167
This is important since it allows the different scales to talk to each other when they get joined.
"""
mean, variance = tf.nn.moments(input_layer, [0, 1, 2])
variance_epsilon = 0.01 # TODO: Check what this value should be
inv = tf.rsqrt(variance + variance_epsilon)
num_channels = input_layer.get_shape().as_list()[3] # TODO: Clean this up
scale = tf.Variable(tf.random_uniform([num_channels]), name='scale') # TODO: How should these initialize?
offset = tf.Variable(tf.random_uniform([num_channels]), name='offset')
return_val = tf.subtract(tf.multiply(tf.multiply(scale, inv), tf.subtract(input_layer, mean)), offset, name=name)
return return_val
def _layer_norm_compute_python(x, epsilon, scale, bias):
"""Layer norm raw computation."""
mean = tf.reduce_mean(x, axis=[-1], keep_dims=True)
variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keep_dims=True)
norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
return norm_x * scale + bias
def BatchClipByL2norm(t, upper_bound, name=None):
"""Clip an array of tensors by L2 norm.
Shrink each dimension-0 slice of tensor (for matrix it is each row) such
that the l2 norm is at most upper_bound. Here we clip each row as it
corresponds to each example in the batch.
Args:
t: the input tensor.
upper_bound: the upperbound of the L2 norm.
name: optional name.
Returns:
the clipped tensor.
"""
assert upper_bound > 0
with tf.name_scope(values=[t, upper_bound], name=name,
default_name="batch_clip_by_l2norm") as name:
saved_shape = tf.shape(t)
batch_size = tf.slice(saved_shape, [0], [1])
t2 = tf.reshape(t, tf.concat(axis=0, values=[batch_size, [-1]]))
upper_bound_inv = tf.fill(tf.slice(saved_shape, [0], [1]),
tf.constant(1.0 / upper_bound))
# Add a small number to avoid divide by 0
l2norm_inv = tf.rsqrt(tf.reduce_sum(t2 * t2, [1]) + 0.000001)
scale = tf.minimum(l2norm_inv, upper_bound_inv) * upper_bound
clipped_t = tf.matmul(tf.diag(scale), t2)
clipped_t = tf.reshape(clipped_t, saved_shape, name=name)
return clipped_t
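The effect is per-row L2 clipping: rows whose norm is already below upper_bound are left unchanged, and longer rows are rescaled down to upper_bound; an illustrative NumPy equivalent:

import numpy as np

def batch_clip_by_l2norm_np(t, upper_bound):
    t2 = t.reshape(t.shape[0], -1)
    l2norm_inv = 1.0 / np.sqrt(np.sum(t2 * t2, axis=1) + 1e-6)        # rsqrt with a small stabilizer
    scale = np.minimum(l2norm_inv, 1.0 / upper_bound) * upper_bound   # 1 for short rows, < 1 for long rows
    return (t2 * scale[:, None]).reshape(t.shape)

t = np.random.randn(5, 10) * 3.0
print(np.linalg.norm(batch_clip_by_l2norm_np(t, 1.0), axis=1))        # all at most ~1.0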
def setUp(self):
super(CoreUnaryOpsTest, self).setUp()
self.ops = [
('abs', operator.abs, tf.abs, core.abs_function),
('neg', operator.neg, tf.neg, core.neg),
# TODO(shoyer): add unary + to core TensorFlow
('pos', None, None, None),
('sign', None, tf.sign, core.sign),
('reciprocal', None, tf.reciprocal, core.reciprocal),
('square', None, tf.square, core.square),
('round', None, tf.round, core.round_function),
('sqrt', None, tf.sqrt, core.sqrt),
('rsqrt', None, tf.rsqrt, core.rsqrt),
('log', None, tf.log, core.log),
('exp', None, tf.exp, core.exp),
('log', None, tf.log, core.log),
('ceil', None, tf.ceil, core.ceil),
('floor', None, tf.floor, core.floor),
('cos', None, tf.cos, core.cos),
('sin', None, tf.sin, core.sin),
('tan', None, tf.tan, core.tan),
('acos', None, tf.acos, core.acos),
('asin', None, tf.asin, core.asin),
('atan', None, tf.atan, core.atan),
('lgamma', None, tf.lgamma, core.lgamma),
('digamma', None, tf.digamma, core.digamma),
('erf', None, tf.erf, core.erf),
('erfc', None, tf.erfc, core.erfc),
('lgamma', None, tf.lgamma, core.lgamma),
]
total_size = np.prod([v.size for v in self.original_lt.axes.values()])
self.test_lt = core.LabeledTensor(
tf.cast(self.original_lt, tf.float32) / total_size,
self.original_lt.axes)
def __call__(self, query, previous_alignments):
'''Score the query based on the keys and values.
Args:
query: Tensor of dtype matching `self.values` and shape
`[batch_size, query_depth]`.
previous_alignments: Tensor of dtype matching `self.values` and shape
`[batch_size, alignments_size]`
(`alignments_size` is memory's `max_time`).
Returns:
alignments: Tensor of dtype matching `self.values` and shape
`[batch_size, alignments_size]` (`alignments_size` is memory's
`max_time`).
'''
with tf.variable_scope(None, 'bahdanau_attention', [query]):
processed_query = self.query_layer(
query) if self.query_layer else query
dtype = processed_query.dtype
# Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
processed_query = tf.expand_dims(processed_query, 1)
if FLAGS.use_conv_feat_att:
conv_feat = tf.nn.conv1d(
tf.expand_dims(previous_alignments, 2),
self.conv_filt, 1, 'SAME')
keys = self._keys
if self._normalize:
# normed_v = g * v / ||v||
normed_v = self.g * self.v * tf.rsqrt(
tf.reduce_sum(tf.square(self.v)))
score = tf.reduce_sum(
normed_v * tf.tanh(keys + processed_query + self.b), [2])
else:
if FLAGS.use_conv_feat_att:
score = tf.reduce_sum(self.v * tf.tanh(keys + processed_query + conv_feat),
[2])
else:
score = tf.reduce_sum(self.v * tf.tanh(keys + processed_query),
[2])
alignments = self._probability_fn(score, previous_alignments)
return alignments
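In the normalized branch, the score is the weight-normalized additive (Bahdanau-style) attention score; an illustrative NumPy sketch of that scoring step (variable names mirror the snippet above, values are made up):

import numpy as np

batch, max_time, depth = 2, 6, 5
keys = np.random.randn(batch, max_time, depth)
processed_query = np.random.randn(batch, 1, depth)   # query expanded for broadcasting
v = np.random.randn(depth)
b = np.zeros(depth)
g = 1.3

normed_v = g * v / np.sqrt(np.sum(v ** 2))           # g * v / ||v||, as with tf.rsqrt above
score = np.sum(normed_v * np.tanh(keys + processed_query + b), axis=2)
print(score.shape)                                   # (2, 6): one score per memory position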
def layer_norm(inputs, epsilon=1e-6, dtype=None, scope=None):
"""
Layer Normalization
:param inputs: A Tensor of shape [..., channel_size]
:param epsilon: A floating number
:param dtype: An optional instance of tf.DType
:param scope: An optional string
:returns: A Tensor with the same shape as inputs
"""
with tf.variable_scope(scope, default_name="layer_norm", values=[inputs],
dtype=dtype):
channel_size = inputs.get_shape().as_list()[-1]
scale = tf.get_variable("scale", shape=[channel_size],
initializer=tf.ones_initializer())
offset = tf.get_variable("offset", shape=[channel_size],
initializer=tf.zeros_initializer())
mean = tf.reduce_mean(inputs, axis=-1, keep_dims=True)
variance = tf.reduce_mean(tf.square(inputs - mean), axis=-1,
keep_dims=True)
norm_inputs = (inputs - mean) * tf.rsqrt(variance + epsilon)
return norm_inputs * scale + offset
def _opsBatchNorm(self, x, m, v, beta, gamma, epsilon,
scale_after_normalization, shift_after_normalization):
y = (x - m) * tf.rsqrt(v + epsilon)
if scale_after_normalization:
y = gamma * y
return y + beta if shift_after_normalization else y
def instance_norm(x, name='instance_norm', reuse=False):
with tf.variable_scope(name, reuse=reuse):
depth = x.get_shape()[3]
scale = tf.get_variable('scale', [depth], initializer=tf.random_normal_initializer(1.0, 0.02))
offset = tf.get_variable('offset', [depth], initializer=tf.constant_initializer(0.0))
mean, variance = tf.nn.moments(x, axes=[1, 2], keep_dims=True)
inv = tf.rsqrt(variance + 1e-5)
normalized = (x - mean) * inv
return scale * normalized + offset
def standardize_images(x):
"""Image standardization on batches (tf.image.per_image_standardization)."""
with tf.name_scope("standardize_images", [x]):
x = tf.to_float(x)
x_mean = tf.reduce_mean(x, axis=[1, 2, 3], keep_dims=True)
x_variance = tf.reduce_mean(
tf.square(x - x_mean), axis=[1, 2, 3], keep_dims=True)
x_shape = shape_list(x)
num_pixels = tf.to_float(x_shape[1] * x_shape[2] * 3)
x = (x - x_mean) / tf.maximum(tf.sqrt(x_variance), tf.rsqrt(num_pixels))
# TODO(lukaszkaiser): remove hack below, needed for greedy decoding for now.
if x.shape and len(x.shape) == 4 and x.shape[3] == 1:
x = tf.concat([x, x, x], axis=3) # Not used, just a dead tf.cond branch.
x.set_shape([None, None, None, 3])
return x
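Per image, the mean is subtracted and the result is divided by the standard deviation, floored at 1/sqrt(num_pixels) so nearly constant images do not blow up; an illustrative NumPy version:

import numpy as np

x = np.random.rand(2, 32, 32, 3) * 255.0
x_mean = x.mean(axis=(1, 2, 3), keepdims=True)
x_var = ((x - x_mean) ** 2).mean(axis=(1, 2, 3), keepdims=True)
num_pixels = float(x.shape[1] * x.shape[2] * 3)
x_std = np.maximum(np.sqrt(x_var), 1.0 / np.sqrt(num_pixels))  # same floor as tf.rsqrt(num_pixels)
standardized = (x - x_mean) / x_std
print(standardized.mean(axis=(1, 2, 3)), standardized.std(axis=(1, 2, 3)))  # ~0 and ~1 per image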
def layer_norm_compute_python(x, epsilon, scale, bias):
"""Layer norm raw computation."""
mean = tf.reduce_mean(x, axis=[-1], keep_dims=True)
variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keep_dims=True)
norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
return norm_x * scale + bias
def simple_attention(target, source, bias=None):
"""A simple attention function.
Args:
target: a `Tensor` with shape `[batch, target_timesteps, depth]` or
`[batch, target_timesteps_1, target_timesteps_2, depth]`
source: a `Tensor` with shape `[batch, source_timesteps, depth]` or
`[batch, source_timesteps_1, source_timesteps_2, depth]`
bias: an optional `Tensor` with shape `[batch, timesteps, 1, 1]` used
to mask the attention to not attend to padding of input.
Returns:
a `Tensor` with same shape as `target`
"""
with tf.name_scope("simple_attention", [target, source]):
target_shape = shape_list(target)
source_shape = shape_list(source)
target = tf.reshape(
target,
[target_shape[0], target_shape[1] * target_shape[2], target_shape[3]])
source = tf.reshape(
source,
[source_shape[0], source_shape[1] * source_shape[2], source_shape[3]])
attention = tf.matmul(target, source, transpose_b=True)
attention *= tf.rsqrt(tf.to_float(shape_list(target)[2]))
if bias is not None:
attention += tf.expand_dims(tf.squeeze(bias, axis=[2, 3]), axis=1)
attention = tf.nn.softmax(attention)
if not tf.get_variable_scope().reuse:
tf.summary.image("attention", tf.expand_dims(attention, 3), max_outputs=5)
attended = tf.matmul(attention, source)
return tf.reshape(attended, target_shape)