def trainable_initial_state(self, batch_size):
    """
    Create a trainable initial state for the SkipLSTMCell
    :param batch_size: number of samples per batch
    :return: SkipLSTMStateTuple
    """
    with tf.variable_scope('initial_c'):
        initial_c = rnn_ops.create_initial_state(batch_size, self._num_units)
    with tf.variable_scope('initial_h'):
        initial_h = rnn_ops.create_initial_state(batch_size, self._num_units)
    with tf.variable_scope('initial_update_prob'):
        initial_update_prob = rnn_ops.create_initial_state(batch_size, 1, trainable=False,
                                                           initializer=tf.ones_initializer())
    with tf.variable_scope('initial_cum_update_prob'):
        initial_cum_update_prob = rnn_ops.create_initial_state(batch_size, 1, trainable=False,
                                                               initializer=tf.zeros_initializer())
    return SkipLSTMStateTuple(initial_c, initial_h, initial_update_prob, initial_cum_update_prob)
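The rnn_ops.create_initial_state helper is not shown on this page. A minimal sketch of such a helper, assuming it builds one (optionally trainable) row with the given initializer and tiles it across the batch, could look like this; the exact signature is an assumption:

def create_initial_state(batch_size, state_size, trainable=True,
                         initializer=tf.zeros_initializer()):
    # Build a single [1, state_size] row (learned when trainable=True) and tile it
    # so that every sample in the batch starts from the same initial state.
    initial_state = tf.get_variable('initial_state', [1, state_size],
                                    trainable=trainable, initializer=initializer)
    return tf.tile(initial_state, [batch_size, 1])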
def trainable_initial_state(self, batch_size):
    """
    Create a trainable initial state for the MultiSkipGRUCell
    :param batch_size: number of samples per batch
    :return: list of tensors and SkipGRUStateTuple
    """
    initial_states = []
    for idx in range(self._num_layers - 1):
        with tf.variable_scope('layer_%d' % (idx + 1)):
            with tf.variable_scope('initial_h'):
                initial_h = rnn_ops.create_initial_state(batch_size, self._num_units[idx])
        initial_states.append(initial_h)
    with tf.variable_scope('layer_%d' % self._num_layers):
        with tf.variable_scope('initial_h'):
            initial_h = rnn_ops.create_initial_state(batch_size, self._num_units[-1])
        with tf.variable_scope('initial_update_prob'):
            initial_update_prob = rnn_ops.create_initial_state(batch_size, 1, trainable=False,
                                                               initializer=tf.ones_initializer())
        with tf.variable_scope('initial_cum_update_prob'):
            initial_cum_update_prob = rnn_ops.create_initial_state(batch_size, 1, trainable=False,
                                                                   initializer=tf.zeros_initializer())
    initial_states.append(SkipGRUStateTuple(initial_h, initial_update_prob, initial_cum_update_prob))
    return initial_states
Source file: a2_layer_norm_residual_conn.py (project: text_classification, author: brightmart)
def layer_normalization(self, x):
    """
    x should be: [batch_size, sequence_length, d_model]
    :return:
    """
    filter = x.get_shape()[-1]  # last dimension of x, e.g. 512
    print("layer_normalization:==================>variable_scope:",
          "layer_normalization" + str(self.layer_index) + self.type)
    with tf.variable_scope("layer_normalization" + str(self.layer_index) + self.type):
        # 1. normalize input by using mean and variance according to last dimension
        mean = tf.reduce_mean(x, axis=-1, keep_dims=True)  # [batch_size, sequence_length, 1]
        variance = tf.reduce_mean(tf.square(x - mean), axis=-1, keep_dims=True)  # [batch_size, sequence_length, 1]
        norm_x = (x - mean) * tf.rsqrt(variance + 1e-6)  # [batch_size, sequence_length, d_model]
        # 2. re-scale normalized input back
        scale = tf.get_variable("layer_norm_scale", [filter], initializer=tf.ones_initializer)  # [filter]
        bias = tf.get_variable("layer_norm_bias", [filter], initializer=tf.ones_initializer)  # [filter]
        output = norm_x * scale + bias  # [batch_size, sequence_length, d_model]
    return output  # [batch_size, sequence_length, d_model]
Source file: p1_HierarchicalAttention_model_transformer.py (project: text_classification, author: brightmart)
def layer_normalization(self, x, scope):
    """
    x should be: [batch_size, sequence_length, d_model]
    :return: [batch_size, sequence_length, d_model]
    """
    filter = x.get_shape()[-1]  # last dimension of x, e.g. 512
    with tf.variable_scope("layer_normalization" + scope):
        # 1. normalize input by using mean and variance according to last dimension
        mean = tf.reduce_mean(x, axis=-1, keep_dims=True)  # [batch_size, sequence_length, 1]
        variance = tf.reduce_mean(tf.square(x - mean), axis=-1, keep_dims=True)  # [batch_size, sequence_length, 1]
        norm_x = (x - mean) * tf.rsqrt(variance + 1e-6)  # [batch_size, sequence_length, d_model]
        # 2. re-scale normalized input back
        scale = tf.get_variable("layer_norm_scale", [filter], initializer=tf.ones_initializer)  # [filter]
        bias = tf.get_variable("layer_norm_bias", [filter], initializer=tf.ones_initializer)  # [filter]
        output = norm_x * scale + bias  # [batch_size, sequence_length, d_model]
    return output  # [batch_size, sequence_length, d_model]
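For reference, the same math can be exercised outside these classes. The standalone sketch below checks that each position ends up with roughly zero mean; it uses hypothetical shapes, and it initializes the bias with zeros, the more common choice (the two methods above initialize it with ones):

import tensorflow as tf
import numpy as np

def layer_norm_demo(x, scope):
    # Same computation as above: normalize over the last axis, then re-scale and shift.
    d_model = x.get_shape()[-1]
    with tf.variable_scope(scope):
        mean = tf.reduce_mean(x, axis=-1, keep_dims=True)
        variance = tf.reduce_mean(tf.square(x - mean), axis=-1, keep_dims=True)
        norm_x = (x - mean) * tf.rsqrt(variance + 1e-6)
        scale = tf.get_variable("scale", [d_model], initializer=tf.ones_initializer())
        bias = tf.get_variable("bias", [d_model], initializer=tf.zeros_initializer())
        return norm_x * scale + bias

x = tf.constant(np.random.randn(2, 5, 8), dtype=tf.float32)  # [batch, seq_len, d_model]
y = layer_norm_demo(x, "demo")
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(y).mean(axis=-1))  # close to 0 for every position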
def __init__(self, lr, s_size, a_size):
    self.state_in = tf.placeholder(shape=[1], dtype=tf.int32)
    state_in_OH = slim.one_hot_encoding(self.state_in, s_size)
    output = slim.fully_connected(state_in_OH,
                                  a_size,
                                  biases_initializer=None,
                                  activation_fn=tf.nn.sigmoid,
                                  weights_initializer=tf.ones_initializer())
    self.output = tf.reshape(output, [-1])
    self.chosen_action = tf.argmax(self.output, 0)
    self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
    self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
    self.responsible_weight = tf.slice(self.output, self.action_holder, [1])
    self.loss = -(tf.log(self.responsible_weight) * self.reward_holder)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
    self.update = optimizer.minimize(self.loss)
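A minimal training-loop sketch for this contextual-bandit agent. The class name Agent and the environment call pullBandit(state, action) are hypothetical placeholders, not part of the snippet above:

import numpy as np

tf.reset_default_graph()
agent = Agent(lr=0.001, s_size=3, a_size=4)  # class name assumed
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for episode in range(1000):
        state = np.random.randint(0, 3)  # pick a context
        action = sess.run(agent.chosen_action, feed_dict={agent.state_in: [state]})
        reward = pullBandit(state, action)  # hypothetical environment call returning a scalar
        sess.run(agent.update, feed_dict={agent.state_in: [state],
                                          agent.reward_holder: [reward],
                                          agent.action_holder: [action]})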
def add_param(self, spec, shape, name, **kwargs):
    param = self.add_param_plain(spec, shape, name, **kwargs)
    if name is not None and name.startswith("W") and self.weight_normalization:
        # Hacky: check if the parameter is a weight matrix. If so, apply weight normalization
        if len(param.get_shape()) == 2:
            v = param
            g = self.add_param_plain(tf.ones_initializer, (shape[1],), name=name + "_wn/g")
            param = v * (tf.reshape(g, (1, -1)) / tf.sqrt(tf.reduce_sum(tf.square(v), 0, keep_dims=True)))
        elif len(param.get_shape()) == 4:
            v = param
            g = self.add_param_plain(tf.ones_initializer, (shape[3],), name=name + "_wn/g")
            param = v * (tf.reshape(g, (1, 1, 1, -1)) / tf.sqrt(tf.reduce_sum(tf.square(v), [0, 1, 2],
                                                                              keep_dims=True)))
        else:
            raise NotImplementedError
    return param
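The same reparameterization (w = g * v / ||v||, with the gain g starting at one) can be written without the layer machinery. A standalone sketch for a dense layer, with all names hypothetical:

def weight_normalized_dense(x, num_outputs):
    num_inputs = x.get_shape()[-1].value
    # Direction matrix v and a per-output gain g initialized to ones.
    v = tf.get_variable("V", [num_inputs, num_outputs],
                        initializer=tf.random_normal_initializer(stddev=0.05))
    g = tf.get_variable("g", [num_outputs], initializer=tf.ones_initializer())
    # Normalize each column of v over the input axis, then rescale by g.
    w = v * (tf.reshape(g, (1, -1)) /
             tf.sqrt(tf.reduce_sum(tf.square(v), 0, keep_dims=True)))
    return tf.matmul(x, w)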
def apply_ln(layer):
    def _normalize(x, prefix):
        EPS = 1e-5
        dim = x.get_shape()[-1].value
        bias_name = prefix + "_ln/bias"
        scale_name = prefix + "_ln/scale"
        if bias_name not in layer.norm_params:
            layer.norm_params[bias_name] = layer.add_param(
                tf.zeros_initializer, (dim,), name=bias_name, regularizable=False)
        if scale_name not in layer.norm_params:
            layer.norm_params[scale_name] = layer.add_param(
                tf.ones_initializer, (dim,), name=scale_name)
        bias = layer.norm_params[bias_name]
        scale = layer.norm_params[scale_name]
        mean, var = tf.nn.moments(x, axes=[1], keep_dims=True)
        x_normed = (x - mean) / tf.sqrt(var + EPS)
        return x_normed * scale + bias
    return _normalize
def scalar_gating(
        net,
        activation=tf.nn.relu,
        k_initializer=tf.ones_initializer(),
        k_regularizer=None,
        k_regularizable=False,
):
    # Represent this with shape (1,) instead of as a scalar to get proper
    # parameter count from tfprof.
    k = tf.get_variable(
        'k',
        (1,),
        initializer=k_initializer,
        regularizer=k_regularizer,
        trainable=True,
    )
    # Per the paper, we may specifically not want to regularize k.
    k.regularizable = k_regularizable
    return activation(k) * net
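Usage is just a matter of wrapping a tensor inside a variable scope so each gated branch gets its own k. A short sketch with hypothetical placeholder tensors and scope name:

trunk_output = tf.placeholder(tf.float32, [None, 128])   # hypothetical trunk features
branch_output = tf.placeholder(tf.float32, [None, 128])  # hypothetical branch features
with tf.variable_scope('residual_branch'):
    # k starts at 1.0 (ones_initializer), so the branch initially passes through at full strength.
    gated = scalar_gating(branch_output)
output = trunk_output + gated  # e.g. gating a residual branch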
def add_param(self, spec, shape, name, **kwargs):
    param = self.add_param_plain(spec, shape, name, **kwargs)
    if name is not None and name.startswith("W") and self.weight_normalization:
        # Hacky: check if the parameter is a weight matrix. If so, apply weight normalization
        if len(param.get_shape()) == 2:
            v = param
            g = self.add_param_plain(tf.ones_initializer(), (shape[1],), name=name + "_wn/g")
            param = v * (tf.reshape(g, (1, -1)) / tf.sqrt(tf.reduce_sum(tf.square(v), 0, keep_dims=True)))
        elif len(param.get_shape()) == 4:
            v = param
            g = self.add_param_plain(tf.ones_initializer(), (shape[3],), name=name + "_wn/g")
            param = v * (tf.reshape(g, (1, 1, 1, -1)) / tf.sqrt(tf.reduce_sum(tf.square(v), [0, 1, 2],
                                                                              keep_dims=True)))
        else:
            raise NotImplementedError
    return param
def apply_ln(layer):
    def _normalize(x, prefix):
        EPS = 1e-5
        dim = x.get_shape()[-1].value
        bias_name = prefix + "_ln/bias"
        scale_name = prefix + "_ln/scale"
        if bias_name not in layer.norm_params:
            layer.norm_params[bias_name] = layer.add_param(
                ZerosInitializer(), (dim,), name=bias_name, regularizable=False)
        if scale_name not in layer.norm_params:
            layer.norm_params[scale_name] = layer.add_param(
                tf.ones_initializer(), (dim,), name=scale_name)
        bias = layer.norm_params[bias_name]
        scale = layer.norm_params[scale_name]
        mean, var = tf.nn.moments(x, axes=[1], keep_dims=True)
        x_normed = (x - mean) / tf.sqrt(var + EPS)
        return x_normed * scale + bias
    return _normalize
def batch_norm(inputs, cts, ldc, epsilon=0.001, bOffset=True, bScale=True, reuse=None, decay=0.999, is_training=True):
    name = get_name('bn', cts)
    with tf.variable_scope(name, reuse=reuse):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]
        axis = list(range(len(inputs_shape) - 1))
        offset, scale = None, None
        if bOffset:
            offset = tf.get_variable('offset', shape=params_shape, trainable=True, initializer=tf.zeros_initializer())
        if bScale:
            scale = tf.get_variable('scale', shape=params_shape, trainable=True, initializer=tf.ones_initializer())
        batch_mean, batch_variance = tf.nn.moments(inputs, axis)
        outputs = tf.nn.batch_normalization(inputs, batch_mean, batch_variance, offset, scale, epsilon)
    # Note: to keep training fast, no moving averages are tracked for test time here.
    ldc.append(name + ' offset:' + str(bOffset) + ' scale:' + str(bScale))
    return outputs
def batch_norm(inputs, cts, ldc, bOffset=True, bScale=True, epsilon=0.001, reuse=None, decay=0.999, is_training=True):
    name = get_name('bn', cts)
    with tf.variable_scope(name, reuse=reuse):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]
        axis = list(range(len(inputs_shape) - 1))
        offset, scale = None, None
        if bOffset:
            offset = tf.get_variable('offset', shape=params_shape, trainable=True, initializer=tf.zeros_initializer())
        if bScale:
            scale = tf.get_variable('scale', shape=params_shape, trainable=True, initializer=tf.ones_initializer())
        batch_mean, batch_variance = tf.nn.moments(inputs, axis)
        outputs = tf.nn.batch_normalization(inputs, batch_mean, batch_variance, offset, scale, epsilon)
    # Note: to keep training fast, no moving averages are tracked for test time here.
    ldc.append(name + ' offset:' + str(bOffset) + ' scale:' + str(bScale))
    return outputs
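Both batch_norm variants use only the current batch statistics. When inference-time statistics are needed, the usual pattern is to track moving averages; the self-contained sketch below is not taken from either project and assumes channels-last inputs:

def batch_norm_with_moving_stats(inputs, decay=0.999, epsilon=0.001, is_training=True, scope='bn_ma'):
    with tf.variable_scope(scope):
        params_shape = inputs.get_shape()[-1:]
        axis = list(range(len(inputs.get_shape()) - 1))
        offset = tf.get_variable('offset', shape=params_shape, initializer=tf.zeros_initializer())
        scale = tf.get_variable('scale', shape=params_shape, initializer=tf.ones_initializer())
        # Non-trainable running statistics, updated only during training.
        moving_mean = tf.get_variable('moving_mean', shape=params_shape, trainable=False,
                                      initializer=tf.zeros_initializer())
        moving_variance = tf.get_variable('moving_variance', shape=params_shape, trainable=False,
                                          initializer=tf.ones_initializer())
        if is_training:
            batch_mean, batch_variance = tf.nn.moments(inputs, axis)
            update_mean = tf.assign(moving_mean, decay * moving_mean + (1 - decay) * batch_mean)
            update_variance = tf.assign(moving_variance, decay * moving_variance + (1 - decay) * batch_variance)
            with tf.control_dependencies([update_mean, update_variance]):
                return tf.nn.batch_normalization(inputs, batch_mean, batch_variance, offset, scale, epsilon)
        return tf.nn.batch_normalization(inputs, moving_mean, moving_variance, offset, scale, epsilon)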
def normalization(inputs, epsilon=1e-3, has_shift=True, has_scale=True,
                  activation_fn=None, scope='normalization'):
    with tf.variable_scope(scope):
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        axis = list(range(inputs_rank - 1))
        mean, variance = tf.nn.moments(inputs, axis)
        shift, scale = None, None
        if has_shift:
            shift = tf.get_variable('shift',
                                    shape=inputs_shape[-1:],
                                    dtype=inputs.dtype,
                                    initializer=tf.zeros_initializer)
        if has_scale:
            scale = tf.get_variable('scale',
                                    shape=inputs_shape[-1:],
                                    dtype=inputs.dtype,
                                    initializer=tf.ones_initializer)
        x = tf.nn.batch_normalization(inputs, mean, variance, shift, scale, epsilon)
    return x if activation_fn is None else activation_fn(x)
Source file: rnn_cell_modern.py (project: tensorflow_with_latest_papers, author: NickShahML)
def _inner_function(self, inputs, past_hidden_state, activation=tf.nn.tanh):
    """Second-order function, as described in equation 11 of the delta-RNN paper.
    The main goal of this function is to produce z_t.
    """
    V_x_d = linear(past_hidden_state, self._num_units, True)
    # We make this a private variable to be reused in the _outer_function
    self._W_x_inputs = linear(inputs, self._num_units, True)
    alpha = tf.get_variable("alpha", [self._num_units], dtype=tf.float32, initializer=tf.ones_initializer)
    beta_one = tf.get_variable("beta_one", [self._num_units], dtype=tf.float32, initializer=tf.ones_initializer)
    beta_two = tf.get_variable("beta_two", [self._num_units], dtype=tf.float32, initializer=tf.ones_initializer)
    z_t_bias = tf.get_variable("z_t_bias", [self._num_units], dtype=tf.float32, initializer=tf.zeros_initializer)
    # Second-order cell calculations
    d_1_t = alpha * V_x_d * self._W_x_inputs
    d_2_t = beta_one * V_x_d + beta_two * self._W_x_inputs
    z_t = activation(d_1_t + d_2_t + z_t_bias)
    return z_t
def call(self, inputs, state):
    """Gated recurrent unit (GRU) with nunits cells."""
    with tf.variable_scope('layer_normalization'):
        gain1 = tf.get_variable('gain1', shape=[2 * self._num_units], initializer=tf.ones_initializer())
        bias1 = tf.get_variable('bias1', shape=[2 * self._num_units], initializer=tf.zeros_initializer())
        gain2 = tf.get_variable('gain2', shape=[self._num_units], initializer=tf.ones_initializer())
        bias2 = tf.get_variable('bias2', shape=[self._num_units], initializer=tf.zeros_initializer())
    with vs.variable_scope("gates"):  # Reset gate and update gate.
        # We start with bias of 1.0 to not reset and not update.
        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            dtype = [a.dtype for a in [inputs, state]][0]
            bias_ones = tf.constant_initializer(1.0, dtype=dtype)
        value = tf.nn.sigmoid(ln(
            _linear([inputs, state], 2 * self._num_units, True, bias_ones,
                    self._kernel_initializer), gain1, bias1))
        r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
    with vs.variable_scope("candidate"):
        c = self._activation(ln(
            _linear([inputs, r * state], self._num_units, True,
                    self._bias_initializer, self._kernel_initializer), gain2, bias2))
    new_h = u * state + (1 - u) * c
    return new_h, new_h
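The ln() helper called above is not shown on this page. A plausible sketch, under the assumption that it layer-normalizes over the feature axis and then applies the supplied gain and bias:

def ln(tensor, gain, bias, epsilon=1e-5):
    # Normalize each row across its features, then apply the learned gain and bias.
    mean, variance = tf.nn.moments(tensor, [1], keep_dims=True)
    normalized = (tensor - mean) / tf.sqrt(variance + epsilon)
    return normalized * gain + bias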
def create_graph(device0, device1):
    """Create graph that keeps var1 on device0, var2 on device1 and adds them"""
    tf.reset_default_graph()
    dtype = tf.int32
    params_size = 250 * 1000 * FLAGS.data_mb  # 1MB is 250k integers
    with tf.device(device0):
        var1 = tf.get_variable("var1", [params_size], dtype,
                               initializer=tf.ones_initializer())
    with tf.device(device1):
        var2 = tf.get_variable("var2", [params_size], dtype,
                               initializer=tf.ones_initializer())
    add_op = var1.assign_add(var2)
    init_op = tf.global_variables_initializer()
    return init_op, add_op
def create_graph(device1, device2):
    """Create graph that keeps variable on device1 and
    vector of ones/addition op on device2"""
    tf.reset_default_graph()
    dtype = tf.int32
    params_size = 250 * 1000 * FLAGS.data_mb  # 1MB is 250k integers
    with tf.device(device1):
        params = tf.get_variable("params", [params_size], dtype,
                                 initializer=tf.zeros_initializer)
    with tf.device(device2):
        # constant node gets placed on device1 because of simple_placer
        # update = tf.constant(1, shape=[params_size], dtype=dtype)
        update = tf.get_variable("update", [params_size], dtype,
                                 initializer=tf.ones_initializer)
    add_op = params.assign_add(update)
    init_op = tf.initialize_all_variables()
    return init_op, add_op
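Either create_graph variant can be exercised the same way. A minimal driver sketch; the device strings depend on the machine and are assumptions:

import time

init_op, add_op = create_graph('/cpu:0', '/gpu:0')
with tf.Session() as sess:
    sess.run(init_op)
    start = time.time()
    sess.run(add_op)  # forces the variable contents to move between the two devices
    elapsed = time.time() - start
    print('transferred %.1f MB in %.3f s' % (FLAGS.data_mb, elapsed))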
def __init__(self, lr, s_size, a_size):
    # These lines establish the feed-forward part of the network. The agent takes a state and produces an action.
    self.state_in = tf.placeholder(shape=[1], dtype=tf.int32)
    state_in_OH = slim.one_hot_encoding(self.state_in, s_size)
    output = slim.fully_connected(state_in_OH, a_size,
                                  biases_initializer=None, activation_fn=tf.nn.sigmoid,
                                  weights_initializer=tf.ones_initializer())
    self.output = tf.reshape(output, [-1])
    self.chosen_action = tf.argmax(self.output, 0)
    # The next six lines establish the training procedure. We feed the reward and chosen action into the network
    # to compute the loss, and use it to update the network.
    self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
    self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
    self.responsible_weight = tf.slice(self.output, self.action_holder, [1])
    self.loss = -(tf.log(self.responsible_weight) * self.reward_holder)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
    self.update = optimizer.minimize(self.loss)
def testWhileLoopProblem(self):
    """Tests L2L applied to problem with while loop."""
    def while_loop_problem():
        x = tf.get_variable("x", shape=[], initializer=tf.ones_initializer())
        # Strange way of squaring the variable.
        _, x_squared = tf.while_loop(
            cond=lambda t, _: t < 1,
            body=lambda t, x: (t + 1, x * x),
            loop_vars=(0, x),
            name="loop")
        return x_squared

    optimizer = meta.MetaOptimizer(net=dict(
        net="CoordinateWiseDeepLSTM",
        net_options={"layers": ()}))
    minimize_ops = optimizer.meta_minimize(while_loop_problem, 3)
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        train(sess, minimize_ops, 1, 2)
def batchnorm(input, orig_graph, is_training):
    return tfl.batch_norm(
        input,
        decay=0.9,
        scale=True,
        epsilon=1E-5,
        activation_fn=None,
        param_initializers={
            'beta': get_val_or_initializer(orig_graph,
                                           tf.constant_initializer(0.),
                                           'BatchNorm/beta'),
            'gamma': get_val_or_initializer(orig_graph,
                                            tf.random_normal_initializer(1.0, 0.02),
                                            'BatchNorm/gamma'),
            'moving_mean': get_val_or_initializer(orig_graph,
                                                  tf.constant_initializer(0.),
                                                  'BatchNorm/moving_mean'),
            'moving_variance': get_val_or_initializer(orig_graph,
                                                      tf.ones_initializer(),
                                                      'BatchNorm/moving_variance')
        },
        is_training=is_training,
        fused=True,  # new implementation with a fused kernel => speedup.
    )
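get_val_or_initializer() is not defined on this page. A hedged sketch of what such a helper might do, assuming orig_graph is a dict-like mapping of variable names to previously captured numpy values, is to reuse a captured value when one matches and otherwise fall back to the default initializer:

def get_val_or_initializer(orig_graph, default_initializer, var_suffix):
    # Hypothetical helper: prefer a value saved from an earlier graph, else use the default.
    if orig_graph is not None:
        for name, value in orig_graph.items():
            if name.endswith(var_suffix):
                return tf.constant_initializer(value)
    return default_initializer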
def instance_norm(x,
                  shift=True,
                  scale=True,
                  eps=1e-3,
                  scope=None,
                  reuse=None):
    # Expect a 4-D Tensor
    C = x._shape_as_list()[-1]
    with tf.variable_scope(scope, 'instance_norm', reuse=reuse):
        # Get mean and variance, normalize input
        m, v = tf.nn.moments(x, [1, 2], keep_dims=True)
        output = (x - m) * tf.rsqrt(v + eps)
        if scale:
            output *= tf.get_variable('gamma', C, initializer=tf.ones_initializer)
        if shift:
            output += tf.get_variable('beta', C, initializer=tf.zeros_initializer)
    return output
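A quick usage sketch on a 4-D image batch; the shapes and scope names are hypothetical:

images = tf.placeholder(tf.float32, [None, 64, 64, 3])
# gamma starts at 1 and beta at 0, so the op is initially a pure per-image normalization.
normalized = instance_norm(images, scope='in1')
scaled_only = instance_norm(images, scope='in2', shift=False)  # scale-only variant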
def lookup_shift(x,
                 context,
                 shift=True,
                 scale=True,
                 scope=None,
                 reuse=None):
    B = context._shape_as_list()[-1]
    C = x._shape_as_list()[-1]
    ndim = len(x.shape)
    var_shape = [B] + [1] * (ndim - 2) + [C]
    with tf.variable_scope(scope, 'lookup_shift', reuse=reuse):
        output = x
        ids = tf.argmax(context, -1)
        if scale:
            gamma = tf.get_variable('gamma', var_shape, initializer=tf.ones_initializer)
            output *= tf.nn.embedding_lookup(gamma, ids)
        if shift:
            beta = tf.get_variable('beta', var_shape, initializer=tf.zeros_initializer)
            output += tf.nn.embedding_lookup(beta, ids)
    return output
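Modulation is selected per example by the one-hot context tensor, whose argmax indexes the gamma/beta rows. A short usage sketch with hypothetical shapes:

features = tf.placeholder(tf.float32, [None, 32, 32, 64])  # feature maps to modulate
context = tf.placeholder(tf.float32, [None, 10])            # one-hot context, B = 10 styles
# Looks up the gamma/beta row for each example's context and applies them channel-wise.
modulated = lookup_shift(features, context, scope='lookup_shift_demo')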