def test_gt_grad():
"""A user test that failed.
Something about it made Elemwise.grad return something that was
too complicated for get_scalar_constant_value to recognize as being 0, so
gradient.grad reported that it was not a valid gradient of an
integer.
"""
floatX = config.floatX
T = theano.tensor
input_ = T.vector(dtype=floatX)
random_values = numpy.random.RandomState(1234).uniform(
low=-1, high=1, size=(2, 2))
W_values = numpy.asarray(random_values, dtype=floatX)
W = theano.shared(value=W_values, name='weights')
correct_score = T.dot(input_, W)
wrong_input = T.vector(dtype=floatX)
wrong_score = theano.clone(correct_score, {input_: wrong_input})
# Hinge loss
scores = T.ones_like(correct_score) - correct_score + wrong_score
cost = (scores * (scores > 0)).sum()
T.grad(cost, input_)
def infer_shape(self, node, shapes):
out_shp = theano.scan_module.scan_utils.infer_shape(self.new_outputs,
self.new_inputs,
shapes)
# Clone the output shapes so that they are computed from the outer inputs.
# Note:
# We could do this more simply with:
# ret = [theano.clone(shp, replace=repl) for shp in out_shp]
# but cloning several times could duplicate common subgraphs shared by
# the shape expressions. The Theano optimizer would clean this up later,
# but that would be extra work for the optimizer.
repl = dict(zip(self.new_inputs, node.inputs))
cloned = theano.clone(reduce(tuple.__add__, out_shp), replace=repl)
ret = []
used = 0
for i in range(len(out_shp)):
nb = len(out_shp[i])
ret.append(cloned[used: used + nb])
used += nb
return ret
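# Hedged illustration (not from the original source): the comment above prefers a single
# theano.clone over the concatenated shape tuple so that subgraphs shared by several
# output shapes are cloned only once. A minimal sketch with made-up variables:
def _clone_shapes_once_example():
    from functools import reduce
    import theano
    import theano.tensor as T
    inner = T.matrix('inner')
    outer = T.matrix('outer')
    out_shp = [(inner.shape[0],), (inner.shape[0], inner.shape[1])]
    # one clone over the flattened tuple keeps the common inner.shape subgraph shared
    cloned = theano.clone(reduce(tuple.__add__, out_shp), replace={inner: outer})
    # the alternative, one clone per shape, would duplicate that subgraph:
    # [theano.clone(shp, replace={inner: outer}) for shp in out_shp]
    return cloned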
def reconstruct_graph(inputs, outputs, tag=None):
"""
Different interface to clone, that allows you to pass inputs.
Compared to clone, this method always replaces the inputs with
new variables of the same type, and returns those (in the same
order as the original inputs).
"""
if tag is None:
tag = ''
nw_inputs = [safe_new(x, tag) for x in inputs]
givens = OrderedDict()
for nw_x, x in izip(nw_inputs, inputs):
givens[x] = nw_x
allinputs = theano.gof.graph.inputs(outputs)
for inp in allinputs:
if isinstance(inp, theano.Constant):
givens[inp] = inp.clone()
nw_outputs = clone(outputs, replace=givens)
return (nw_inputs, nw_outputs)
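# Hedged usage sketch (assumed, not from the original source): reconstruct_graph returns
# fresh input variables together with the outputs rewritten over them, e.g.:
def _reconstruct_graph_example():
    import numpy
    import theano
    import theano.tensor as T
    x = T.vector('x')
    y = T.vector('y')
    out = (x + y) ** 2
    # new_inputs are new variables of the same types as x and y; new_outputs is the
    # same expression, but expressed in terms of new_inputs
    new_inputs, new_outputs = reconstruct_graph([x, y], [out], tag='_copy')
    f = theano.function(new_inputs, new_outputs)
    a = numpy.ones(3, dtype=x.dtype)
    return f(a, a)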
def test_cloning_no_replace_strict_copy_inputs(self):
# This has nothing to do with scan, but it refers to the clone
# function that scan uses internally and that pfunc uses now and
# that users might want to use
x = theano.tensor.vector('x')
y = theano.tensor.vector('y')
z = theano.shared(0.25)
f1 = z * (x + y) ** 2 + 5
f2 = theano.clone(f1,
replace=None,
strict=True,
share_inputs=True)
f2_inp = theano.gof.graph.inputs([f2])
assert z in f2_inp
assert x in f2_inp
assert y in f2_inp
def test_cloning_no_replace_strict_not_copy_inputs(self):
# This has nothing to do with scan, but it refers to the clone
# function that scan uses internally and that pfunc uses now and
# that users might want to use
x = theano.tensor.vector('x')
y = theano.tensor.vector('y')
z = theano.shared(0.25)
f1 = z * (x + y) ** 2 + 5
f2 = theano.clone(f1,
replace=None,
strict=True,
share_inputs=False)
f2_inp = theano.gof.graph.inputs([f2])
assert z not in f2_inp
assert x not in f2_inp
assert y not in f2_inp
def test_cloning_replace_not_strict_copy_inputs(self):
# This has nothing to do with scan, but it refers to the clone
# function that scan uses internally and that pfunc uses now and
# that users might want to use
x = theano.tensor.vector('x')
y = theano.tensor.fvector('y')
y2 = theano.tensor.dvector('y2')
z = theano.shared(0.25)
f1 = z * (x + y) ** 2 + 5
f2 = theano.clone(f1,
replace=OrderedDict([(y, y2)]),
strict=False,
share_inputs=True)
f2_inp = theano.gof.graph.inputs([f2])
assert z in f2_inp
assert x in f2_inp
assert y2 in f2_inp
def test_cloning_replace_strict_not_copy_inputs(self):
# This has nothing to do with scan, but it refers to the clone
# function that scan uses internally and that pfunc uses now and
# that users might want to use
x = theano.tensor.vector('x')
y = theano.tensor.vector('y')
y2 = theano.tensor.vector('y2')
z = theano.shared(0.25)
f1 = z * (x + y) ** 2 + 5
f2 = theano.clone(f1,
replace=[(y, y2)],
strict=True,
share_inputs=False)
f2_inp = theano.gof.graph.inputs([f2])
assert z not in f2_inp
assert x not in f2_inp
assert y2 not in f2_inp
def test_cloning_replace_not_strict_not_copy_inputs(self):
# This has nothing to do with scan, but it refers to the clone
# function that scan uses internally and that pfunc uses now and
# that users might want to use
x = theano.tensor.vector('x')
y = theano.tensor.fvector('y')
y2 = theano.tensor.dvector('y2')
z = theano.shared(0.25)
f1 = z * (x + y) ** 2 + 5
f2 = theano.clone(f1,
replace=[(y, y2)],
strict=False,
share_inputs=False)
f2_inp = theano.gof.graph.inputs([f2])
assert z not in f2_inp
assert x not in f2_inp
assert y2 not in f2_inp
# TEST RE-ordering of inputs
# some rnn with multiple outputs and multiple inputs; other
# dimension instead of scalars/vectors
def clone(**new_inputs):
new_obj = utils.copy(self)
# Reorder inputs
assert len(new_obj.inputs) == len(new_inputs.items())
pairs = [(x, new_inputs[x.name]) for x in inputs]
new_obj.inputs = new_inputs.values()
new_obj.out = theano.clone(new_obj.out, replace=pairs)
if hasattr(new_obj, 'cost'):
new_obj.cost = theano.clone(new_obj.cost, replace=pairs)
if hasattr(new_obj, 'grads'):
new_obj.grads = theano.clone(new_obj.grads, replace=pairs)
if hasattr(new_obj, 'sample'):
new_obj.sample = theano.clone(new_obj.sample, replace=pairs)
return new_obj
def clone(**new_inputs):
new_obj = utils.copy(self)
# Reorder inputs
assert len(new_obj.inputs) == len(new_inputs.items())
pairs = [(x, new_inputs[x.name]) for x in inputs]
new_obj.inputs = new_inputs.values()
new_obj.out = theano.clone(new_obj.out, replace=pairs)
if hasattr(new_obj, 'cost'):
new_obj.cost = theano.clone(new_obj.cost, replace=pairs)
if hasattr(new_obj, 'grads'):
new_obj.grads = theano.clone(new_obj.grads, replace=pairs)
if hasattr(new_obj, 'sample'):
new_obj.sample = theano.clone(new_obj.sample, replace=pairs)
return new_obj
def __call__(self, cost, params):
grads = T.grad(cost=cost ,wrt=params)
updates = []
for p, g in zip(params, grads):
v = theano.shared(p.get_value() * 0.)
new_v = self.mu * v + self.lr * theano.clone(g, replace = {p: p - self.mu * v})
updates.append((v, new_v))
updates.append((p, p - new_v))
return updates
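# Hedged sketch (not from the original source): the __call__ above is a Nesterov-style
# momentum update in which theano.clone re-evaluates the gradient at the look-ahead
# point p - mu * v. The same trick in isolation:
def _lookahead_gradient_example(lr=0.1, mu=0.9):
    import numpy
    import theano
    import theano.tensor as T
    w = theano.shared(numpy.zeros(3, dtype=theano.config.floatX), name='w')
    v = theano.shared(w.get_value() * 0.)
    x = T.vector('x')
    cost = (T.dot(x, w) - 1.0) ** 2
    g = T.grad(cost, w)
    # gradient evaluated at the look-ahead parameters w - mu * v
    g_ahead = theano.clone(g, replace={w: w - mu * v})
    new_v = mu * v + lr * g_ahead
    step = theano.function([x], cost, updates=[(v, new_v), (w, w - new_v)])
    return step(numpy.ones(3, dtype=theano.config.floatX))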
def get_output_for(self, input, deterministic=False, **kwargs):
if deterministic or self.fixed:
# use stored mean and std
mean = self.mean
std = self.std
else:
# use this batch's mean and std
mean = input.mean(self.axes, keepdims=True)
#std = input.std(self.axes, keepdims=True)
std = (input.var(self.axes, keepdims=True)+self.epsilon).sqrt()
# and update the stored mean and std:
# we create (memory-aliased) clones of the stored mean and std
running_mean = theano.clone(self.mean, share_inputs=False)
running_std = theano.clone(self.std, share_inputs=False)
# set a default update for them
running_mean.default_update = ((1 - self.alpha) * running_mean +
self.alpha * mean)
running_std.default_update = ((1 - self.alpha) * running_std +
self.alpha * std)
# and include them in the graph so their default updates will be
# applied (although the expressions will be optimized away later)
mean += 0 * running_mean
std += 0 * running_std
#std += self.epsilon
mean = T.addbroadcast(mean, *self.axes)
std = T.addbroadcast(std, *self.axes)
beta = T.addbroadcast(self.beta, *self.axes)
gamma = T.addbroadcast(self.gamma, *self.axes)
# normalized = (input - mean) * (gamma / std) + beta
normalized = (input - mean) / std
if self.rescale:
normalized = normalized * gamma + beta
return self.nonlinearity(normalized)
def get_output_for(self, input, deterministic=False, **kwargs):
if deterministic:
# use stored mean and std
mean = self.mean
std = self.std
else:
# use this batch's mean and std
mean = input.mean(self.axes, keepdims=True)
std = input.std(self.axes, keepdims=True)
# and update the stored mean and std:
# we create (memory-aliased) clones of the stored mean and std
running_mean = theano.clone(self.mean, share_inputs=False)
running_std = theano.clone(self.std, share_inputs=False)
# set a default update for them
running_mean.default_update = ((1 - self.alpha) * running_mean +
self.alpha * mean)
running_std.default_update = ((1 - self.alpha) * running_std +
self.alpha * std)
# and include them in the graph so their default updates will be
# applied (although the expressions will be optimized away later)
mean += 0 * running_mean
std += 0 * running_std
std += self.epsilon
mean = T.addbroadcast(mean, *self.axes)
std = T.addbroadcast(std, *self.axes)
beta = T.addbroadcast(self.beta, *self.axes)
gamma = T.addbroadcast(self.gamma, *self.axes)
normalized = (input - mean) * (gamma / std) + beta
return self.nonlinearity(normalized)
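# Hedged sketch (not from the original source): the running-average updates above rely on
# a Lasagne trick -- theano.clone(shared_var, share_inputs=False) gives a memory-aliased
# copy whose default_update can be set, and the update is applied whenever that copy ends
# up in a compiled graph. The trick in isolation:
def _default_update_trick_example(alpha=0.1):
    import numpy
    import theano
    import theano.tensor as T
    running_mean = theano.shared(numpy.asarray(0.0, dtype=theano.config.floatX))
    x = T.vector('x')
    batch_mean = x.mean()
    rm = theano.clone(running_mean, share_inputs=False)
    rm.default_update = (1 - alpha) * rm + alpha * batch_mean
    # adding 0 * rm pulls the clone (and hence its default_update) into the graph
    # without changing the result
    out = batch_mean + 0 * rm
    f = theano.function([x], out)
    f(numpy.arange(4, dtype=theano.config.floatX))
    return running_mean.get_value()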
def convolve(self, input, deterministic=False, **kwargs):
""" Binary convolution. Both inputs and weights are binary (+1 or -1)
This overrides convolve operation from Conv2DLayer implementation
"""
if(self.xnor):
# compute the binary inputs H and the scaling matrix K
input, K = binarize_conv_input(input, self.beta_filter)
# Compute the binarized filters and the scaling matrix
self.Wb, alpha = binarize_conv_filters(self.W)
if not deterministic:
old_alpha = theano.clone(self.xalpha, share_inputs=False)
old_alpha.default_update = alpha
alpha += 0*old_alpha
else:
alpha = self.xalpha
# TODO: Use XNOR ops for the convolution. As of now using Lasagne's convolution for
# functionality verification.
# approx weight tensor
#W_full_precision = self.Wb * alpha.dimshuffle(0, 'x', 'x', 'x')
Wr = self.W
self.W = self.Wb
feat_maps = super(Conv2DLayer, self).convolve(input, **kwargs)
# restore the approximate full-precision weights for gradient computation
#self.W = W_full_precision
self.W = Wr
# scale by K and alpha
# FIXME: We are actually scaling after adding the bias here; we should scale first and
# then add the bias. The superclass method adds the bias automatically, so we need a way
# around this: perhaps subtract the bias, scale by alpha and beta, and then add the bias back?
feat_maps = feat_maps * K
feat_maps = feat_maps * alpha.dimshuffle('x', 0, 'x', 'x')
else:
feat_maps = super(Conv2DLayer, self).convolve(input, **kwargs)
return feat_maps
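# Hedged sketch (not from the original source): binarize_conv_filters above follows the
# XNOR-Net recipe, in which each filter is reduced to sign(W) plus a per-filter scaling
# factor alpha equal to its mean absolute weight. A minimal symbolic version of that idea:
def _binarize_filters_sketch(W):
    import theano.tensor as T
    # W: 4D weight tensor (num_filters, channels, rows, cols)
    alpha = T.mean(T.abs_(W), axis=[1, 2, 3])   # one scaling factor per output filter
    Wb = T.switch(T.ge(W, 0), 1.0, -1.0)        # +1 / -1 weights
    return Wb, alpha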
def get_output_for(self, input, deterministic=False, **kwargs):
""" Binary dense layer dot product computation
"""
if(self.xnor):
# binarize the input
bin_input, beta = binarize_fc_input(input)
# compute weight scaling factor.
self.Wb, alpha = binarize_fc_weights(self.W)
if not deterministic:
old_alpha = theano.clone(self.xalpha, share_inputs=False)
old_alpha.default_update = alpha
alpha += 0*old_alpha
else:
alpha = self.xalpha
#W_full_precision = self.Wb * alpha.dimshuffle('x', 0)
Wr = self.W
self.W = self.Wb
fc_out = super(DenseLayer, self).get_output_for(bin_input, **kwargs)
# scale the output by alpha and beta
# FIXME: We are actually scaling after adding the bias here; we should scale first and
# then add the bias. The superclass method adds the bias automatically, so we need a way
# around this: perhaps subtract the bias, scale by alpha and beta, and then add the bias back?
fc_out = fc_out * beta.dimshuffle(0, 'x')
fc_out = fc_out * alpha.dimshuffle('x', 0)
#self.W = W_full_precision
self.W = Wr
else:
fc_out = super(DenseLayer, self).get_output_for(input, **kwargs)
return fc_out
# find the dot product
# scale the output by alpha and beta
def get_dOmega_dWrec(self, loss, x):
# Pascanu's trick
scan_node = x.owner.inputs[0].owner
assert isinstance(scan_node.op, theano.scan_module.scan_op.Scan)
npos = scan_node.op.n_seqs + 1
init_x = scan_node.inputs[npos]
g_x = theano.grad(loss, init_x)
# To force immediate derivatives
d_xt = T.tensor3('d_xt')
xt = T.tensor3('xt')
# Vanishing-gradient regularization
self.bound = 1e-20
self.lambda_Omega = 2
# Wrec
Wrec = self.params['Wrec']
# Numerator
alpha = self.alpha
num = (1 - alpha)*d_xt[1:] + T.dot(alpha*d_xt[1:], Wrec.T)*self.df_hidden(xt)
num = (num**2).sum(axis=2)
# Denominator
denom = (d_xt[1:]**2).sum(axis=2)
# Omega
bound = self.bound
Omega = (T.switch(T.ge(denom, bound), num/denom, 1) - 1)**2
nelems = T.mean(T.ge(denom, bound), axis=1)
Omega = Omega.mean(axis=1).sum()/nelems.sum()
# Gradient w.r.t Wrec
g_Wrec = theano.grad(Omega, Wrec)
g_Wrec = theano.clone(g_Wrec, replace=[(d_xt, g_x), (xt, x)])
return self.lambda_Omega * g_Wrec
def forced_replace(out, x, y):
"""
Check all internal values of the graph that compute the variable ``out``
for occurrences of values identical with ``x``. If such occurrences are
encountered then they are replaced with variable ``y``.
Parameters
----------
out : Theano Variable
x : Theano Variable
y : Theano Variable
Examples
--------
out := sigmoid(wu)*(1-sigmoid(wu))
x := sigmoid(wu)
forced_replace(out, x, y) := y*(1-y)
"""
if out is None:
return None
# ``visited`` is a set of nodes that are already known and don't need to be
# checked again, speeding up the traversal of multiply-connected graphs.
visited = set()
def local_traverse(graph, x):
if graph in visited:
return []
visited.add(graph)
if equal_computations([graph], [x]):
return [graph]
elif not graph.owner:
return []
else:
rval = []
for inp in graph.owner.inputs:
rval += local_traverse(inp, x)
return rval
to_replace = local_traverse(out, x)
return clone(out, replace=OrderedDict((v, y) for v in to_replace))
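# Hedged usage sketch (mirroring the docstring example above, not from the original source):
def _forced_replace_example():
    import theano.tensor as T
    w = T.vector('w')
    u = T.vector('u')
    y = T.vector('y')
    s = T.nnet.sigmoid(w * u)   # plays the role of x in the docstring example
    out = s * (1 - s)
    # every subgraph of `out` that computes sigmoid(w * u) is substituted by `y`,
    # giving an expression equivalent to y * (1 - y)
    return forced_replace(out, s, y)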
def test_inplace3(self):
rng = numpy.random.RandomState(utt.fetch_seed())
vx0 = asarrayX(rng.uniform())
vx1 = asarrayX(rng.uniform())
x0 = theano.shared(vx0)
x1 = theano.shared(vx1)
outputs, updates = theano.scan(lambda x, y: (x + asarrayX(1),
y + asarrayX(1)),
[],
[x0, x1],
n_steps=3)
x0 = asarrayX(numpy.zeros((3,)))
x0[0] = vx0
x0 = theano.tensor.constant(x0)
to_replace = outputs[0].owner.inputs[0].owner.inputs[1]
outputs = theano.clone(outputs,
replace=[(to_replace, x0)])
mode = theano.compile.mode.get_mode(None).including('inplace')
f9 = theano.function([],
outputs,
updates=updates,
mode=mode)
scan_node = [x for x in f9.maker.fgraph.toposort()
if isinstance(x.op, theano.scan_module.scan_op.Scan)]
assert 0 not in scan_node[0].op.destroy_map.keys()
assert 1 in scan_node[0].op.destroy_map.keys()
# Shared variable with updates
def test_clone(self):
def test(x, y, mention_y):
if mention_y:
d = 0.1 + 0 * y
else:
d = 0.1
out = theano.clone(y, replace={x: x + d})
# theano.printing.debugprint(out)
return theano.function([], out)()
x = theano.shared(numpy.asarray(0., dtype=theano.config.floatX))
utt.assert_allclose(test(x, tensor.sum((x+1)**2), mention_y=False),
1.21000003815)
utt.assert_allclose(test(x, tensor.sum((x+1)**2), mention_y=True),
1.21000003815)
def clone(**new_inputs):
new_obj = utils.copy(self)
# Reorder inputs
assert len(new_obj.inputs) == len(new_inputs.items())
pairs = [(x, new_inputs[x.name]) for x in inputs]
new_obj.inputs = new_inputs.values()
new_obj.out = theano.clone(new_obj.out, replace=pairs)
if hasattr(new_obj, 'cost'):
new_obj.cost = theano.clone(new_obj.cost, replace=pairs)
if hasattr(new_obj, 'grads'):
new_obj.grads = theano.clone(new_obj.grads, replace=pairs)
if hasattr(new_obj, 'sample'):
new_obj.sample = theano.clone(new_obj.sample, replace=pairs)
return new_obj
def clone(**new_inputs):
new_obj = utils.copy(self)
# Reorder inputs
assert len(new_obj.inputs) == len(new_inputs.items())
pairs = [(x, new_inputs[x.name]) for x in inputs]
new_obj.inputs = new_inputs.values()
new_obj.out = theano.clone(new_obj.out, replace=pairs)
if hasattr(new_obj, 'cost'):
new_obj.cost = theano.clone(new_obj.cost, replace=pairs)
if hasattr(new_obj, 'grads'):
new_obj.grads = theano.clone(new_obj.grads, replace=pairs)
if hasattr(new_obj, 'sample'):
new_obj.sample = theano.clone(new_obj.sample, replace=pairs)
return new_obj
def pseudograd(loss, params, srng=None, temperature = 1.0e-1,
learning_rate=1.0e-2, rho2=0.95):
one = T.constant(1.0)
zero = T.constant(0.0)
deltas = [ make_normal(param, srng=srng) for param in params ]
momentum = [ make_copy(param) for param in params ]
new_params = [
param + learning_rate * delta
for param, delta, m in zip(params, deltas, momentum)
]
new_loss = theano.clone(
loss, replace=dict(zip(params, new_params))
)
accepting_p = T.exp((loss - new_loss) / temperature)
u = srng.uniform(size=(), dtype=loss.dtype)
cond = T.or_(T.or_(u > accepting_p, T.isnan(new_loss)), T.isinf(new_loss))
step = T.switch(cond, zero, one)
updates = OrderedDict()
for m, delta in zip(momentum, deltas):
updates[m] = m * rho2 + (one - rho2) * delta * step
for param, m in zip(params, momentum):
updates[param] = param + learning_rate * m
return updates
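# Hedged usage sketch (assumed, not from the original source): pseudograd proposes a random
# parameter step, re-evaluates the loss on cloned parameters via theano.clone, and accepts
# or rejects the step Metropolis-style. Assuming make_normal/make_copy from this module:
def _pseudograd_usage_example():
    import numpy
    import theano
    import theano.tensor as T
    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=42)
    w = theano.shared(numpy.zeros(2, dtype=theano.config.floatX), name='w')
    x = T.vector('x')
    loss = T.sum((x - w) ** 2)
    updates = pseudograd(loss, [w], srng=srng)
    step = theano.function([x], loss, updates=updates)
    return step(numpy.ones(2, dtype=theano.config.floatX))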
def add_layer(self, new_layer):
'''Adds the given layer to the network'''
self.layers.append(new_layer)
self.output = theano.clone(new_layer.output, replace={new_layer.input: self.output})
self.size += new_layer.size
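# Hedged sketch (not from the original source): add_layer composes networks by grafting the
# new layer's graph onto the current output with theano.clone. The same idea in isolation,
# with made-up variables:
def _graph_composition_example():
    import theano
    import theano.tensor as T
    x = T.vector('x')
    hidden = T.tanh(2.0 * x)            # existing network output
    layer_in = T.vector('layer_in')     # the new layer's own input placeholder
    layer_out = T.nnet.sigmoid(layer_in + 1.0)
    # rewrite the new layer's graph so it consumes `hidden` instead of its placeholder
    composed = theano.clone(layer_out, replace={layer_in: hidden})
    return theano.function([x], composed)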
def get_output_for(self, input, deterministic=False, **kwargs):
input_mean = input.mean(self.axes)
input_std = TT.sqrt(input.var(self.axes) + self.epsilon)
# Decide whether to use the stored averages or mini-batch statistics
use_averages = kwargs.get('batch_norm_use_averages',
deterministic)
if use_averages:
mean = self.mean
std = self.std
else:
mean = input_mean
std = input_std
# Decide whether to update the stored averages
update_averages = kwargs.get('batch_norm_update_averages',
not deterministic)
if update_averages:
# Trick: To update the stored statistics, we create memory-aliased
# clones of the stored statistics:
running_mean = theano.clone(self.mean, share_inputs=False)
running_std = theano.clone(self.std, share_inputs=False)
# set a default update for them:
running_mean.default_update = ((1 - self.alpha) * running_mean +
self.alpha * input_mean)
running_std.default_update = ((1 - self.alpha) *
running_std +
self.alpha * input_std)
# and make sure they end up in the graph without participating in
# the computation (this way their default_update will be collected
# and applied, but the computation will be optimized away):
mean += 0 * running_mean
std += 0 * running_std
# prepare dimshuffle pattern inserting broadcastable axes as needed
param_axes = iter(list(range(input.ndim - len(self.axes))))
pattern = ['x' if input_axis in self.axes
else next(param_axes)
for input_axis in range(input.ndim)]
# apply dimshuffle pattern to all parameters
beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
mean = mean.dimshuffle(pattern)
std = std.dimshuffle(pattern)
# normalize
normalized = (input - mean) * (gamma * TT.inv(std)) + beta
return normalized
def get_grads(self, state_below, target, mask = None, reg = None,
scale=None, sum_over_time=True, use_noise=True,
additional_inputs=None):
"""
This function implements both the forward and backwards pass of this
layer. The reason we do this in a single function is because for the
factorized softmax layer is hard to rely on grad and get an
optimized graph. For uniformity I've implemented this method for
this layer as well (though one doesn't need to use it)
:param state_below: theano variable representing the input to the
softmax layer
:param target: theano variable representing the target for this
layer
:return: cost, dC_dstate_below, param_grads, new_properties
dC_dstate_below is a computational graph representing the
gradient of the cost wrt to state_below
param_grads is a list containing the gradients wrt to the
different parameters of the layer
new_properties is a dictionary containing additional properties
of the model; properties are theano expression that are
evaluated and reported by the model
"""
cost = self.get_cost(state_below,
target,
mask = mask,
reg = reg,
scale=scale,
sum_over_time=sum_over_time,
use_noise=use_noise,
additional_inputs=additional_inputs)
grads = TT.grad(cost, self.params)
if self.additional_gradients:
for new_grads, to_replace, properties in self.additional_gradients:
gparams, params = new_grads
prop_expr = [x[1] for x in properties]
replace = [(x[0], TT.grad(cost, x[1])) for x in to_replace]
rval = theano.clone(gparams + prop_expr,
replace=replace)
gparams = rval[:len(gparams)]
prop_expr = rval[len(gparams):]
self.properties += [(x[0], y) for x,y in zip(properties,
prop_expr)]
for gp, p in zip(gparams, params):
grads[self.params.index(p)] += gp
self.cost = cost
self.grads = grads
def Gvs_fn(*args):
w = (1 - self.model_output) * self.model_output * state_below.shape[1]
Gvs = TT.Lop(self.model_output, self.params,
TT.Rop(self.model_output, self.params, args)/w)
return Gvs
self.Gvs = Gvs_fn
return cost, grads
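# Hedged sketch (not from the original source): Gvs_fn above builds Gauss-Newton vector
# products from the R- and L-operators. The general pattern, for an output `out`, a list
# of parameters `params`, and a matching list of vectors `vs`:
def _gauss_newton_vector_product(out, params, vs):
    import theano.tensor as TT
    Jv = TT.Rop(out, params, vs)        # J v
    return TT.Lop(out, params, Jv)      # J^T (J v)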
def get_grads(self, state_below, target, mask = None, reg = None,
scale=None, sum_over_time=True, use_noise=True,
additional_inputs=None):
"""
This function implements both the forward and backwards pass of this
layer. The reason we do this in a single function is because for the
factorized softmax layer is hard to rely on grad and get an
optimized graph. For uniformity I've implemented this method for
this layer as well (though one doesn't need to use it)
:param state_below: theano variable representing the input to the
softmax layer
:param target: theano variable representing the target for this
layer
:return: cost, dC_dstate_below, param_grads, new_properties
dC_dstate_below is a computational graph representing the
gradient of the cost wrt to state_below
param_grads is a list containing the gradients wrt to the
different parameters of the layer
new_properties is a dictionary containing additional properties
of the model; properties are theano expression that are
evaluated and reported by the model
"""
cost = self.get_cost(state_below,
target,
mask = mask,
reg = reg,
scale=scale,
sum_over_time=sum_over_time,
use_noise=use_noise,
additional_inputs=additional_inputs)
grads = TT.grad(cost, self.params)
if self.additional_gradients:
for new_grads, to_replace, properties in self.additional_gradients:
gparams, params = new_grads
prop_expr = [x[1] for x in properties]
replace = [(x[0], TT.grad(cost, x[1])) for x in to_replace]
rval = theano.clone(gparams + prop_expr,
replace=replace)
gparams = rval[:len(gparams)]
prop_expr = rval[len(gparams):]
self.properties += [(x[0], y) for x,y in zip(properties,
prop_expr)]
for gp, p in zip(gparams, params):
grads[self.params.index(p)] += gp
self.cost = cost
self.grads = grads
def Gvs_fn(*args):
w = (1 - self.model_output) * self.model_output * state_below.shape[1]
Gvs = TT.Lop(self.model_output, self.params,
TT.Rop(self.model_output, self.params, args)/w)
return Gvs
self.Gvs = Gvs_fn
return cost, grads
def get_output_for(self, input, deterministic=False, **kwargs):
input_mean = input.mean(self.axes)
input_var = input.var(self.axes)
# Decide whether to use the stored averages or mini-batch statistics
use_averages = kwargs.get('batch_norm_use_averages',
deterministic)
if use_averages:
mean = self.mean
var = self.var
else:
mean = input_mean
var = input_var
# Decide whether to update the stored averages
update_averages = kwargs.get('batch_norm_update_averages',
not deterministic)
if update_averages:
# Trick: To update the stored statistics, we create memory-aliased
# clones of the stored statistics:
running_mean = theano.clone(self.mean, share_inputs=False)
running_var = theano.clone(self.var, share_inputs=False)
# set a default update for them:
running_mean.default_update = ((1 - self.alpha) * running_mean +
self.alpha * input_mean)
running_var.default_update = ((1 - self.alpha) * running_var +
self.alpha * input_var)
# and make sure they end up in the graph without participating in
# the computation (this way their default_update will be collected
# and applied, but the computation will be optimized away):
mean += 0 * running_mean
var += 0 * running_var
# prepare dimshuffle pattern inserting broadcastable axes as needed
param_axes = iter(range(self.beta.ndim))
pattern = ['x' if input_axis in self.axes
else next(param_axes)
for input_axis in range(input.ndim)]
# apply dimshuffle pattern to all parameters
beta = self.beta.dimshuffle(pattern)
gamma = self.gamma.dimshuffle(pattern)
mean = mean.dimshuffle(pattern)
std = T.sqrt(var + self.epsilon)
std = std.dimshuffle(pattern)
# normalize
# normalized = (input - mean) * (gamma / std) + beta
normalized = T.nnet.batch_normalization(input, gamma=gamma, beta=beta,
mean=mean, std=std,
mode=self.mode)
return self.nonlinearity(normalized)
def get_output_for(self, input, deterministic=False, **kwargs):
input_mean = input.mean(self.axes)
input_std = TT.sqrt(input.var(self.axes) + self.epsilon)
# Decide whether to use the stored averages or mini-batch statistics
use_averages = kwargs.get('batch_norm_use_averages',
deterministic)
if use_averages:
mean = self.mean
std = self.std
else:
mean = input_mean
std = input_std
# Decide whether to update the stored averages
update_averages = kwargs.get('batch_norm_update_averages',
not deterministic)
if update_averages:
# Trick: To update the stored statistics, we create memory-aliased
# clones of the stored statistics:
running_mean = theano.clone(self.mean, share_inputs=False)
running_std = theano.clone(self.std, share_inputs=False)
# set a default update for them:
running_mean.default_update = ((1 - self.alpha) * running_mean +
self.alpha * input_mean)
running_std.default_update = ((1 - self.alpha) *
running_std +
self.alpha * input_std)
# and make sure they end up in the graph without participating in
# the computation (this way their default_update will be collected
# and applied, but the computation will be optimized away):
mean += 0 * running_mean
std += 0 * running_std
# prepare dimshuffle pattern inserting broadcastable axes as needed
param_axes = iter(list(range(input.ndim - len(self.axes))))
pattern = ['x' if input_axis in self.axes
else next(param_axes)
for input_axis in range(input.ndim)]
# apply dimshuffle pattern to all parameters
beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
mean = mean.dimshuffle(pattern)
std = std.dimshuffle(pattern)
# normalize
normalized = (input - mean) * (gamma * TT.inv(std)) + beta
return normalized
def get_output_for(self, input, deterministic=False, **kwargs):
input_mean = input.mean(self.axes)
input_inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))
# Decide whether to use the stored averages or mini-batch statistics
use_averages = kwargs.get('batch_norm_use_averages',
deterministic)
if use_averages:
mean = self.mean
inv_std = self.inv_std
else:
mean = input_mean
inv_std = input_inv_std
# Decide whether to update the stored averages
update_averages = kwargs.get('batch_norm_update_averages',
not deterministic)
if update_averages:
# Trick: To update the stored statistics, we create memory-aliased
# clones of the stored statistics:
running_mean = theano.clone(self.mean, share_inputs=False)
running_inv_std = theano.clone(self.inv_std, share_inputs=False)
# set a default update for them:
running_mean.default_update = ((1 - self.alpha) * running_mean +
self.alpha * input_mean)
running_inv_std.default_update = ((1 - self.alpha) *
running_inv_std +
self.alpha * input_inv_std)
# and make sure they end up in the graph without participating in
# the computation (this way their default_update will be collected
# and applied, but the computation will be optimized away):
mean += 0 * running_mean
inv_std += 0 * running_inv_std
# prepare dimshuffle pattern inserting broadcastable axes as needed
param_axes = iter(range(input.ndim - len(self.axes)))
pattern = ['x' if input_axis in self.axes
else next(param_axes)
for input_axis in range(input.ndim)]
# apply dimshuffle pattern to all parameters
beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
mean = mean.dimshuffle(pattern)
inv_std = inv_std.dimshuffle(pattern)
# normalize
normalized = (input - mean) * (gamma * inv_std) + beta
return normalized
def call(self, x, mask=None):
input_dim = self.input_dim
input_type='real'
out_every_t=False
loss_function='MSE'
output_type='real'
flag_feed_forward=False
flag_use_mask=False
hidden_bias_mean=np.float32(0.0)
hidden_bias_init='zero'
Wimpl=self.unitary_impl
if ('full' in Wimpl):
Wimpl='full'
elif (Wimpl=='ASB2016'):
Wimpl='adhoc'
#hidden_bias_init='rand'
elif (Wimpl=='ASB2016_fast'):
Wimpl='adhoc_fast'
n_layers=1
seed=1234
x_spec=K.permute_dimensions(x,(1,0,2))
inputs, parameters, costs = models.complex_RNN(input_dim, self.hidden_dim, self.output_dim, input_type=input_type,out_every_t=out_every_t, loss_function=loss_function,output_type=output_type,flag_feed_forward=flag_feed_forward,flag_return_lin_output=True,x_spec=x_spec,flag_use_mask=flag_use_mask,hidden_bias_mean=hidden_bias_mean,Wimpl=Wimpl,flag_return_hidden_states=True,n_layers=n_layers,seed=seed,hidden_bias_init=hidden_bias_init)
lin_output=costs[2]
#self.hidden_states=costs[3]
if (self.unitary_impl=='full'):
# just use lrng for learning rate on this parameter
parameters[-1].name+='full_natGrad'
elif (self.unitary_impl=='full_natGrad'):
# use fixed lrng with natural gradient update
parameters[-1].name+='_natGrad_unitaryAug'
elif (self.unitary_impl=='full_natGradRMS'):
# use fixed lrng with natural gradient update and RMSprop-style gradient adjustment
parameters[-1].name+='_natGradRMS_unitaryAug'
elif (self.unitary_impl=='full_enforceComplex'):
# swap out 2Nx2N augmented unitary matrix for Nx2N, which ensures the
# complex number constraint is satisfied
parameters[-1].name+='full_natGrad'
Waug=parameters[-1]
WReIm=K.variable(value=Waug[:Waug.shape[1]/2,:].eval(),name=Waug.name)
WaugFull=K.concatenate( (WReIm, K.concatenate((-WReIm[:,WReIm.shape[1]/2:],WReIm[:,:WReIm.shape[1]/2]),axis=1)),axis=0 )
lin_output_new = theano.clone(lin_output,replace={parameters[-1]:WaugFull})
lin_output = lin_output_new
parameters[-1]=WReIm
self.trainable_weights = parameters
return lin_output