import numpy as np
import theano as th
import theano.tensor as T

def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    # Adam optimizer (Kingma & Ba): per-parameter first/second moment
    # estimates with bias correction.
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for p, g in zip(params, grads):
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v_t = mom1*v + (1. - mom1)*g          # first moment estimate
        mg_t = mom2*mg + (1. - mom2)*T.square(g)  # second moment estimate
        v_hat = v_t / (1. - mom1 ** t)        # bias correction
        mg_hat = mg_t / (1. - mom2 ** t)
        g_t = v_hat / T.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append((v, v_t))
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    # the step counter is updated once, outside the per-parameter loop
    updates.append((t, t+1))
    return updates
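A minimal usage sketch for the updates above; the toy quadratic cost, the learning rate, and the name `train_fn` are illustrative only, not from the original source:

w = th.shared(np.cast[th.config.floatX](5.), name='w')
cost = T.square(w - 2.)
train_fn = th.function([], cost, updates=adam_updates([w], cost, lr=0.1))
for _ in range(100):
    train_fn()
print(w.get_value())  # should approach 2.0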
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        # use the tracked running statistics at test time
        norm_features = (input - self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) \
            / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
    else:
        batch_mean = T.mean(input, axis=self.axes_to_sum).flatten()
        centered_input = input - batch_mean.dimshuffle(*self.dimshuffle_args)
        batch_var = T.mean(T.square(centered_input), axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
        # BN updates: exponential moving averages of the batch statistics;
        # the variance term carries an n/(n-1) correction
        new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
        new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1), th.config.floatX)*batch_var
        self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
    if hasattr(self, 'g'):
        activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
    else:
        activation = norm_features
    if hasattr(self, 'b'):
        activation += self.b.dimshuffle(*self.dimshuffle_args)
    return self.nonlinearity(activation)
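Since the layer stores its moving-average updates on `self.bn_updates` rather than returning them, the training function has to collect them explicitly. A minimal sketch, assuming Lasagne-style layers (`nn.layers`), a hypothetical output layer `l_out`, and an existing list of optimizer updates:

bn_updates = []
for layer in nn.layers.get_all_layers(l_out):
    if hasattr(layer, 'bn_updates'):
        bn_updates.extend(layer.bn_updates)
# train_fn = th.function(inputs, loss, updates=param_updates + bn_updates)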
def maxout2(x):
    shape = x.shape
    if x.ndim == 1:
        shape1 = T.cast(shape[0] / 2, 'int64')
        shape2 = T.cast(2, 'int64')
        x = x.reshape([shape1, shape2])
        x = x.max(1)
    elif x.ndim == 2:
        shape1 = T.cast(shape[1] / 2, 'int64')
        shape2 = T.cast(2, 'int64')
        x = x.reshape([shape[0], shape1, shape2])
        x = x.max(2)
    elif x.ndim == 3:
        shape1 = T.cast(shape[2] / 2, 'int64')
        shape2 = T.cast(2, 'int64')
        x = x.reshape([shape[0], shape[1], shape1, shape2])
        x = x.max(3)
    return x
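An illustrative check of `maxout2` on a 2-D input (assuming the numpy/theano imports from the top of this page): the feature dimension is halved by taking pairwise maxima.

xv = T.matrix('xv')
f = th.function([xv], maxout2(xv))
out = f(np.arange(8, dtype=th.config.floatX).reshape(2, 4))
print(out.shape)  # (2, 2): each adjacent pair of features collapsed to its max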
def dice_loss(y_pred, y_true, void_class, class_for_dice=1):
    '''
    Dice loss -- valid for binary classes only.
    y_pred is a softmax output
    y_true is one hot
    '''
    smooth = 1
    y_true_f = T.flatten(y_true[:, class_for_dice, :, :])
    y_true_f = T.cast(y_true_f, 'int32')
    y_pred_f = T.flatten(y_pred[:, class_for_dice, :, :])
    # remove void classes from dice
    if len(void_class):
        for i in range(len(void_class)):
            # get idx of non-void classes and remove void classes
            # from y_true and y_pred
            idxs = T.neq(y_true_f, void_class[i]).nonzero()
            y_pred_f = y_pred_f[idxs]
            y_true_f = y_true_f[idxs]
    intersection = T.sum(y_true_f * y_pred_f)
    return -(2.*intersection+smooth) / (T.sum(y_true_f)+T.sum(y_pred_f)+smooth)
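A hypothetical invocation on (batch, class, H, W) tensors, assuming the usual `numpy as np`, `theano`, and `theano.tensor as T` imports; the random inputs are illustrative only:

y_pred_sym = T.tensor4('y_pred')
y_true_sym = T.tensor4('y_true')
f = theano.function([y_pred_sym, y_true_sym],
                    dice_loss(y_pred_sym, y_true_sym, void_class=[], class_for_dice=1))
p = np.random.rand(2, 2, 8, 8).astype('float32')
t = (p > 0.5).astype('float32')
print(f(p, t))  # negative; closer to -1 means better overlap on class 1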
def build_objective(model, deterministic=False, epsilon=1e-12):
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(nn.layers.get_output(model.l_target), 'int32')
    enable_targets = nn.layers.get_output(model.l_enable_target)
    predictions = T.clip(predictions, epsilon, 1.-epsilon)
    #is_nodule_ground_truth = T.cast(targets[:,0], 'float32')
    sum_of_objectives = 0
    unit_ptr = 0
    for obj_idx, obj_name in enumerate(order_objectives):
        n_classes = len(property_bin_borders[obj_name])
        v_obj = objective(obj_idx, (unit_ptr, unit_ptr+n_classes), predictions, targets)
        if deterministic:
            d_objectives_deterministic[obj_name] = T.mean(v_obj)
        else:
            d_objectives[obj_name] = T.mean(v_obj)
        #sum_of_objectives += T.mean(enable_targets[obj_idx] * v_obj)
        sum_of_objectives += T.mean(enable_targets[:,obj_idx] * v_obj)
        unit_ptr = unit_ptr+n_classes
    #print 'for debug purposes: unit_ptr', unit_ptr
    return sum_of_objectives
def build_objective(model, deterministic=False, epsilon=1e-12):
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(nn.layers.get_output(model.l_target), 'int32')
    enable_targets = nn.layers.get_output(model.l_enable_target)
    predictions = T.clip(predictions, epsilon, 1.-epsilon)
    sum_of_objectives = 0
    unit_ptr = 0
    for obj_idx, obj_name in enumerate(order_objectives):
        n_classes = len(property_bin_borders[obj_name])
        v_obj = objective(obj_idx, (unit_ptr, unit_ptr+n_classes), predictions, targets)
        # take the mean of the objectives where it matters (enabled targets)
        obj_scalar = T.sum(enable_targets[:,obj_idx] * v_obj) / (0.00001 + T.sum(enable_targets[:,obj_idx]))
        if deterministic:
            d_objectives_deterministic[obj_name] = obj_scalar
        else:
            d_objectives[obj_name] = obj_scalar
        sum_of_objectives += T.mean(v_obj)
        unit_ptr = unit_ptr+n_classes
    return sum_of_objectives
def build_objective(model, deterministic=False, epsilon=1e-12):
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(nn.layers.get_output(model.l_target), 'int32')
    enable_targets = nn.layers.get_output(model.l_enable_target)
    predictions = T.clip(predictions, epsilon, 1.-epsilon)
    sum_of_objectives = 0
    unit_ptr = 0
    for obj_idx, obj_name in enumerate(order_objectives):
        n_classes = len(property_bin_borders[obj_name])
        v_obj = objective(obj_idx, (unit_ptr, unit_ptr+n_classes), predictions, targets)
        # take the mean of the objectives where it matters (enabled targets)
        obj_scalar = T.sum(enable_targets[:,obj_idx] * v_obj) / (0.00001 + T.sum(enable_targets[:,obj_idx]))
        if deterministic:
            d_objectives_deterministic[obj_name] = obj_scalar
        else:
            d_objectives[obj_name] = obj_scalar
        sum_of_objectives += obj_scalar
        unit_ptr = unit_ptr+n_classes
    return sum_of_objectives
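The denominator guard in `obj_scalar` above is just a masked mean. The same arithmetic sketched in plain numpy, with made-up values for illustration:

import numpy as np
v_obj = np.array([0.3, 0.7, 0.1, 0.9])   # per-sample objective values
enabled = np.array([1., 1., 0., 1.])     # enable_targets[:, obj_idx]
obj_scalar = np.sum(enabled * v_obj) / (0.00001 + np.sum(enabled))
print(obj_scalar)  # ~0.633: the mean over the three enabled samples only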
def build_objective(model, deterministic=False, epsilon=1e-12):
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(nn.layers.get_output(model.l_target), 'int32')
    predictions = T.clip(predictions, epsilon, 1.-epsilon)
    #is_nodule_ground_truth = T.cast(targets[:,0], 'float32')
    sum_of_objectives = 0
    unit_ptr = 0
    for obj_idx, obj_name in enumerate(order_objectives):
        n_classes = len(property_bin_borders[obj_name])
        if deterministic:
            d_objectives_deterministic[obj_name] = objective(obj_idx, (unit_ptr, unit_ptr+n_classes), predictions, targets)
        else:
            d_objectives[obj_name] = objective(obj_idx, (unit_ptr, unit_ptr+n_classes), predictions, targets)
        sum_of_objectives += objective(obj_idx, (unit_ptr, unit_ptr+n_classes), predictions, targets)
        unit_ptr = unit_ptr+n_classes
    #print 'for debug purposes: unit_ptr', unit_ptr
    return sum_of_objectives
def shared_dataset(self, data_xy, train=False, borrow=True):
    """Load the data into Theano shared variables.
    The data is copied once into shared (GPU) memory.
    """
    data_x, data_y = data_xy
    if train:
        dim_output = 10  # case of MNIST
        data_y = np.int32(self.labels(data_y, dim_output))
    shared_x = theano.shared(
        np.asarray(data_x, dtype=theano.config.floatX),
        borrow=borrow)
    shared_y = theano.shared(
        np.asarray(data_y, dtype=theano.config.floatX),
        borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')
def shared_dataset_xy(self, data_xy, nlabels=10, train=False, task="cls", borrow=True):
    """Load the data into Theano shared variables.
    The data is copied once into shared (GPU) memory.
    """
    data_x, data_y = data_xy
    if (train) and (task == 'cls'):
        data_y = np.int32(self.labels(data_y, nlabels))
    shared_x = theano.shared(
        np.asarray(data_x, dtype=theano.config.floatX),
        borrow=borrow)
    shared_y = theano.shared(
        np.asarray(data_y, dtype=theano.config.floatX),
        borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')
def shared_dataset_x(data_x, borrow=True):
    """ Function that loads the dataset into shared variables
    The reason we store our dataset in shared variables is to allow
    Theano to copy it into the GPU memory (when code is run on a GPU).
    Since copying data into the GPU is slow, copying a minibatch every
    time one is needed (the default behaviour if the data is not in a
    shared variable) would lead to a large decrease in performance.
    """
    shared_x = theano.shared(numpy.asarray(data_x,
                                           dtype=theano.config.floatX),
                             borrow=borrow)
    # When storing data on the GPU it has to be stored as floats; labels
    # would therefore be stored as ``floatX`` and cast back to int for use
    # as indices. This function handles only the inputs, so no cast is
    # needed here.
    return shared_x
def shared_dataset(data_x, data_y, borrow=True):
    """ Function that loads the dataset into shared variables
    The reason we store our dataset in shared variables is to allow
    Theano to copy it into the GPU memory (when code is run on a GPU).
    Since copying data into the GPU is slow, copying a minibatch every
    time one is needed (the default behaviour if the data is not in a
    shared variable) would lead to a large decrease in performance.
    """
    shared_x = theano.shared(numpy.asarray(data_x,
                                           dtype=theano.config.floatX),
                             borrow=borrow)
    shared_y = theano.shared(numpy.asarray(data_y,
                                           dtype=theano.config.floatX),
                             borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')
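A sketch of how the returned shared variables are typically consumed for minibatch indexing; the random data and `batch_size` are illustrative, not from the original source:

import numpy as np
import theano
import theano.tensor as T

data_x = np.random.randn(256, 10)
data_y = np.random.randint(0, 2, size=256)
train_x, train_y = shared_dataset(data_x, data_y)

index = T.lscalar('index')
batch_size = 32
get_batch = theano.function(
    [index],
    [train_x[index*batch_size:(index+1)*batch_size],
     train_y[index*batch_size:(index+1)*batch_size]])
xb, yb = get_batch(0)  # yb comes back as int32, ready to use as indices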
def ctc_update_log_p(skip_idxs, zeros, active, log_p_curr, log_p_prev):
    active_skip_idxs = skip_idxs[(skip_idxs < active).nonzero()]
    active_next = T.cast(T.minimum(
        T.maximum(
            active + 1,
            T.max(T.concatenate([active_skip_idxs, [-1]])) + 2 + 1
        ), log_p_curr.shape[0]), 'int32')
    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)
    _p_prev = zeros[:active_next]
    # copy over
    _p_prev = T.set_subtensor(_p_prev[:active], p_prev)
    # previous transitions
    _p_prev = T.inc_subtensor(_p_prev[1:], _p_prev[:-1])
    # skip transitions
    _p_prev = T.inc_subtensor(_p_prev[active_skip_idxs + 2], p_prev[active_skip_idxs])
    updated_log_p_prev = T.log(_p_prev) + common_factor
    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev
    )
    return active_next, log_p_next
from collections import OrderedDict
import theano.tensor as TT

def normalize_updates(old_mean, old_std, new_mean, new_std, old_W, old_b):
    """
    Compute the updates for normalizing the last (linear) layer of a neural
    network.
    """
    # Make the necessary transformation so that
    # (W_old * h + b_old) * std_old + mean_old ==
    #     (W_new * h + b_new) * std_new + mean_new
    new_W = old_W * old_std[0] / (new_std[0] + 1e-6)
    new_b = (old_b * old_std[0] + old_mean[0] - new_mean[0]) / (new_std[0] + 1e-6)
    return OrderedDict([
        (old_W, TT.cast(new_W, old_W.dtype)),
        (old_b, TT.cast(new_b, old_b.dtype)),
        (old_mean, new_mean),
        (old_std, new_std),
    ])
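A quick numeric check of the invariance stated in the comment, sketched in plain numpy with made-up scalar values:

import numpy as np
h, W_old, b_old = 0.7, 1.5, -0.2
std_old, mean_old = 2.0, 0.5
std_new, mean_new = 3.0, -1.0
W_new = W_old * std_old / (std_new + 1e-6)
b_new = (b_old * std_old + mean_old - mean_new) / (std_new + 1e-6)
lhs = (W_old*h + b_old)*std_old + mean_old
rhs = (W_new*h + b_new)*std_new + mean_new
print(np.isclose(lhs, rhs, atol=1e-5))  # True: the layer's output is preserved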
def get_recon_loss(self, idxs, sent_output):
    len_sent, len_doc_batch, n_d = sent_output.shape
    recon_layer = self.recon_layer
    padding_id = self.padding_id
    dropout = self.dropout
    # (len(sent)*len(doc)*batch)*n_e
    input_flat = idxs.ravel()
    true_recon = self.embedding_layer.recon_forward(input_flat)
    sent_output = apply_dropout(sent_output, dropout)
    pred_recon = recon_layer.forward(sent_output.reshape((len_sent*len_doc_batch, n_d)))
    # (len(sent)*len(doc)*batch)
    mask = T.cast(T.neq(input_flat, padding_id), theano.config.floatX)
    n = T.sum(mask)
    loss = T.sum((true_recon - pred_recon) ** 2, axis=1) * mask
    loss = T.sum(loss) / n
    return loss
def create_updates(self, input):
    if self.mode == 0:
        # dense input: statistics over the batch axis only
        now_mean = T.mean(input, axis=0)
        now_var = T.var(input, axis=0)
        batch = T.cast(input.shape[0], theano.config.floatX)
    else:
        # convolutional input: statistics over batch and spatial axes
        now_mean = T.mean(input, axis=(0,2,3))
        now_var = T.var(input, axis=(0,2,3))
        batch = T.cast(input.shape[0]*input.shape[2]*input.shape[3], theano.config.floatX)
    if self.updates is None:
        new_mean = self.momentum * self.mean + (1.0-self.momentum) * now_mean
        new_var = self.momentum * self.var + (1.0-self.momentum) * ((batch+1.0)/batch*now_var)
    else:
        new_mean = self.momentum * self.updates[0][1] + (1.0-self.momentum) * now_mean
        new_var = self.momentum * self.updates[1][1] + (1.0-self.momentum) * ((batch+1.0)/batch*now_var)
    self.updates = [(self.mean, new_mean), (self.var, new_var)]
def __init__(self, rng, input, dropout_rate=0.5):
    """
    input: output of last layer
    """
    self.input = input
    self.dropout_rate = dropout_rate
    srng = T.shared_randomstreams.RandomStreams(rng.randint(999999))
    if self.dropout_rate > 0:
        # p=1-p because 1's indicate keep and p is prob of dropping
        mask = srng.binomial(n=1, p=1-self.dropout_rate, size=self.input.shape)
        # The cast is important because
        # int * float32 = float64 which pulls things off the gpu
        self.output = self.input * T.cast(mask, theano.config.floatX)
    else:
        self.output = input
def load_data(dataset):
    """Load the dataset and place it in shared variables so Theano can
    copy it to the GPU."""
    f = gzip.open(dataset, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    def shared_dataset(data_xy, borrow=True):
        data_x, data_y = data_xy
        # data must be stored as floatX to live on the GPU
        shared_x = theano.shared(np.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
        shared_y = theano.shared(np.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
        # labels are used as indices, so cast them back to int
        return shared_x, T.cast(shared_y, 'int32')

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)
    rval = [(train_set_x, train_set_y),
            (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval
def dual_copy_rounding(W, integer_bits=0, fractional_bits=1):
    """
    Rounding as described in "Robustness of spiking Deep Belief Networks
    to noise and reduced bit precision of neuro-inspired hardware platforms"
    by Stromatias et al. See http://dx.doi.org/10.3389/fnins.2015.00222
    :param W: weights
    :param integer_bits: number of bits to represent the integer part
    :param fractional_bits: number of bits to represent the fractional part
    :return: quantized weights
    """
    #print "Dual copy rounding!"
    power = T.cast(2.**fractional_bits, theano.config.floatX)  # float!
    max_val = T.cast((2.**(fractional_bits+integer_bits))-1, theano.config.floatX)
    value = W*power
    value = GradPreserveRoundTensor(value)  # rounding
    value = T.clip(value, -max_val, max_val)  # saturation arithmetic
    Wb = value/power
    return Wb
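The same scale-round-clip-rescale arithmetic in plain numpy, for illustration only; `GradPreserveRoundTensor` is a project-specific op (rounds on the forward pass while passing gradients through), so ordinary `np.round` stands in here:

import numpy as np

def dual_copy_rounding_np(W, integer_bits=0, fractional_bits=1):
    power = 2.0**fractional_bits
    max_val = 2.0**(fractional_bits + integer_bits) - 1
    return np.clip(np.round(W * power), -max_val, max_val) / power

print(dual_copy_rounding_np(np.array([0.3, -0.8, 1.7]), 1, 2))
# [ 0.25 -0.75  1.75] with 1 integer bit and 2 fractional bits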
def __init__(self, input, n_in, n_out, prob_drop=0.5, verbose=False):
    self.verbose = verbose
    self.prob_drop = prob_drop
    self.prob_keep = 1.0 - prob_drop
    self.flag_on = theano.shared(np.cast[theano.config.floatX](1.0))
    self.flag_off = 1.0 - self.flag_on
    seed_this = DropoutLayer.seed_common.randint(0, 2**31-1)
    mask_rng = theano.tensor.shared_randomstreams.RandomStreams(seed_this)
    self.mask = mask_rng.binomial(n=1, p=self.prob_keep, size=input.shape)
    self.output = \
        self.flag_on * T.cast(self.mask, theano.config.floatX) * input + \
        self.flag_off * self.prob_keep * input
    DropoutLayer.layers.append(self)
    if self.verbose:
        print 'dropout layer with P_drop: ' + str(self.prob_drop)
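A hypothetical train/test switch built on the `flag_on` shared variable and the `DropoutLayer.layers` registry above; with `flag_on` at 0, the layer falls back to scaling by the expected keep probability:

def set_dropout_off():
    # inference mode: deterministic scaling by prob_keep
    for layer in DropoutLayer.layers:
        layer.flag_on.set_value(np.cast[theano.config.floatX](0.0))

def set_dropout_on():
    # training mode: stochastic binary mask
    for layer in DropoutLayer.layers:
        layer.flag_on.set_value(np.cast[theano.config.floatX](1.0))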
def load_data_shared(filename="mnist.pkl.gz"):
    f = gzip.open(filename, 'rb')
    training_data, validation_data, test_data = pickle.load(f, encoding="latin1")
    f.close()

    def shared(data):
        """Place the data into shared variables. This allows Theano to copy
        the data to the GPU, if one is available.
        """
        shared_x = theano.shared(
            np.asarray(data[0], dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(
            np.asarray(data[1], dtype=theano.config.floatX), borrow=True)
        return shared_x, T.cast(shared_y, "int32")

    return [shared(training_data), shared(validation_data), shared(test_data)]
#### Main class used to construct and train networks
def to_measure(self, q):
    # Compute the vertices of each triangle
    a = q[self.connectivity[:, 0]]
    b = q[self.connectivity[:, 1]]
    c = q[self.connectivity[:, 2]]
    # A surface is represented as a sum of Diracs, one per triangle
    x = .33333333 * (a + b + c)  # barycenter
    # Cross product (b-a) x (c-a), expressed as a dot with a constant matrix
    ab = (b - a).dimshuffle(0, 1, 'x')
    ac = (c - a).dimshuffle(0, 'x', 1)
    t = (ab * ac).reshape((self.connectivity.shape[0], 9))
    cp = t.dot(np.array([
        [0., 0., 0., 0., 0., 1., 0., -1., 0.],
        [0., 0., -1., 0., 0., 0., 1., 0., 0.],
        [0., 1., 0., -1., 0., 0., 0., 0., 0.]
    ]).T)
    mu = .5 * T.sqrt((cp**2).sum(1))  # triangle area
    mu = T.cast(mu, dtype=config.floatX)
    return (x, mu)
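A small numpy sanity check (not from the original source) showing that the 3x9 constant matrix above maps the flattened outer product of (b-a) and (c-a) to their cross product:

import numpy as np
u = np.array([1., 2., 3.])
v = np.array([4., 5., 6.])
M = np.array([
    [0., 0., 0., 0., 0., 1., 0., -1., 0.],
    [0., 0., -1., 0., 0., 0., 1., 0., 0.],
    [0., 1., 0., -1., 0., 0., 0., 0., 0.]
])
print(np.outer(u, v).reshape(9).dot(M.T))  # [-3.  6. -3.]
print(np.cross(u, v))                      # [-3.  6. -3.]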
def to_varifold(self, q):
    # Compute the vertices of each triangle
    a = q[self.connectivity[:, 0]]
    b = q[self.connectivity[:, 1]]
    c = q[self.connectivity[:, 2]]
    # A surface is represented as a sum of Diracs, one per triangle
    x = .33333333 * (a + b + c)  # barycenter
    # Cross product (b-a) x (c-a), expressed as a dot with a constant matrix
    ab = (b - a).dimshuffle(0, 1, 'x')
    ac = (c - a).dimshuffle(0, 'x', 1)
    t = (ab * ac).reshape((self.connectivity.shape[0], 9))
    cp = t.dot(np.array([
        [0., 0., 0., 0., 0., 1., 0., -1., 0.],
        [0., 0., -1., 0., 0., 0., 1., 0., 0.],
        [0., 1., 0., -1., 0., 0., 0., 0., 0.]
    ]).T)
    mu = T.sqrt((cp**2).sum(1))      # cross-product norm
    u = cp / mu.dimshuffle(0, 'x')   # unit normal direction
    mu = T.cast(.5*mu, dtype=config.floatX)  # triangle area
    u = T.cast(u, dtype=config.floatX)
    return (x, mu, u)
def load_data(dataset):
    if dataset.split('.')[-1] == 'gz':
        f = gzip.open(dataset, 'r')
    else:
        f = open(dataset, 'r')
    train_set, valid_set, test_set = pkl.load(f)
    f.close()

    def shared_dataset(data_xy, borrow=True):
        data_x, data_y = data_xy
        shared_x = theano.shared(
            np.asarray(data_x, dtype=theano.config.floatX),
            borrow=borrow)
        shared_y = theano.shared(
            np.asarray(data_y, dtype=theano.config.floatX),
            borrow=borrow)
        return shared_x, T.cast(shared_y, 'int32')

    train_set_x, train_set_y = shared_dataset(train_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    test_set_x, test_set_y = shared_dataset(test_set)
    return [(train_set_x, train_set_y),
            (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]