def switch(condition, then_tensor, else_tensor):
"""
    Keras' TensorFlow backend implements switch with tf.switch, which accepts only scalar
    conditions. It should use tf.select instead, which selects elementwise but requires the
    condition to match the shape of the then/else tensors (handled below).
"""
if K.backend() == 'tensorflow':
import tensorflow as tf
condition_shape = condition.get_shape()
input_shape = then_tensor.get_shape()
if condition_shape[-1] != input_shape[-1] and condition_shape[-1] == 1:
# This means the last dim is an embedding dim. Keras does not mask this dimension. But tf wants
# the condition and the then and else tensors to be the same shape.
condition = K.dot(tf.cast(condition, tf.float32), tf.ones((1, input_shape[-1])))
return tf.select(tf.cast(condition, dtype=tf.bool), then_tensor, else_tensor)
else:
import theano.tensor as T
return T.switch(condition, then_tensor, else_tensor)
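# A minimal usage sketch for switch(); the shapes, values and helper name below are
# illustrative assumptions, not taken from the original code base.
def _switch_usage_example():
    import numpy as np
    condition = K.variable(np.array([[1., 0., 1.]]))       # (1, 3) elementwise condition
    then_tensor = K.variable(np.full((1, 3), 5.))
    else_tensor = K.variable(np.zeros((1, 3)))
    # A condition with a trailing dim of 1 (e.g. a Keras mask) would be expanded
    # to the embedding dim by the helper above before selecting.
    return K.eval(switch(condition, then_tensor, else_tensor))  # -> [[5., 0., 5.]]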
def dropout_layer(state_before, use_noise, trng):
"""
:todo:
- Fix according to _param
- Test!
From Cho's code here:
https://github.com/nyu-dl/dl4mt-tutorial/blob/master/session2/nmt.py#L45
"""
proj = tensor.switch(
use_noise,
# for training
state_before * trng.binomial(state_before.shape, p=0.5, n=1,
dtype=state_before.dtype),
# for validation/sampling
state_before * 0.5)
return proj
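# Hedged usage sketch for dropout_layer(): `use_noise` is conventionally a shared
# scalar flipped between training (1.) and evaluation (0.); the names below are
# illustrative assumptions.
def _dropout_layer_example():
    import numpy
    import theano
    from theano import tensor
    from theano.sandbox.rng_mrg import MRG_RandomStreams
    trng = MRG_RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(1.))   # 1. = training, 0. = validation/sampling
    x = tensor.matrix('x', dtype='float32')
    f = theano.function([x], dropout_layer(x, use_noise, trng))
    return f(numpy.ones((3, 4), dtype='float32'))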
def build_encoder_bi(tparams, options):
"""
build bidirectional encoder, given pre-computed word embeddings
"""
# word embedding (source)
embedding = tensor.tensor3('embedding', dtype='float32')
embeddingr = embedding[::-1]
x_mask = tensor.matrix('x_mask', dtype='float32')
xr_mask = x_mask[::-1]
# encoder
proj = get_layer(options['encoder'])[1](tparams, embedding, options,
prefix='encoder',
mask=x_mask)
projr = get_layer(options['encoder'])[1](tparams, embeddingr, options,
prefix='encoder_r',
mask=xr_mask)
ctx = tensor.concatenate([proj[0][-1], projr[0][-1]], axis=1)
return embedding, x_mask, ctx
# some utilities
def _step(self, x_, m_, h_, c_):
preact = tensor.dot(h_, self.U) + x_
i = tensor.nnet.sigmoid(_slice(preact, 0, self.hidden_dim))
f = tensor.nnet.sigmoid(_slice(preact, 1, self.hidden_dim) + self.forget_bias)
o = tensor.nnet.sigmoid(_slice(preact, 2, self.hidden_dim))
j = tensor.tanh(_slice(preact, 3, self.hidden_dim))
c = f * c_ + i * j
c = m_[:, None] * c + (1. - m_)[:, None] * c_
h = o * tensor.tanh(c)
        if self.recurrent_dropout_layer is not None:
h = self.recurrent_dropout_layer.connect(h, self.is_train)
h = m_[:, None] * h + (1. - m_)[:, None] * h_
return h, c
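# `_slice` is used above but not shown here; a conventional definition (a sketch
# matching the standard Theano LSTM tutorial, assumed to be what this code relies on):
def _slice(_x, n, dim):
    # Pick the n-th chunk of width `dim` from the concatenated gate pre-activations.
    if _x.ndim == 3:
        return _x[:, :, n * dim:(n + 1) * dim]
    return _x[:, n * dim:(n + 1) * dim]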
def connect(self, inputs, mask, is_train):
""" is_train: A boolean tensor.
"""
max_length = inputs.shape[0]
batch_size = inputs.shape[1]
outputs_info = [tensor.alloc(numpy_floatX(0.), batch_size, self.hidden_dim),
tensor.alloc(numpy_floatX(0.), batch_size, self.hidden_dim)]
# Dropout mask sharing for variational dropout.
self.is_train = is_train
        if self.recurrent_dropout_layer is not None:
self.recurrent_dropout_layer.generate_mask([batch_size, self.hidden_dim], is_train)
inputs = tensor.dot(inputs, self.W) + self.b
rval, _ = theano.scan(self._step, # Scan function
sequences=[inputs, mask], # Input sequence
outputs_info=outputs_info,
name=_p(self.prefix, '_layers'),
n_steps=max_length) # scan steps
return rval[0]
def _step(self, x_, px_, m_, h_, c_):
preact = tensor.dot(h_, self.U) + px_
        # i: input gate. f: forget gate. o: output gate. t: transform (highway) gate.
        # j: input with non-linearity; x_ is the input without non-linearity,
        # carried through the highway path.
i = tensor.nnet.sigmoid(_slice(preact, 0, self.hidden_dim))
f = tensor.nnet.sigmoid(_slice(preact, 1, self.hidden_dim) + self.forget_bias)
o = tensor.nnet.sigmoid(_slice(preact, 2, self.hidden_dim))
t = tensor.nnet.sigmoid(_slice(preact, 3, self.hidden_dim))
j = tensor.tanh(_slice(preact, 4, self.hidden_dim))
c = f * c_ + i * j
c = m_[:, None] * c + (1. - m_)[:, None] * c_
h = t * o * tensor.tanh(c) + (1. - t) * x_
        if self.recurrent_dropout_layer is not None:
h = self.recurrent_dropout_layer.connect(h, self.is_train)
h = m_[:, None] * h + (1. - m_)[:, None] * h_
return h, c
def connect(self, inputs, mask, is_train):
max_length = inputs.shape[0]
batch_size = inputs.shape[1]
outputs_info = [tensor.alloc(numpy_floatX(0.), batch_size, self.hidden_dim),
tensor.alloc(numpy_floatX(0.), batch_size, self.hidden_dim)]
# Dropout layers
self.is_train = is_train
        if self.recurrent_dropout_layer is not None:
self.recurrent_dropout_layer.generate_mask([batch_size, self.hidden_dim], is_train)
proj_inputs = tensor.dot(inputs, self.W) + self.b
rval, _ = theano.scan(self._step, # Scan function
sequences=[inputs, proj_inputs, mask], # Input sequence
outputs_info=outputs_info,
name=_p(self.prefix, '_layers'),
n_steps=max_length) # scan steps
return rval[0]
def _step(self, x_, px_, m_, h_, c_):
preact = tensor.dot(h_, self.U) + px_
i = tensor.nnet.sigmoid(_slice(preact, 0, self.hidden_dim))
f = tensor.nnet.sigmoid(_slice(preact, 1, self.hidden_dim) + self.forget_bias)
o = tensor.nnet.sigmoid(_slice(preact, 2, self.hidden_dim))
j = tensor.tanh(_slice(preact, 3, self.hidden_dim))
c = f * c_ + i * j
c = m_[:, None] * c + (1. - m_)[:, None] * c_
# Residual connection.
h = o * tensor.tanh(c) + x_
        if self.recurrent_dropout_layer is not None:
h = self.recurrent_dropout_layer.connect(h, self.is_train)
h = m_[:, None] * h + (1. - m_)[:, None] * h_
return h, c
def adadelta(parameters, gradients, rho=0.95, eps=1e-6):
""" Reference: ADADELTA: An Adaptive Learning Rate Method,
Zeiler 2012. https://arxiv.org/abs/1212.5701
Adapted from the Adadelta implementation from Tensorflow.
"""
accum = [theano.shared(numpy.zeros(p.get_value().shape, floatX)) for p in parameters]
accum_updates = [theano.shared(numpy.zeros(p.get_value().shape, floatX)) for p in parameters]
new_accum = [rho * g0 + (1.0 - rho) * (g**2) for g0, g in izip(accum, gradients)]
updates = [tensor.sqrt(d0 + eps) / tensor.sqrt(g0 + eps) * g for d0, g0, g in izip(accum_updates,
new_accum,
gradients)]
new_accum_updates = [rho * d0 + (1.0 - rho) * (d**2) for d0, d in izip(accum_updates,
updates)]
accum_ = zip(accum, new_accum)
accum_updates_ = zip(accum_updates, new_accum_updates)
    parameters_ = [(p, p - d) for p, d in izip(parameters, updates)]
return accum_ + accum_updates_ + parameters_
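# Hedged usage sketch: the (shared, update) pairs returned by adadelta() plug
# directly into theano.function; `cost`, `params` and `inputs` are illustrative
# placeholders, not names from the original code.
def _adadelta_example(cost, params, inputs):
    import theano
    from theano import tensor
    grads = tensor.grad(cost, params)
    return theano.function(inputs, cost, updates=adadelta(params, grads))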
def shared_dropout_layer(shape, use_noise, trng, value, scaled=True):
#re-scale dropout at training time, so we don't need to at test time
if scaled:
proj = tensor.switch(
use_noise,
trng.binomial(shape, p=value, n=1,
dtype='float32')/value,
theano.shared(numpy.float32(1.)))
else:
proj = tensor.switch(
use_noise,
trng.binomial(shape, p=value, n=1,
dtype='float32'),
theano.shared(numpy.float32(value)))
return proj
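# Hedged usage sketch: the returned mask is multiplied onto the activations it
# should drop, reusing one mask per variable (variational-dropout style); `h`,
# `use_noise`, `trng` and `retain_p` are illustrative placeholders.
def _shared_dropout_example(h, use_noise, trng, retain_p=0.8):
    drop_mask = shared_dropout_layer(h.shape, use_noise, trng, retain_p)
    return h * drop_mask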
# feedforward layer: affine transformation + point-wise nonlinearity
def mdclW(num_filters,num_channels,filter_size,winit,name,scales):
# Coefficient Initializer
sinit = lasagne.init.Constant(1.0/(1+len(scales)))
# Total filter size
size = filter_size + (filter_size-1)*(scales[-1]-1)
# Multiscale Dilated Filter
W = T.zeros((num_filters,num_channels,size,size))
# Undilated Base Filter
baseW = theano.shared(lasagne.utils.floatX(winit.sample((num_filters,num_channels,filter_size,filter_size))),name=name+'.W')
    for scale in scales[::-1]: # iterate over the scales backwards so that the main (undilated) filter is written last, on top
W = T.set_subtensor(W[:,:,scales[-1]-scale:size-scales[-1]+scale:scale,scales[-1]-scale:size-scales[-1]+scale:scale],
baseW*theano.shared(lasagne.utils.floatX(sinit.sample(num_filters)), name+'.coeff_'+str(scale)).dimshuffle(0,'x','x','x'))
return W
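# Worked example for mdclW: with filter_size=3 and scales=[1, 2, 3], the support is
# size = 3 + (3 - 1) * (3 - 1) = 7, so W has shape (num_filters, num_channels, 7, 7),
# with the base 3x3 filter written into it at dilation rates 3, 2 and 1 (last on top).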
# Subpixel Upsample Layer from (https://arxiv.org/abs/1609.05158)
# This layer uses a set of r^2 set_subtensor calls to reorganize the tensor in a subpixel-layer upscaling style
# as done in the ESPCN super-resolution paper from Magic Pony.
# r is the upscale factor.
# c is the number of output channels.
def errors(self, y):
"""Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch; zero-one
loss over the size of the minibatch
:type y: theano.tensor.TensorType
:param y: corresponds to a vector that gives for each example the
correct label
"""
# check if y has same dimension of y_pred
if y.ndim != self.y_pred.ndim:
raise TypeError(
'y should have the same shape as self.y_pred',
('y', y.type, 'y_pred', self.y_pred.type)
)
# check if y is of the correct datatype
if y.dtype.startswith('int'):
# the T.neq operator returns a vector of 0s and 1s, where 1
# represents a mistake in prediction
return T.mean(T.neq(self.y_pred, y))
else:
raise NotImplementedError()
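# Hedged usage sketch: compile a zero-one error monitor; `classifier`, `x`, `y`
# and the data arguments are illustrative placeholders for a model exposing an
# errors() method like the one above.
def _errors_example(classifier, x, y, data_x, data_y):
    import theano
    err = theano.function([x, y], classifier.errors(y))
    return err(data_x, data_y)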
def errors(self, y):
"""Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch; zero-one
loss over the size of the minibatch
:type y: theano.tensor.TensorType
:param y: corresponds to a vector that gives for each example the
correct label
"""
# check if y has same dimension of y_pred
if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                            ('y', y.type, 'y_pred', self.y_pred.type))
# check if y is of the correct datatype
if y.dtype.startswith('int'):
# the T.neq operator returns a vector of 0s and 1s, where 1
# represents a mistake in prediction
return T.mean(T.neq(self.y_pred, y))
else:
raise NotImplementedError()
def applyActivationFunction_LeakyReLU( inputData, leakiness ) :
"""leakiness : float
Slope for negative input, usually between 0 and 1.
A leakiness of 0 will lead to the standard rectifier,
a leakiness of 1 will lead to a linear activation function,
and any value in between will give a leaky rectifier.
[1] Maas et al. (2013):
Rectifier Nonlinearities Improve Neural Network Acoustic Models,
http://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf
- The input is a tensor of shape (batchSize, FeatMaps, xDim, yDim, zDim) """
pos = 0.5 * (1 + leakiness)
neg = 0.5 * (1 - leakiness)
output = pos * inputData + neg * abs(inputData)
return (output)
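# Why the expression above is a leaky rectifier: for x >= 0, abs(x) = x, so the
# output is (pos + neg) * x = x; for x < 0, abs(x) = -x, so the output is
# (pos - neg) * x = leakiness * x.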
# *** There actually exist several ways to implement PReLU activations ***
# PReLU activations (from Kamnitsas)
def applyActivationFunction_PReLU( inputData, PreluActivations ) :
"""Parametric Rectified Linear Unit.
It follows:
`f(x) = alpha * x for x < 0`,
`f(x) = x for x >= 0`,
where `alpha` is a learned array with the same shape as x.
- The input is a tensor of shape (batchSize, FeatMaps, xDim, yDim, zDim) """
preluActivationsAsRow = PreluActivations.dimshuffle('x', 0, 'x', 'x', 'x')
pos = T.maximum(0, inputData)
neg = preluActivationsAsRow * (inputData - abs(inputData)) * 0.5
output = pos + neg
return (output)
# --- version 2 ---
def applyActivationFunction_PReLU_v2(inputData,PreluActivations) :
""" inputData is a tensor5D with shape:
(batchSize,
Number of feature Maps,
convolvedImageShape[0],
convolvedImageShape[1],
convolvedImageShape[2]) """
# The input is a tensor of shape (batchSize, FeatMaps, xDim, yDim, zDim)
preluActivationsAsRow = PreluActivations.dimshuffle('x', 0, 'x', 'x', 'x')
pos = ((inputData + abs(inputData)) / 2.0 )
neg = preluActivationsAsRow * ((inputData - abs(inputData)) / 2.0 )
output = pos + neg
return ( output)
# --- version 3 ---
def applyActivationFunction_PReLU_v3(inputData,PreluActivations) :
""" inputData is a tensor5D with shape:
(batchSize,
Number of feature Maps,
convolvedImageShape[0],
convolvedImageShape[1],
convolvedImageShape[2]) """
# The input is a tensor of shape (batchSize, FeatMaps, xDim, yDim, zDim)
preluActivationsAsRow = PreluActivations.dimshuffle('x', 0, 'x', 'x', 'x')
pos = 0.5 * (1 + preluActivationsAsRow )
neg = 0.5 * (1 - preluActivationsAsRow )
output = pos * inputData + neg * abs(inputData)
return ( output)
# Benchmark on ReLU/PReLU activations:
# http://gforge.se/2015/06/benchmarking-relu-and-prelu/
# TODO. Implement some other activation functions:
# Ex: Randomized ReLU
# S-shaped ReLU
# ThresholdedReLU
def errors(self, y):
"""Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch; zero-one
loss over the size of the minibatch
:type y: theano.tensor.TensorType
:param y: corresponds to a vector that gives for each example the
correct label
"""
# check if y has same dimension of y_pred
if y.ndim != self.y_pred.ndim:
raise TypeError(
'y should have the same shape as self.y_pred',
('y', y.type, 'y_pred', self.y_pred.type)
)
# check if y is of the correct datatype
if y.dtype.startswith('int'):
# the T.neq operator returns a vector of 0s and 1s, where 1
# represents a mistake in prediction
return T.mean(T.neq(self.y_pred, y))
else:
            return T.sum((y - self.y_pred) ** 2)
def wrapped_conv(*args, **kwargs):
copy = dict(kwargs)
copy.pop("image_shape", None)
copy.pop("filter_shape", None)
assert copy.pop("filter_flip", False)
input, W, input_shape, get_W_shape = args
if theano.config.device == 'cpu':
return theano.tensor.nnet.conv2d(*args, **kwargs)
try:
return theano.sandbox.cuda.dnn.dnn_conv(
input.astype('float32'),
W.astype('float32'),
**copy
)
except Exception as e:
print("falling back to default conv2d")
return theano.tensor.nnet.conv2d(*args, **kwargs)
def dropout(state_before, is_train, trng):
"""
dropout with p=0.5
Parameters
----------
state_before : theano 3d tensor, input data, dimensions: (num of time steps, batch size, dim of vector)
is_train : theano shared scalar, 0. = test/valid, 1. = train,
trng : random number generator
Returns
-------
proj : theano 3d tensor, output data, dimensions: (num of time steps, batch size, dim of vector)
"""
proj = tensor.switch(is_train,
state_before * trng.binomial(state_before.shape, p=0.5, n=1, dtype=state_before.dtype),
state_before * 0.5)
return proj
def fullyconnected_layer(tparams, state_below, options, prefix, activ='lambda x: x', **kwargs):
"""
compute the forward pass for a fully connected layer
Parameters
----------
tparams : OrderedDict of theano shared variables, {parameter name: value}
state_below : theano 3d tensor, input data, dimensions: (num of time steps, batch size, dim of vector)
options : dictionary, {hyperparameter: value}
prefix : string, layer name
    activ : string, activation function: 'linear', 'tanh', or 'rectifier'
Returns
-------
: theano 3d tensor, output data, dimensions: (num of time steps, batch size, dim of vector)
"""
return eval(activ)(tensor.dot(state_below, tparams[p_name(prefix, 'W')]) + tparams[p_name(prefix, 'b')])
def gate_layer(tparams, X_word, X_char, options, prefix, pretrain_mode, activ='lambda x: x', **kwargs):
"""
compute the forward pass for a gate layer
Parameters
----------
tparams : OrderedDict of theano shared variables, {parameter name: value}
X_word : theano 3d tensor, word input, dimensions: (num of time steps, batch size, dim of vector)
X_char : theano 3d tensor, char input, dimensions: (num of time steps, batch size, dim of vector)
options : dictionary, {hyperparameter: value}
prefix : string, layer name
pretrain_mode : theano shared scalar, 0. = word only, 1. = char only, 2. = word & char
    activ : string, activation function: 'linear', 'tanh', or 'rectifier'
Returns
-------
X : theano 3d tensor, final vector, dimensions: (num of time steps, batch size, dim of vector)
"""
# compute gating values, Eq.(3)
G = tensor.nnet.sigmoid(tensor.dot(X_word, tparams[p_name(prefix, 'v')]) + tparams[p_name(prefix, 'b')][0])
X = ifelse(tensor.le(pretrain_mode, numpy.float32(1.)),
ifelse(tensor.eq(pretrain_mode, numpy.float32(0.)), X_word, X_char),
G[:, :, None] * X_char + (1. - G)[:, :, None] * X_word)
return eval(activ)(X)
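# In equation form, the gate above is G_t = sigmoid(X_word_t . v + b); for
# pretrain_mode == 2 the output is X_t = G_t * X_char_t + (1 - G_t) * X_word_t,
# while modes 0 and 1 simply pass through the word-only or char-only input.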
def theano_logsumexp(x, axis=None):
"""
    Compute log(sum(exp(x), axis=axis)) in a numerically stable
fashion.
Parameters
----------
x : tensor_like
A Theano tensor (any dimension will do).
axis : int or symbolic integer scalar, or None
Axis over which to perform the summation. `None`, the
default, performs over all axes.
Returns
-------
result : ndarray or scalar
The result of the log(sum(exp(...))) operation.
"""
xmax = x.max(axis=axis, keepdims=True)
xmax_ = x.max(axis=axis)
return xmax_ + T.log(T.exp(x - xmax).sum(axis=axis))
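# Hedged sanity-check sketch: compare against the direct (less stable) computation
# on well-scaled random data; all names below are illustrative.
def _logsumexp_check():
    import numpy as np
    import theano
    import theano.tensor as T
    x = T.matrix('x')
    f = theano.function([x], theano_logsumexp(x, axis=1))
    data = np.random.randn(5, 7).astype(theano.config.floatX)
    return np.allclose(f(data), np.log(np.exp(data).sum(axis=1)), atol=1e-5)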
def compute_sample(self, state_below, temp=1, use_noise=False):
"""
Constructs the theano expression that samples from the output layer.
:type state_below: tensor or layer
:param state_below: The theano expression (or groundhog layer)
representing the input of the cost layer
:type temp: float or tensor scalar
:param temp: scalar representing the temperature that should be used
when sampling from the output distribution
:type use_noise: bool
:param use_noise: flag. If true, noise is used when computing the
output of the model
"""
        raise NotImplementedError
def __init__(self, incoming, num_units, max_steps, peepholes=False, mask_input=None, **kwargs):
"""
initialization
        :param incoming: bidirectional mLSTM over the passage
:param num_units:
:param max_steps: max num steps to generate answer words, can be tensor scalar variable
:param peepholes:
:param mask_input: passage's length mask
:param kwargs:
"""
super(AnsPointerLayer, self).__init__(incoming, num_units, peepholes=peepholes,
precompute_input=False, mask_input=mask_input,
only_return_final=False, **kwargs)
self.max_steps = max_steps
# initializes attention weights
input_shape = self.input_shapes[0]
num_inputs = np.prod(input_shape[2:])
self.V_pointer = self.add_param(init.Normal(0.1), (num_inputs, num_units), 'V_pointer')
# doesn't need transpose
self.v_pointer = self.add_param(init.Normal(0.1), (num_units, 1), 'v_pointer')
self.W_a_pointer = self.add_param(init.Normal(0.1), (num_units, num_units), 'W_a_pointer')
self.b_a_pointer = self.add_param(init.Constant(0.), (1, num_units), 'b_a_pointer')
self.c_pointer = self.add_param(init.Constant(0.), (1, 1), 'c_pointer')
def __init__(self, incoming, num_units, max_steps, peepholes=False, mask_input=None, **kwargs):
"""
initialization
        :param incoming: bidirectional mLSTM over the passage
:param num_units:
:param max_steps: max num steps to generate answer words, can be tensor scalar variable
:param peepholes:
:param mask_input: passage's length mask
:param kwargs:
"""
super(AnsPointerLayer, self).__init__(incoming, num_units, peepholes=peepholes,
precompute_input=False, mask_input=mask_input,
only_return_final=False, **kwargs)
self.max_steps = max_steps
# initializes attention weights
input_shape = self.input_shapes[0]
num_inputs = np.prod(input_shape[2:])
self.V_pointer = self.add_param(init.Normal(0.1), (num_inputs, num_units), 'V_pointer')
# doesn't need transpose
self.v_pointer = self.add_param(init.Normal(0.1), (num_units, 1), 'v_pointer')
self.W_a_pointer = self.add_param(init.Normal(0.1), (num_units, num_units), 'W_a_pointer')
self.b_a_pointer = self.add_param(init.Constant(0.), (num_units, ), 'b_a_pointer')
c_pointer = theano.shared(np.array([0.], dtype='float32'), name='c_pointer', broadcastable=(True, ))
self.c_pointer = self.add_param(c_pointer, (1,), 'c_pointer')
def __init__(self, filter_size=(3,3),
input_feature=None, output_feature=None,
feature_map_multiplier=1,
subsample=(1,1), border='half', need_bias=False, dc=0.0):
"""
        This 2d convolution deals with 4d tensors:
        (batch_size, feature maps/channels, rows, cols).
        feature_map_multiplier always has a higher priority
        than input_feature/output_feature.
"""
super(Conv2d, self).__init__()
self.filterSize = filter_size
self.inputFeature = input_feature
self.outputFeature = output_feature
self.mapMulti = feature_map_multiplier
self.border = border
self.subsample = subsample
self.need_bias = need_bias
self.dc = dc
self.w = None
self.b = None