def __init__(
self,
env_spec,
hidden_sizes=(32, 32),
hidden_nonlinearity=NL.rectify,
hidden_W_init=LI.HeUniform(),
hidden_b_init=LI.Constant(0.),
output_nonlinearity=NL.tanh,
output_W_init=LI.Uniform(-3e-3, 3e-3),
output_b_init=LI.Uniform(-3e-3, 3e-3),
bn=False):
Serializable.quick_init(self, locals())
l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim))
l_hidden = l_obs
if bn:
l_hidden = batch_norm(l_hidden)
for idx, size in enumerate(hidden_sizes):
l_hidden = L.DenseLayer(
l_hidden,
num_units=size,
W=hidden_W_init,
b=hidden_b_init,
nonlinearity=hidden_nonlinearity,
name="h%d" % idx
)
if bn:
l_hidden = batch_norm(l_hidden)
l_output = L.DenseLayer(
l_hidden,
num_units=env_spec.action_space.flat_dim,
W=output_W_init,
b=output_b_init,
nonlinearity=output_nonlinearity,
name="output"
)
# Note the deterministic=True argument. It makes sure that when getting
# actions from single observations, we do not update params in the
# batch normalization layers
action_var = L.get_output(l_output, deterministic=True)
self._output_layer = l_output
self._f_actions = ext.compile_function([l_obs.input_var], action_var)
super(DeterministicMLPPolicy, self).__init__(env_spec)
LasagnePowered.__init__(self, [l_output])
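# For reference, a minimal standalone sketch of the same pattern used above:
# compile the network outputs with deterministic=True so batch-norm statistics
# are not updated when computing actions. Assumption: only Theano and Lasagne
# are installed (no rllab); the 4-dim observation / 2-dim action sizes are made up.
import numpy as np
import theano
import lasagne.layers as L
import lasagne.nonlinearities as NL

obs_dim, act_dim = 4, 2
l_obs = L.InputLayer(shape=(None, obs_dim))
l_hid = L.batch_norm(L.DenseLayer(l_obs, num_units=32, nonlinearity=NL.rectify, name="h0"))
l_out = L.DenseLayer(l_hid, num_units=act_dim, nonlinearity=NL.tanh, name="output")
action_var = L.get_output(l_out, deterministic=True)
f_actions = theano.function([l_obs.input_var], action_var)
print(f_actions(np.zeros((1, obs_dim), dtype=theano.config.floatX)))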
def build_recur_dropout(incoming1, incoming2, num_units, num_labels, mask, grad_clipping, num_filters, p):
# Construct Bi-directional LSTM-CNNs-CRF with recurrent dropout.
# first get some necessary dimensions or parameters
conv_window = 3
# shape = [batch, n-step, c_dim, char_length]
# construct convolution layer
# shape = [batch, n-step, c_filters, output_length]
cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters, filter_size=conv_window, pad='full',
nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
# infer the pool size for pooling (pool size should go through all time step of cnn)
_, _, _, pool_size = cnn_layer.output_shape
# construct max pool layer
# shape = [batch, n-step, c_filters, 1]
pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
# reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))
# finally, concatenate the two incoming layers together.
# shape = [batch, n-step, c_filter&w_dim]
incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)
# dropout for incoming
incoming = lasagne.layers.DropoutLayer(incoming, p=p, shared_axes=(1,))
ingate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1))
outgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1))
# according to Jozefowicz et al.(2015), init bias of forget gate to 1.
forgetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
# now use tanh for nonlinear function of cell, need to try pure linear cell
cell_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
nonlinearity=nonlinearities.tanh)
lstm_forward = LSTMLayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
nonlinearity=nonlinearities.tanh, peepholes=False,
ingate=ingate_forward, outgate=outgate_forward,
forgetgate=forgetgate_forward, cell=cell_forward, p=p, name='forward')
ingate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1))
outgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1))
# according to Jozefowicz et al.(2015), init bias of forget gate to 1.
forgetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
# now use tanh for nonlinear function of cell, need to try pure linear cell
cell_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
nonlinearity=nonlinearities.tanh)
lstm_backward = LSTMLayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
nonlinearity=nonlinearities.tanh, peepholes=False, backwards=True,
ingate=ingate_backward, outgate=outgate_backward,
forgetgate=forgetgate_backward, cell=cell_backward, p=p, name='backward')
# concatenate the outputs of forward and backward LSTMs to combine them.
bi_lstm_cnn = lasagne.layers.concat([lstm_forward, lstm_backward], axis=2, name="bi-lstm")
# shape = [batch, n-step, num_units]
bi_lstm_cnn = lasagne.layers.DropoutLayer(bi_lstm_cnn, p=p, shared_axes=(1,))
return ChainCRFLayer(bi_lstm_cnn, num_labels, mask_input=mask)
def build_RNN(architec, layer_input, layer_mask, num_units, grad_clipping):
def build_GRU(reset_input):
resetgate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
updategate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
hidden_update = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
b=lasagne.init.Constant(0.), nonlinearity=nonlinearities.tanh)
return GRULayer(layer_input, num_units, mask_input=layer_mask, grad_clipping=grad_clipping,
resetgate=resetgate, updategate=updategate, hidden_update=hidden_update,
reset_input=reset_input, only_return_final=True, p=0.5, name='GRU')
def build_LSTM():
ingate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1))
outgate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1))
# according to Jozefowicz et al.(2015), init bias of forget gate to 1.
forgetgate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
# now use tanh for nonlinear function of cell, need to try pure linear cell
cell = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
b=lasagne.init.Constant(0.), nonlinearity=nonlinearities.tanh)
return LSTMLayer(layer_input, num_units, mask_input=layer_mask, grad_clipping=grad_clipping,
ingate=ingate, forgetgate=forgetgate, cell=cell, outgate=outgate,
peepholes=False, nonlinearity=nonlinearities.tanh,
only_return_final=True, p=0.5, name='LSTM')
def build_SGRU():
resetgate_hidden = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.GlorotUniform())
resetgate_input = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.GlorotUniform())
updategate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.GlorotUniform())
hidden_update = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
b=lasagne.init.Constant(0.), nonlinearity=nonlinearities.tanh)
return SGRULayer(layer_input, num_units, mask_input=layer_mask, grad_clipping=grad_clipping,
resetgate_input=resetgate_input, resetgate_hidden=resetgate_hidden,
updategate=updategate, hidden_update=hidden_update,
only_return_final=True, p=0.5, name='SGRU')
if architec == 'gru0':
return build_GRU(False)
elif architec == 'gru1':
return build_GRU(True)
elif architec == 'lstm':
return build_LSTM()
elif architec == 'sgru':
return build_SGRU()
else:
raise ValueError('unknown architecture: %s' % architec)
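# Hedged usage sketch for build_RNN above. The input and mask layers below are
# hypothetical placeholders; the function itself relies on this project's custom
# LSTMLayer/GRULayer/SGRULayer implementations (with the extra p and reset_input
# arguments), not the stock Lasagne layers.
import lasagne
layer_input = lasagne.layers.InputLayer(shape=(None, None, 100))  # [batch, n-step, dim]
layer_mask = lasagne.layers.InputLayer(shape=(None, None))        # [batch, n-step]
encoder = build_RNN('lstm', layer_input, layer_mask, num_units=200, grad_clipping=5.0)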
def build_std_dropout_gru(incoming1, incoming2, num_units, num_labels, mask, grad_clipping, num_filters, p,
reset_input):
# Construct Bi-directional LSTM-CNNs-CRF with standard dropout.
# first get some necessary dimensions or parameters
conv_window = 3
# shape = [batch, n-step, c_dim, char_length]
incoming1 = lasagne.layers.DropoutLayer(incoming1, p=p)
# construct convolution layer
# shape = [batch, n-step, c_filters, output_length]
cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters, filter_size=conv_window, pad='full',
nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
# infer the pool size for pooling (pool size should go through all time step of cnn)
_, _, _, pool_size = cnn_layer.output_shape
# construct max pool layer
# shape = [batch, n-step, c_filters, 1]
pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
# reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))
# finally, concatenate the two incoming layers together.
# shape = [batch, n-step, c_filter&w_dim]
incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)
# dropout for incoming
incoming = lasagne.layers.DropoutLayer(incoming, p=0.2)
resetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
updategate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
hidden_update_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=None, nonlinearity=nonlinearities.tanh)
gru_forward = GRULayer(incoming, num_units, mask_input=mask, resetgate=resetgate_forward,
updategate=updategate_forward, hidden_update=hidden_update_forward,
grad_clipping=grad_clipping, reset_input=reset_input, name='forward')
resetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
updategate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
hidden_update_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=None, nonlinearity=nonlinearities.tanh)
gru_backward = GRULayer(incoming, num_units, mask_input=mask, backwards=True, resetgate=resetgate_backward,
updategate=updategate_backward, hidden_update=hidden_update_backward,
grad_clipping=grad_clipping, reset_input=reset_input, name='backward')
# concatenate the outputs of the forward and backward GRUs to combine them.
bi_gru_cnn = lasagne.layers.concat([gru_forward, gru_backward], axis=2, name="bi-gru")
bi_gru_cnn = lasagne.layers.DropoutLayer(bi_gru_cnn, p=p)
# reshape bi-rnn-cnn to [batch * max_length, num_units]
bi_gru_cnn = lasagne.layers.reshape(bi_gru_cnn, (-1, [2]))
# construct output layer (dense layer with softmax)
layer_output = lasagne.layers.DenseLayer(bi_gru_cnn, num_units=num_labels, nonlinearity=nonlinearities.softmax,
name='softmax')
return layer_output
def build_std_dropout_sgru(incoming1, incoming2, num_units, num_labels, mask, grad_clipping, num_filters, p):
# Construct Bi-directional LSTM-CNNs-CRF with standard dropout.
# first get some necessary dimensions or parameters
conv_window = 3
# shape = [batch, n-step, c_dim, char_length]
incoming1 = lasagne.layers.DropoutLayer(incoming1, p=p)
# construct convolution layer
# shape = [batch, n-step, c_filters, output_length]
cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters, filter_size=conv_window, pad='full',
nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
# infer the pool size for pooling (pool size should go through all time step of cnn)
_, _, _, pool_size = cnn_layer.output_shape
# construct max pool layer
# shape = [batch, n-step, c_filters, 1]
pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
# reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))
# finally, concatenate the two incoming layers together.
# shape = [batch, n-step, c_filter&w_dim]
incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)
# dropout for incoming
incoming = lasagne.layers.DropoutLayer(incoming, p=0.2)
resetgate_input_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
resetgate_hidden_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
updategate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
hidden_update_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=None, nonlinearity=nonlinearities.tanh)
sgru_forward = SGRULayer(incoming, num_units, mask_input=mask,
resetgate_input=resetgate_input_forward, resetgate_hidden=resetgate_hidden_forward,
updategate=updategate_forward, hidden_update=hidden_update_forward,
grad_clipping=grad_clipping, name='forward')
resetgate_input_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
resetgate_hidden_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
updategate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
hidden_update_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=None, nonlinearity=nonlinearities.tanh)
sgru_backward = SGRULayer(incoming, num_units, mask_input=mask, backwards=True,
resetgate_input=resetgate_input_backward, resetgate_hidden=resetgate_hidden_backward,
updategate=updategate_backward, hidden_update=hidden_update_backward,
grad_clipping=grad_clipping, name='backward')
# concatenate the outputs of the forward and backward SGRUs to combine them.
bi_sgru_cnn = lasagne.layers.concat([sgru_forward, sgru_backward], axis=2, name="bi-sgru")
bi_sgru_cnn = lasagne.layers.DropoutLayer(bi_sgru_cnn, p=p)
# reshape bi-rnn-cnn to [batch * max_length, num_units]
bi_sgru_cnn = lasagne.layers.reshape(bi_sgru_cnn, (-1, [2]))
# construct output layer (dense layer with softmax)
layer_output = lasagne.layers.DenseLayer(bi_sgru_cnn, num_units=num_labels, nonlinearity=nonlinearities.softmax,
name='softmax')
return layer_output
def build_recur_dropout_gru(incoming1, incoming2, num_units, num_labels, mask, grad_clipping, num_filters, p,
reset_input):
# Construct Bi-directional LSTM-CNNs-CRF with recurrent dropout.
# first get some necessary dimensions or parameters
conv_window = 3
# shape = [batch, n-step, c_dim, char_length]
# construct convolution layer
# shape = [batch, n-step, c_filters, output_length]
cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters, filter_size=conv_window, pad='full',
nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
# infer the pool size for pooling (pool size should go through all time step of cnn)
_, _, _, pool_size = cnn_layer.output_shape
# construct max pool layer
# shape = [batch, n-step, c_filters, 1]
pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
# reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))
# finally, concatenate the two incoming layers together.
# shape = [batch, n-step, c_filter&w_dim]
incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)
# dropout for incoming
incoming = lasagne.layers.DropoutLayer(incoming, p=0.2, shared_axes=(1,))
resetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
updategate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
hidden_update_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=None, nonlinearity=nonlinearities.tanh)
gru_forward = GRULayer(incoming, num_units, mask_input=mask, resetgate=resetgate_forward,
updategate=updategate_forward, hidden_update=hidden_update_forward,
grad_clipping=grad_clipping, reset_input=reset_input, p=p, name='forward')
resetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
updategate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
hidden_update_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=None, nonlinearity=nonlinearities.tanh)
gru_backward = GRULayer(incoming, num_units, mask_input=mask, backwards=True, resetgate=resetgate_backward,
updategate=updategate_backward, hidden_update=hidden_update_backward,
grad_clipping=grad_clipping, reset_input=reset_input, p=p, name='backward')
# concatenate the outputs of the forward and backward GRUs to combine them.
bi_gru_cnn = lasagne.layers.concat([gru_forward, gru_backward], axis=2, name="bi-gru")
# shape = [batch, n-step, num_units]
bi_gru_cnn = lasagne.layers.DropoutLayer(bi_gru_cnn, p=p, shared_axes=(1,))
# reshape bi-rnn-cnn to [batch * max_length, num_units]
bi_gru_cnn = lasagne.layers.reshape(bi_gru_cnn, (-1, [2]))
# construct output layer (dense layer with softmax)
layer_output = lasagne.layers.DenseLayer(bi_gru_cnn, num_units=num_labels, nonlinearity=nonlinearities.softmax,
name='softmax')
return layer_output
def build_recur_dropout_sgru(incoming1, incoming2, num_units, num_labels, mask, grad_clipping, num_filters, p):
# Construct Bi-directional LSTM-CNNs-CRF with recurrent dropout.
# first get some necessary dimensions or parameters
conv_window = 3
# shape = [batch, n-step, c_dim, char_length]
# construct convolution layer
# shape = [batch, n-step, c_filters, output_length]
cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters, filter_size=conv_window, pad='full',
nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
# infer the pool size for pooling (pool size should go through all time step of cnn)
_, _, _, pool_size = cnn_layer.output_shape
# construct max pool layer
# shape = [batch, n-step, c_filters, 1]
pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
# reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))
# finally, concatenate the two incoming layers together.
# shape = [batch, n-step, c_filter&w_dim]
incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)
# dropout for incoming
incoming = lasagne.layers.DropoutLayer(incoming, p=0.2, shared_axes=(1,))
resetgate_input_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
resetgate_hidden_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
updategate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
hidden_update_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=None, nonlinearity=nonlinearities.tanh)
sgru_forward = SGRULayer(incoming, num_units, mask_input=mask,
resetgate_input=resetgate_input_forward, resetgate_hidden=resetgate_hidden_forward,
updategate=updategate_forward, hidden_update=hidden_update_forward,
grad_clipping=grad_clipping, p=p, name='forward')
resetgate_input_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
resetgate_hidden_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
updategate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
hidden_update_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=None, nonlinearity=nonlinearities.tanh)
sgru_backward = SGRULayer(incoming, num_units, mask_input=mask, backwards=True,
resetgate_input=resetgate_input_backward, resetgate_hidden=resetgate_hidden_backward,
updategate=updategate_backward, hidden_update=hidden_update_backward,
grad_clipping=grad_clipping, p=p, name='backward')
# concatenate the outputs of the forward and backward SGRUs to combine them.
bi_sgru_cnn = lasagne.layers.concat([sgru_forward, sgru_backward], axis=2, name="bi-sgru")
# shape = [batch, n-step, num_units]
bi_sgru_cnn = lasagne.layers.DropoutLayer(bi_sgru_cnn, p=p, shared_axes=(1,))
# reshape bi-rnn-cnn to [batch * max_length, num_units]
bi_sgru_cnn = lasagne.layers.reshape(bi_sgru_cnn, (-1, [2]))
# construct output layer (dense layer with softmax)
layer_output = lasagne.layers.DenseLayer(bi_sgru_cnn, num_units=num_labels, nonlinearity=nonlinearities.softmax,
name='softmax')
return layer_output
def __init__(self, incoming, num_units, ingate=Gate(), forgetgate=Gate(),
cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh), outgate=Gate(),
nonlinearity=nonlinearities.tanh, cell_init=init.Constant(0.), hid_init=init.Constant(0.),
backwards=False, learn_init=False, peepholes=True, gradient_steps=-1, grad_clipping=0,
precompute_input=True, mask_input=None,
encoder_mask_input=None, attention=False, word_by_word=False, **kwargs):
super(CustomLSTMDecoder, self).__init__(incoming, num_units, ingate, forgetgate, cell, outgate, nonlinearity,
cell_init, hid_init, backwards, learn_init, peepholes, gradient_steps,
grad_clipping, False, precompute_input, mask_input, True,
**kwargs)
self.attention = attention
self.word_by_word = word_by_word
# encoder mask
self.encoder_mask_incoming_index = -1
if encoder_mask_input is not None:
self.input_layers.append(encoder_mask_input)
self.input_shapes.append(encoder_mask_input.output_shape)
self.encoder_mask_incoming_index = len(self.input_layers) - 1
# check encoder
if not isinstance(self.cell_init, CustomLSTMEncoder) \
or self.num_units != self.cell_init.num_units:
raise ValueError('cell_init must be a CustomLSTMEncoder'
' and its num_units must match')
self.r_init = None
self.r_init = self.add_param(init.Constant(0.),
(1, num_units), name="r_init",
trainable=False, regularizable=False)
if self.word_by_word:
# word_by_word mode implies the attention mechanism
self.attention = True
if self.attention:
if not isinstance(encoder_mask_input, lasagne.layers.Layer):
raise ValueError('Attention mechanism needs an encoder mask layer')
# initializes attention weights
self.W_y_attend = self.add_param(init.Normal(0.1), (num_units, num_units), 'V_pointer')
self.W_h_attend = self.add_param(init.Normal(0.1), (num_units, num_units), 'W_h_attend')
# doesn't need transpose
self.w_attend = self.add_param(init.Normal(0.1), (num_units, 1), 'v_pointer')
self.W_p_attend = self.add_param(init.Normal(0.1), (num_units, num_units), 'W_p_attend')
self.W_x_attend = self.add_param(init.Normal(0.1), (num_units, num_units), 'W_x_attend')
if self.word_by_word:
self.W_r_attend = self.add_param(init.Normal(0.1), (num_units, num_units), 'W_r_attend')
self.W_t_attend = self.add_param(init.Normal(0.1), (num_units, num_units), 'W_t_attend')
def network_generator(self, input_var, network_weights=None):
# Input layer
layers = []
n_blocks = int(np.log2(self.input_size / 8)) + 1 # end up with 8x8 output
layers.append(InputLayer(shape=(None, self.hidden_size), input_var=input_var, name='generator/input'))
# Dense layer up (from h to n*8*8)
layers.append(dense_layer(layers[-1], n_units=(8 * 8 * self.n_filters), name='generator/dense%d' % len(layers), network_weights=network_weights))
layers.append(ReshapeLayer(layers[-1], (-1, self.n_filters, 8, 8), name='generator/reshape%d' % len(layers)))
# Convolutional blocks (decoder)
for i_block in range(1, n_blocks+1):
layers.append(conv_layer(layers[-1], n_filters=self.n_filters, stride=1, name='generator/conv%d' % len(layers), network_weights=network_weights))
layers.append(conv_layer(layers[-1], n_filters=self.n_filters, stride=1, name='generator/conv%d' % len(layers), network_weights=network_weights))
if i_block != n_blocks:
layers.append(Upscale2DLayer(layers[-1], scale_factor=2, name='generator/upsample%d' % len(layers)))
# Final layer (with sigmoid, target images should be in [0, 1]; use tanh if they are scaled to [-1, 1])
layers.append(conv_layer(layers[-1], n_filters=3, stride=1, name='generator/output', network_weights=network_weights, nonlinearity=sigmoid))
# Network in dictionary form
network = {layer.name: layer for layer in layers}
return network
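# Hypothetical smoke test for network_generator above. `m` stands in for an instance
# of the surrounding class (with input_size, hidden_size and n_filters set), and the
# conv_layer/dense_layer helpers it calls come from this project, so this only sketches
# how the returned layer dictionary might be consumed.
import theano
import theano.tensor as T
import lasagne
z = T.matrix('z')
gen = m.network_generator(z)
images = lasagne.layers.get_output(gen['generator/output'], deterministic=True)
generate = theano.function([z], images)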
# def network_generator_alt(self, input_var, network_weights=None):
#
# # Input layer
# layers = []
# n_blocks = int(np.log2(self.input_size / 8)) + 1 # end up with 8x8 output
# layers.append(InputLayer(shape=(None, self.hidden_size), input_var=input_var, name='generator/input'))
#
# # Dense layer up (from h to n*8*8)
# layers.append(dense_layer(layers[-1], n_units=(8 * 8 * self.n_filters*n_blocks), name='generator/dense%d' % len(layers), network_weights=network_weights, nonlinearity=elu, bn=True))
# layers.append(ReshapeLayer(layers[-1], (-1, self.n_filters*n_blocks, 8, 8), name='generator/reshape%d' % len(layers)))
#
# # Convolutional blocks (decoder)
# for i_block in range(1, n_blocks+1)[::-1]:
# # layers.append(conv_layer(layers[-1], n_filters=self.n_filters*(i_block), stride=1, name='generator/conv%d' % len(layers), network_weights=network_weights, bn=True))
# # layers.append(conv_layer(layers[-1], n_filters=self.n_filters*(i_block), stride=1, name='generator/conv%d' % len(layers), network_weights=network_weights, bn=True))
# if i_block != 1:
# layers.append(transposed_conv_layer(layers[-1], n_filters=self.n_filters*(i_block-1), stride=2, name='generator/upsample%d' % len(layers),
# output_size=8*2**(n_blocks-i_block+1), network_weights=network_weights, nonlinearity=elu, bn=True))
#
# # Final layer (make sure input images are in the range [-1, 1]
# layers.append(conv_layer(layers[-1], n_filters=3, stride=1, name='generator/output', network_weights=network_weights, nonlinearity=tanh, bn=False))
#
# # Network in dictionary form
# network = {layer.name: layer for layer in layers}
#
# return network
def build_BiLSTM(incoming, num_units, mask=None, grad_clipping=0, precompute_input=True, peepholes=False, dropout=True,
in_to_out=False):
# Construct the forward and backward RNNs. For now, all W matrices are initialized with the Glorot
# initializer using default arguments; other initializers may be worth trying for specific tasks.
# dropout for incoming
if dropout:
incoming = lasagne.layers.DropoutLayer(incoming, p=0.5)
ingate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1))
outgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1))
# according to Jozefowicz et al.(2015), init bias of forget gate to 1.
forgetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
# now use tanh for nonlinear function of cell, need to try pure linear cell
cell_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
nonlinearity=nonlinearities.tanh)
lstm_forward = lasagne.layers.LSTMLayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
nonlinearity=nonlinearities.tanh, peepholes=peepholes,
precompute_input=precompute_input,
ingate=ingate_forward, outgate=outgate_forward,
forgetgate=forgetgate_forward, cell=cell_forward, name='forward')
ingate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1))
outgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1))
# according to Jozefowicz et al.(2015), init bias of forget gate to 1.
forgetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
# now use tanh for nonlinear function of cell, need to try pure linear cell
cell_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
nonlinearity=nonlinearities.tanh)
lstm_backward = lasagne.layers.LSTMLayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
nonlinearity=nonlinearities.tanh, peepholes=peepholes,
precompute_input=precompute_input, backwards=True,
ingate=ingate_backward, outgate=outgate_backward,
forgetgate=forgetgate_backward, cell=cell_backward, name='backward')
# concatenate the outputs of forward and backward RNNs to combine them.
concat = lasagne.layers.concat([lstm_forward, lstm_backward], axis=2, name="bi-lstm")
# dropout for output
if dropout:
concat = lasagne.layers.DropoutLayer(concat, p=0.5)
if in_to_out:
concat = lasagne.layers.concat([concat, incoming], axis=2)
# the shape of BiRNN output (concat) is (batch_size, input_length, 2 * num_hidden_units)
return concat
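# Hedged usage sketch for build_BiLSTM above (it uses only standard Lasagne layers).
# The embedding and mask input layers are hypothetical placeholders.
import lasagne
word_input = lasagne.layers.InputLayer(shape=(None, None, 100))  # [batch, length, dim]
word_mask = lasagne.layers.InputLayer(shape=(None, None))        # [batch, length]
bi_lstm = build_BiLSTM(word_input, num_units=200, mask=word_mask, grad_clipping=5.0, dropout=True)
# output shape of bi_lstm: (batch_size, length, 2 * 200)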
def build_BiGRU(incoming, num_units, mask=None, grad_clipping=0, precompute_input=True, dropout=True, in_to_out=False):
# Construct the forward and backward GRUs. For now, all W matrices are initialized with the Glorot
# initializer using default arguments; other initializers may be worth trying for specific tasks.
# dropout for incoming
if dropout:
incoming = lasagne.layers.DropoutLayer(incoming, p=0.5)
# following Jozefowicz et al. (2015) (forget-gate bias init of 1 in LSTMs), init this gate's bias to 1.
resetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
updategate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1))
# now use tanh for nonlinear function of hidden gate
hidden_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
nonlinearity=nonlinearities.tanh)
gru_forward = lasagne.layers.GRULayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
precompute_input=precompute_input,
resetgate=resetgate_forward, updategate=updategate_forward,
hidden_update=hidden_forward, name='forward')
# following Jozefowicz et al. (2015) (forget-gate bias init of 1 in LSTMs), init this gate's bias to 1.
resetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
updategate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
W_cell=lasagne.init.Uniform(range=0.1))
# now use tanh for nonlinear function of hidden gate
hidden_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
nonlinearity=nonlinearities.tanh)
gru_backward = lasagne.layers.GRULayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
precompute_input=precompute_input, backwards=True,
resetgate=resetgate_backward, updategate=updategate_backward,
hidden_update=hidden_backward, name='backward')
# concatenate the outputs of forward and backward GRUs to combine them.
concat = lasagne.layers.concat([gru_forward, gru_backward], axis=2, name="bi-gru")
# dropout for output
if dropout:
concat = lasagne.layers.DropoutLayer(concat, p=0.5)
if in_to_out:
concat = lasagne.layers.concat([concat, incoming], axis=2)
# the shape of BiRNN output (concat) is (batch_size, input_length, 2 * num_hidden_units)
return concat
def __init__(
self, incomings, num_units,
W_g=init.Normal(0.1),
W_h=init.Normal(0.1),
W_v=init.Normal(0.1),
W_s=init.Normal(0.1),
W_p=init.Normal(0.1),
nonlinearity=nonlinearities.tanh,
nonlinearity_atten=nonlinearities.softmax,
**kwargs
):
super(AttenLayer, self).__init__(incomings, **kwargs)
self.batch_size = self.input_shapes[0][0] # None
num_inputs = self.input_shapes[2][1] # k
feature_dim = self.input_shapes[0][1] # d
self.num_units = num_units
self.nonlinearity = nonlinearity
self.nonlinearity_atten = nonlinearity_atten
self.W_h_to_attenGate = self.add_param(
W_h, (num_inputs, 1),
name='W_h_to_atten'
)
self.W_g_to_attenGate = self.add_param(
W_g,
(feature_dim, num_inputs),
name='W_g_to_atten'
)
self.W_v_to_attenGate = self.add_param(
W_v,
(feature_dim, num_inputs),
name='W_v_to_atten'
)
self.W_s_to_attenGate = self.add_param(
W_s,
(feature_dim, num_inputs),
name='W_s_to_atten'
)
self.W_p = self.add_param(
W_p,
(feature_dim, num_units),
name='W_p_to_atten'
)
self.num_inputs = num_inputs
def __init__(self, conf):
self.conf = conf
if self.conf.act == "linear":
self.conf.act = linear
elif self.conf.act == "sigmoid":
self.conf.act = sigmoid
elif self.conf.act == "relu":
self.conf.act = rectify
elif self.conf.act == "tanh":
self.conf.act = tanh
else:
raise ValueError("Unknown activation function", self.conf.act)
input_var_first = T.matrix('inputs1')
input_var_second = T.matrix('inputs2')
target_var = T.matrix('targets')
# create network
self.autoencoder, encoder_first, encoder_second = self.__create_toplogy__(input_var_first, input_var_second)
self.out = get_output(self.autoencoder)
loss = squared_error(self.out, target_var)
loss = loss.mean()
params = get_all_params(self.autoencoder, trainable=True)
updates = nesterov_momentum(loss, params, learning_rate=self.conf.lr, momentum=self.conf.momentum)
# training function
self.train_fn = theano.function([input_var_first, input_var_second, target_var], loss, updates=updates)
# function to reconstruct
test_reconstruction = get_output(self.autoencoder, deterministic=True)
self.reconstruction_fn = theano.function([input_var_first, input_var_second], test_reconstruction)
# encoding function
test_encode = get_output([encoder_first, encoder_second], deterministic=True)
self.encoding_fn = theano.function([input_var_first, input_var_second], test_encode)
# utils
blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
self.blas_nrm2 = blas('nrm2', np.array([], dtype=float))
self.blas_scal = blas('scal', np.array([], dtype=float))
# load weights if necessary
if self.conf.load_model is not None:
self.load_model()
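# Hedged smoke-test sketch for the autoencoder wrapper above. `model` stands in for
# an instance of this class, and the 50-dim paired views are made-up shapes; the real
# dimensions come from self.conf and __create_toplogy__.
import numpy as np
x1 = np.random.rand(8, 50).astype('float32')
x2 = np.random.rand(8, 50).astype('float32')
y = np.hstack([x1, x2])
loss = model.train_fn(x1, x2, y)           # one gradient step
recon = model.reconstruction_fn(x1, x2)    # deterministic reconstruction
code1, code2 = model.encoding_fn(x1, x2)   # the two encodings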
def run_task(*_):
trpo_stepsize = 0.01
trpo_subsample_factor = 0.2
env = PointGatherEnv(apple_reward=10,bomb_cost=1,n_apples=2, activity_range=6)
policy = GaussianMLPPolicy(env.spec,
hidden_sizes=(64,32)
)
baseline = GaussianMLPBaseline(
env_spec=env.spec,
regressor_args={
'hidden_sizes': (64,32),
'hidden_nonlinearity': NL.tanh,
'learn_std':False,
'step_size':trpo_stepsize,
'optimizer':ConjugateGradientOptimizer(subsample_factor=trpo_subsample_factor)
}
)
safety_baseline = GaussianMLPBaseline(
env_spec=env.spec,
regressor_args={
'hidden_sizes': (64,32),
'hidden_nonlinearity': NL.tanh,
'learn_std':False,
'step_size':trpo_stepsize,
'optimizer':ConjugateGradientOptimizer(subsample_factor=trpo_subsample_factor)
},
target_key='safety_returns',
)
safety_constraint = GatherSafetyConstraint(max_value=0.1, baseline=safety_baseline)
algo = CPO(
env=env,
policy=policy,
baseline=baseline,
safety_constraint=safety_constraint,
safety_gae_lambda=1,
batch_size=50000,
max_path_length=15,
n_itr=100,
gae_lambda=0.95,
discount=0.995,
step_size=trpo_stepsize,
optimizer_args={'subsample_factor':trpo_subsample_factor},
#plot=True,
)
algo.train()
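# Hedged launch sketch for run_task above, assuming rllab's standard
# run_experiment_lite helper is available in this project.
from rllab.misc.instrument import run_experiment_lite
run_experiment_lite(run_task, n_parallel=1, snapshot_mode="last", seed=1)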
def build(input_height, input_width, concat_var):
"""
Build the discriminator, all weights initialized from scratch
:param input_width:
:param input_height:
:param concat_var: Theano symbolic tensor variable
:return: Dictionary that contains the discriminator
"""
net = {'input': InputLayer((None, 4, input_height, input_width), input_var=concat_var)}
print "Input: {}".format(net['input'].output_shape[1:])
net['merge'] = ConvLayer(net['input'], 3, 1, pad=0, flip_filters=False)
print "merge: {}".format(net['merge'].output_shape[1:])
net['conv1'] = ConvLayer(net['merge'], 32, 3, pad=1)
print "conv1: {}".format(net['conv1'].output_shape[1:])
net['pool1'] = PoolLayer(net['conv1'], 4)
print "pool1: {}".format(net['pool1'].output_shape[1:])
net['conv2_1'] = ConvLayer(net['pool1'], 64, 3, pad=1)
print "conv2_1: {}".format(net['conv2_1'].output_shape[1:])
net['conv2_2'] = ConvLayer(net['conv2_1'], 64, 3, pad=1)
print "conv2_2: {}".format(net['conv2_2'].output_shape[1:])
net['pool2'] = PoolLayer(net['conv2_2'], 2)
print "pool2: {}".format(net['pool2'].output_shape[1:])
net['conv3_1'] = nn.weight_norm(ConvLayer(net['pool2'], 64, 3, pad=1))
print "conv3_1: {}".format(net['conv3_1'].output_shape[1:])
net['conv3_2'] = nn.weight_norm(ConvLayer(net['conv3_1'], 64, 3, pad=1))
print "conv3_2: {}".format(net['conv3_2'].output_shape[1:])
net['pool3'] = PoolLayer(net['conv3_2'], 2)
print "pool3: {}".format(net['pool3'].output_shape[1:])
net['fc4'] = DenseLayer(net['pool3'], num_units=100, nonlinearity=tanh)
print "fc4: {}".format(net['fc4'].output_shape[1:])
net['fc5'] = DenseLayer(net['fc4'], num_units=2, nonlinearity=tanh)
print "fc5: {}".format(net['fc5'].output_shape[1:])
net['prob'] = DenseLayer(net['fc5'], num_units=1, nonlinearity=sigmoid)
print "prob: {}".format(net['prob'].output_shape[1:])
return net
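# Hedged usage sketch for the discriminator builder above. ConvLayer/PoolLayer/DenseLayer
# and nn.weight_norm come from this module's imports, so this only shows how the returned
# dictionary might be compiled into a scoring function; the 64x64 input size is made up.
import theano
import theano.tensor as T
import lasagne
concat_var = T.tensor4('concat')              # 4-channel input, e.g. image + condition map
net = build(64, 64, concat_var)
prob = lasagne.layers.get_output(net['prob'], deterministic=True)
f_discriminate = theano.function([concat_var], prob)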
def create_model(dbn, input_shape, input_var, mask_shape, mask_var,
lstm_size=250, win=T.iscalar('theta'),
output_classes=26, w_init_fn=GlorotUniform(), use_peepholes=False, use_blstm=True):
weights, biases, shapes, nonlinearities = dbn
gate_parameters = Gate(
W_in=w_init_fn, W_hid=w_init_fn,
b=las.init.Constant(0.))
cell_parameters = Gate(
W_in=w_init_fn, W_hid=w_init_fn,
# Setting W_cell to None denotes that no cell connection will be used.
W_cell=None, b=las.init.Constant(0.),
# By convention, the cell nonlinearity is tanh in an LSTM.
nonlinearity=tanh)
l_in = InputLayer(input_shape, input_var, 'input')
l_mask = InputLayer(mask_shape, mask_var, 'mask')
symbolic_batchsize = l_in.input_var.shape[0]
symbolic_seqlen = l_in.input_var.shape[1]
l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
l_encoder = create_pretrained_encoder(l_reshape1, weights, biases,
shapes,
nonlinearities,
['fc1', 'fc2', 'fc3', 'bottleneck'])
encoder_len = las.layers.get_output_shape(l_encoder)[-1]
l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2')
l_delta = DeltaLayer(l_reshape2, win, name='delta')
if use_blstm:
l_lstm, l_lstm_back = create_blstm(l_delta, l_mask, lstm_size, cell_parameters, gate_parameters, 'blstm1',
use_peepholes)
# We'll combine the forward and backward layer output by summing.
# Merge layers take in lists of layers to merge as input.
l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1')
# reshape, flatten to 2 dimensions to run softmax on all timesteps
l_reshape3 = ReshapeLayer(l_sum1, (-1, lstm_size), name='reshape3')
else:
l_lstm = create_lstm(l_delta, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm', use_peepholes)
l_reshape3 = ReshapeLayer(l_lstm, (-1, lstm_size), name='reshape3')
# Now, we can apply feed-forward layers as usual.
# We want the network to predict a label at every timestep,
# so we'll use a dense softmax layer with one unit per class on the flattened outputs.
l_softmax = DenseLayer(
l_reshape3, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='softmax')
l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes), name='output')
return l_out
def create_model(dbn, input_shape, input_var, mask_shape, mask_var,
lstm_size=250, output_classes=26):
dbn_layers = dbn.get_all_layers()
weights = []
biases = []
weights.append(dbn_layers[1].W.astype('float32'))
weights.append(dbn_layers[2].W.astype('float32'))
weights.append(dbn_layers[3].W.astype('float32'))
weights.append(dbn_layers[4].W.astype('float32'))
biases.append(dbn_layers[1].b.astype('float32'))
biases.append(dbn_layers[2].b.astype('float32'))
biases.append(dbn_layers[3].b.astype('float32'))
biases.append(dbn_layers[4].b.astype('float32'))
gate_parameters = Gate(
W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
b=las.init.Constant(0.))
cell_parameters = Gate(
W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
# Setting W_cell to None denotes that no cell connection will be used.
W_cell=None, b=las.init.Constant(0.),
# By convention, the cell nonlinearity is tanh in an LSTM.
nonlinearity=tanh)
l_in = InputLayer(input_shape, input_var, 'input')
l_mask = InputLayer(mask_shape, mask_var, 'mask')
symbolic_batchsize = l_in.input_var.shape[0]
symbolic_seqlen = l_in.input_var.shape[1]
l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
l_encoder = create_pretrained_encoder(weights, biases, l_reshape1)
encoder_len = las.layers.get_output_shape(l_encoder)[-1]
l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2')
# l_delta = DeltaLayer(l_reshape2, win, name='delta')
# l_lstm = create_lstm(l_reshape2, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm1')
l_lstm, l_lstm_back = create_blstm(l_reshape2, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm1')
# We'll combine the forward and backward layer output by summing.
# Merge layers take in lists of layers to merge as input.
l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1')
l_forward_slice1 = SliceLayer(l_sum1, -1, 1, name='slice1')
# Now, we can apply feed-forward layers as usual.
# We want the network to predict a single classification for the sequence,
# so we'll use a dense softmax layer with one unit per class on the last timestep.
l_out = DenseLayer(
l_forward_slice1, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='output')
return l_out
def create_pretrained_substream(weights, biases, input_shape, input_var, mask_shape, mask_var, name,
lstm_size=250, win=T.iscalar('theta'), nonlinearity=rectify,
w_init_fn=las.init.Orthogonal(), use_peepholes=True):
gate_parameters = Gate(
W_in=w_init_fn, W_hid=w_init_fn,
b=las.init.Constant(0.))
cell_parameters = Gate(
W_in=w_init_fn, W_hid=w_init_fn,
# Setting W_cell to None denotes that no cell connection will be used.
W_cell=None, b=las.init.Constant(0.),
# By convention, the cell nonlinearity is tanh in an LSTM.
nonlinearity=tanh)
l_input = InputLayer(input_shape, input_var, 'input_'+name)
l_mask = InputLayer(mask_shape, mask_var, 'mask')
symbolic_batchsize_raw = l_input.input_var.shape[0]
symbolic_seqlen_raw = l_input.input_var.shape[1]
l_reshape1_raw = ReshapeLayer(l_input, (-1, input_shape[-1]), name='reshape1_'+name)
l_encoder_raw = create_pretrained_encoder(l_reshape1_raw, weights, biases,
[2000, 1000, 500, 50],
[nonlinearity, nonlinearity, nonlinearity, linear],
['fc1_'+name, 'fc2_'+name, 'fc3_'+name, 'bottleneck_'+name])
input_len = las.layers.get_output_shape(l_encoder_raw)[-1]
l_reshape2 = ReshapeLayer(l_encoder_raw,
(symbolic_batchsize_raw, symbolic_seqlen_raw, input_len),
name='reshape2_'+name)
l_delta = DeltaLayer(l_reshape2, win, name='delta_'+name)
l_lstm = LSTMLayer(
l_delta, int(lstm_size), peepholes=use_peepholes,
# We need to specify a separate input for masks
mask_input=l_mask,
# Here, we supply the gate parameters for each gate
ingate=gate_parameters, forgetgate=gate_parameters,
cell=cell_parameters, outgate=gate_parameters,
# We'll learn the initialization and use gradient clipping
learn_init=True, grad_clipping=5., name='lstm_'+name)
return l_lstm