import numpy
import theano
import theano.tensor as T


def shared_dataset(data_xy, borrow=True):
    """ Function that loads the dataset into shared variables

    The reason we store our dataset in shared variables is to allow
    Theano to copy it into GPU memory (when the code is run on a GPU).
    Since copying data onto the GPU is slow, copying a minibatch every
    time one is needed (the default behaviour if the data is not in a
    shared variable) would lead to a large decrease in performance.
    """
    data_x, data_y = data_xy
    shared_x = theano.shared(numpy.asarray(data_x,
                                           dtype=theano.config.floatX),
                             borrow=borrow)
    shared_y = theano.shared(numpy.asarray(data_y,
                                           dtype=theano.config.floatX),
                             borrow=borrow)
    # When storing data on the GPU it has to be stored as floats,
    # so we store the labels as ``floatX`` as well (``shared_y`` does
    # exactly that). But during our computations we need them as ints
    # (we use the labels as indices, and floats wouldn't make sense
    # there), so instead of returning ``shared_y`` we cast it to int.
    # This little hack lets us get around the issue.
    return shared_x, T.cast(shared_y, 'int32')
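For context, a hedged sketch of how the returned pair is usually consumed: the ``T.cast`` only adds a node to the graph, so the labels stay on the GPU as floats and minibatches are still selected by symbolic slicing through ``givens``. The ``train_set``, ``batch_size``, ``cost``, and ``updates`` names below are placeholders assumed for illustration, not defined above.

# Hypothetical usage, assuming `train_set` is an (inputs, labels) pair and
# `cost`/`updates` come from some model and optimizer built elsewhere.
train_set_x, train_set_y = shared_dataset(train_set)

x = T.matrix('x')         # minibatch of inputs
y = T.ivector('y')        # minibatch of labels (matches the int32 cast above)
index = T.lscalar('index')

train_model = theano.function(
    inputs=[index],
    outputs=cost,
    updates=updates,
    givens={
        x: train_set_x[index * batch_size: (index + 1) * batch_size],
        y: train_set_y[index * batch_size: (index + 1) * batch_size],
    },
)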
Examples of cast() from open-source Python code
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    # Adam: `v` is the biased first-moment estimate, `mg` the biased
    # second-moment estimate, and the shared counter `t` drives the
    # bias correction.
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for p, g in zip(params, grads):
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v_t = mom1*v + (1. - mom1)*g
        mg_t = mom2*mg + (1. - mom2)*T.square(g)
        v_hat = v_t / (1. - mom1 ** t)
        mg_hat = mg_t / (1. - mom2 ** t)
        g_t = v_hat / T.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append((v, v_t))
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    updates.append((t, t+1))
    return updates
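A hedged usage sketch: the returned list plugs directly into ``theano.function``. The ``x``, ``y``, ``cost``, and ``params`` symbols are placeholders assumed for illustration, not defined above.

# Hypothetical wiring: `cost` is a scalar expression built from symbolic
# inputs `x`, `y` and a list of shared parameters `params`.
train_fn = theano.function([x, y], cost, updates=adam_updates(params, cost, lr=3e-4))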
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        # Inference: normalize with the stored running averages.
        norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
    else:
        batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
        centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
        batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
        # BN updates: exponential moving averages of the batch statistics;
        # the n/(n-1) cast applies Bessel's correction to the variance.
        # They are only recorded here and must be applied by the caller.
        new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
        new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
        self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
    if hasattr(self, 'g'):
        activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
    else:
        activation = norm_features
    if hasattr(self, 'b'):
        activation += self.b.dimshuffle(*self.dimshuffle_args)
    return self.nonlinearity(activation)
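Since the layer only records ``self.bn_updates``, they still have to be applied when the training function is compiled. A hedged sketch of that collection step in Lasagne-style code; ``network``, ``x``, ``y``, ``cost``, and ``param_updates`` are assumed placeholders.

import lasagne

# After building `cost` from lasagne.layers.get_output(network, deterministic=False),
# gather the running-average updates recorded by each normalization layer.
bn_updates = []
for layer in lasagne.layers.get_all_layers(network):
    if hasattr(layer, 'bn_updates'):
        bn_updates.extend(layer.bn_updates)
train_fn = theano.function([x, y], cost, updates=param_updates + bn_updates)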
def adamax_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    # Adamax: Adam's second-moment estimate is replaced by an
    # exponentially weighted infinity norm (`mg`).
    updates = []
    grads = T.grad(cost, params)
    for p, g in zip(params, grads):
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        if mom1>0:
            v_t = mom1*v + (1. - mom1)*g
            updates.append((v,v_t))
        else:
            v_t = g
        mg_t = T.maximum(mom2*mg, abs(g))
        g_t = v_t / (mg_t + 1e-6)
        p_t = p - lr * g_t
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    return updates
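Because it returns the same kind of (shared, new_value) list, it is a drop-in alternative to ``adam_updates`` above; the same placeholder symbols are assumed.

# Same hypothetical wiring as for adam_updates, only the update rule differs.
train_fn = theano.function([x, y], cost, updates=adamax_updates(params, cost, lr=2e-3))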
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
    else:
        batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
        centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
        batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
        # BN updates
        new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
        new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1.), th.config.floatX)*batch_var
        self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
    if hasattr(self, 'g'):
        activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
    else:
        activation = norm_features
    if hasattr(self, 'b'):
        activation += self.b.dimshuffle(*self.dimshuffle_args)
    return self.nonlinearity(activation)
def forward(self, x, mask, hc):
    n_in, n_out, activation = self.n_in, self.n_out_t, self.activation
    # hc concatenates [cell, hidden]; split it for batched (2-D) or
    # single-example (1-D) states.
    if hc.ndim > 1:
        c_tm1 = hc[:, :n_out]
        h_tm1 = hc[:, n_out:]
    else:
        c_tm1 = hc[:n_out]
        h_tm1 = hc[n_out:]
    in_t = self.in_gate.forward(x,h_tm1)
    forget_t = self.forget_gate.forward(x,h_tm1)
    out_t = self.out_gate.forward(x, h_tm1)
    c_t = forget_t * c_tm1 + in_t * self.input_layer.forward(x,h_tm1)
    # Zero the cell and hidden state at padded positions (mask == 0).
    c_t = c_t * mask.dimshuffle(0, 'x')
    c_t = T.cast(c_t, 'float32')
    h_t = out_t * T.tanh(c_t)
    h_t = h_t * mask.dimshuffle(0, 'x')
    h_t = T.cast(h_t, 'float32')
    if hc.ndim > 1:
        return T.concatenate([ c_t, h_t ], axis=1)
    else:
        return T.concatenate([ c_t, h_t ])
def backward(self, x, mask, hc):
    n_in, n_out, activation = self.n_in, self.n_out_t, self.activation
    if hc.ndim > 1:
        c_tm1 = hc[:, :n_out]
        h_tm1 = hc[:, n_out:]
    else:
        c_tm1 = hc[:n_out]
        h_tm1 = hc[n_out:]
    in_t = self.in_gate_b.forward(x,h_tm1)
    forget_t = self.forget_gate_b.forward(x,h_tm1)
    out_t = self.out_gate_b.forward(x, h_tm1)
    c_t = forget_t * c_tm1 + in_t * self.input_layer_b.forward(x,h_tm1)
    c_t = c_t * mask.dimshuffle(0, 'x')
    c_t = T.cast(c_t, 'float32')
    h_t = out_t * T.tanh(c_t)
    h_t = h_t * mask.dimshuffle(0, 'x')
    h_t = T.cast(h_t, 'float32')
    if hc.ndim > 1:
        return T.concatenate([ c_t, h_t ], axis=1)
    else:
        return T.concatenate([ c_t, h_t ])
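Both step functions take their arguments in the order ``theano.scan`` expects: the per-timestep sequences (``x``, ``mask``) first, then the recurrent state ``hc``. A hedged sketch of driving ``forward`` over a padded batch; ``layer``, ``X`` (time x batch x n_in), and ``masks`` (time x batch) are assumed placeholders.

# Initial state concatenates the zero cell and hidden vectors: (batch, 2*n_out).
h0 = T.zeros((X.shape[1], 2 * layer.n_out_t), dtype='float32')
states, _ = theano.scan(
    fn=layer.forward,
    sequences=[X, masks],
    outputs_info=[h0],
)
h_seq = states[:, :, layer.n_out_t:]  # keep only the hidden half of [c_t, h_t]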
def get_parent_state(self, children_states, node_type, use_dropout: bool, iteration_number) -> tuple:
    layer_input = T.flatten(children_states)
    nn_out = self.__compute_layer_output(layer_input, node_type, use_dropout, iteration_number)

    encoder_input = T.flatten(T.concatenate((children_states, nn_out))) * self.__ae_noise
    encoding = T.tanh(T.dot(encoder_input, self.__encoder_weights[node_type]))
    decoded = T.tanh(T.dot(encoding, self.__decoder_weights))
    decoded /= decoded.norm(2) / layer_input.norm(2)

    output_reconstruction = self.__compute_layer_output(decoded, node_type, use_dropout, iteration_number)
    reconstruction_cos = T.dot(nn_out[0], output_reconstruction[0])
    children_reconstruction_cos = T.dot(decoded, layer_input)
    additional_objective = reconstruction_cos + children_reconstruction_cos

    constrain_usage_pct = T.cast(1. - T.pow(self.__hyperparameters['constrain_intro_rate'], iteration_number),
                                 theano.config.floatX)
    return nn_out[0], constrain_usage_pct * additional_objective
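The returned objective is scaled by constrain_usage_pct = 1 - rate**iteration, which ramps the reconstruction penalty in gradually: it is 0 at iteration 0 and approaches 1 over training. A quick numeric illustration of the schedule; the 0.9999 rate is an assumed example value, not taken from the code above.

rate = 0.9999  # hypothetical 'constrain_intro_rate'
for it in (0, 1000, 10000, 100000):
    print(it, 1. - rate ** it)   # 0.0, ~0.095, ~0.632, ~0.99995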
def conv2d_grad(topgrad, output_shape, filters, border_mode, strides):
    if (border_mode==BorderMode.same):
        # 'half' kernel width padding results in outputs of the same
        # dimensions as the input
        border_mode=BorderMode.half
        assert filters.shape[2]%2 == 1 and filters.shape[3]%2 == 1,\
            "haven't handled even filter shapes for border mode 'half'"
    op = T.nnet.abstract_conv.AbstractConv2d_gradInputs(
        imshp=output_shape,
        kshp=filters.shape,
        subsample=strides,
        border_mode=border_mode,
        filter_flip=True)
    topgrad = T.cast(topgrad, dtype=theano.config.floatX)
    belowgrad = op(kern=filters, topgrad=topgrad, shape=output_shape[2:])
    return belowgrad
from theano.ifelse import ifelse  # required for the conditional updates below

def adam_conditional_updates(params, cost, mincost, lr=0.001, mom1=0.9, mom2=0.999):
    # If cost is already below mincost, leave every variable unchanged.
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for p, g in zip(params, grads):
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v_t = mom1*v + (1. - mom1)*g
        mg_t = mom2*mg + (1. - mom2)*T.square(g)
        v_hat = v_t / (1. - mom1 ** t)
        mg_hat = mg_t / (1. - mom2 ** t)
        g_t = v_hat / T.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append((v, ifelse(cost<mincost, v, v_t)))
        updates.append((mg, ifelse(cost<mincost, mg, mg_t)))
        updates.append((p, ifelse(cost<mincost, p, p_t)))
    updates.append((t, ifelse(cost<mincost, t, t+1)))
    return updates
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
    else:
        batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
        centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
        batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
        # BN updates
        new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
        new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
        self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
    if hasattr(self, 'g'):
        activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
    else:
        activation = norm_features
    if hasattr(self, 'b'):
        activation += self.b.dimshuffle(*self.dimshuffle_args)
    if self.nonlinearity is not None:
        return self.nonlinearity(activation)
    else:
        return activation
def shared_dataset(data_xy, borrow=True):
    """ Function that loads the dataset into shared variables

    The reason we store our dataset in shared variables is to allow
    Theano to copy it into GPU memory (when the code is run on a GPU).
    Since copying data onto the GPU is slow, copying a minibatch every
    time one is needed (the default behaviour if the data is not in a
    shared variable) would lead to a large decrease in performance.
    """
    data_x, data_y = data_xy
    shared_x = theano.shared(np.asarray(data_x,
                                        dtype=theano.config.floatX),
                             borrow=borrow)
    shared_y = theano.shared(np.asarray(data_y,
                                        dtype=theano.config.floatX),
                             borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')
def ctc_update_log_p(skip_idxs, zeros, active, log_p_curr, log_p_prev):
    active_skip_idxs = skip_idxs[(skip_idxs < active).nonzero()]
    active_next = T.cast(T.minimum(
        T.maximum(
            active + 1,
            T.max(T.concatenate([active_skip_idxs, [-1]])) + 2 + 1
        ), log_p_curr.shape[0]), 'int32')

    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)
    _p_prev = zeros[:active_next]
    # copy over
    _p_prev = T.set_subtensor(_p_prev[:active], p_prev)
    # previous transitions
    _p_prev = T.inc_subtensor(_p_prev[1:], _p_prev[:-1])
    # skip transitions
    _p_prev = T.inc_subtensor(_p_prev[active_skip_idxs + 2], p_prev[active_skip_idxs])
    updated_log_p_prev = T.log(_p_prev) + common_factor

    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev
    )
    return active_next, log_p_next
def _buildModel(self):
    self.updates_ack = True
    X = T.matrix('X', dtype=config.floatX)
    Y = T.matrix('Y', dtype=config.floatX)
    X.tag.test_value, Y.tag.test_value = self._fakeData()
    #output_params_t = T.nnet.sigmoid(self._LinearNL(self.tWeights['W_lr'], self.tWeights['b_lr'], X, onlyLinear=True))
    output_params_t = T.nnet.sigmoid(self._BNlayer(self.tWeights['W_lr'], self.tWeights['b_lr'], X, validation=False, onlyLinear=True))
    nll_t = T.nnet.binary_crossentropy(output_params_t, Y).sum()
    #output_params_e = T.nnet.sigmoid(self._LinearNL(self.tWeights['W_lr'], self.tWeights['b_lr'], X, onlyLinear=True))
    output_params_e = T.nnet.sigmoid(self._BNlayer(self.tWeights['W_lr'], self.tWeights['b_lr'], X, validation=True, onlyLinear=True))
    nll_e = T.nnet.binary_crossentropy(output_params_e, Y).sum()
    if not self.params['validate_only']:
        model_params = self._getModelParams()
        print len(self.updates), ' extraneous updates'
        optimizer_up, norm_list = self._setupOptimizer(nll_t,
                                                       model_params,
                                                       lr=self.params['lr'],
                                                       divide_grad=T.cast(X.shape[0], config.floatX))
        optimizer_up += self.updates
        self.train = theano.function([X, Y], [nll_t, self.tWeights['_lr_BN_running_mean'], self.tWeights['_lr_BN_running_var']], updates=optimizer_up)
    self.evaluate = theano.function([X, Y], nll_e)
def softmax_and_sample(logits):
    old_shape = logits.shape
    flattened_logits = logits.reshape((-1, logits.shape[logits.ndim-1]))
    samples = T.cast(
        srng.multinomial(pvals=T.nnet.softmax(flattened_logits)),
        theano.config.floatX
    ).reshape(old_shape)
    return T.argmax(samples, axis=samples.ndim-1)
# TODO: Have a look at this benchmark:
# https://github.com/MaximumEntropy/cudnn_rnn_theano_benchmarks
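In the softmax_and_sample snippet above, ``srng`` is not defined; it is presumably a module-level random stream. A hedged sketch of the missing setup and a call, with the (batch, time, vocab) shape purely illustrative.

# Assumed setup: MRG_RandomStreams is the usual GPU-friendly choice in Theano.
from theano.sandbox.rng_mrg import MRG_RandomStreams
srng = MRG_RandomStreams(seed=42)

logits = T.tensor3('logits')           # e.g. (batch, time, vocab)
sampled = softmax_and_sample(logits)   # integer indices of shape (batch, time)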
def centered_softplus(x):
    return T.nnet.softplus(x) - np.cast[th.config.floatX](np.log(2.))
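The shift exists because softplus(0) = log 2, so subtracting log 2 makes the activation pass through the origin. A quick numpy sanity check (illustrative only):

import numpy as np

print(np.log1p(np.exp(0.0)))              # softplus(0) = 0.6931... = log(2)
print(np.log1p(np.exp(0.0)) - np.log(2.)) # 0.0 after centering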