def __init__(self, rng, W=None, m=1.0, n_samples=50, shape=None, batch_size=1000):
if W is None:
W = numpy.asarray(rng.uniform(
low=-numpy.sqrt(6. / (shape[0] + shape[1])),
high=numpy.sqrt(6. / (shape[0] + shape[1])),
size=(shape[0], shape[1])), dtype=theano.config.floatX)
self.W = theano.shared(value=W, name='Hashtag_emb', borrow=True)
self.batch_size = batch_size
self.n_ht = W.shape[0]
self.m = m
self.n_samples = n_samples
self.csrng = CURAND_RandomStreams(123)
    mask = self.csrng.uniform(size=(self.n_samples, 1), low=0.0, high=1.0, dtype=theano.config.floatX)
    self.rfun = theano.function([], mask.argsort(axis=0))
    self.alpha = T.constant(1.0 / numpy.arange(start=1, stop=self.n_ht + 1, step=1))
self.weights = [self.W]
self.biases = []
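# Hedged note on the constructor above: `rfun` compiles to a function that returns
# a fresh random permutation of the n_samples indices on every call (argsort of
# i.i.d. uniform draws), e.g.
#
#     perm = self.rfun()   # shape (n_samples, 1), a permutation of 0..n_samples-1
#
# and `alpha` holds the 1/rank weights 1, 1/2, ..., 1/n_ht as a theano constant.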
def get_updates_rmsprop(self, cost, params, rho=0.9, eps=1e-8):
lr = self.lr
print(' - RMSprop: lr = %.2e' % (lr.get_value(borrow=True)))
one = T.constant(1.)
grads = T.grad(cost=cost, wrt=params)
updates = []
for p, g in zip(params, grads):
value = p.get_value(borrow=True)
accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=p.broadcastable)
accu_new = rho * accu + (one - rho) * g ** 2
gradient_scaling = T.sqrt(accu_new + eps)
g = g / gradient_scaling
updates.append((accu, accu_new))
updates.append((p, p - lr * g))
return updates
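# Hedged usage sketch for the method above (assumes an optimizer instance `opt`
# exposing a shared-variable learning rate `lr`; `cost` is a scalar expression
# over `params`, a list of theano shared variables; `x`, `y`, and the batches
# are hypothetical):
#
#     updates = opt.get_updates_rmsprop(cost, params)
#     train_fn = theano.function([x, y], cost, updates=updates)
#     train_fn(x_batch, y_batch)   # one RMSprop step on all params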
def careful_rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9, epsilon=1e-6, grad_clipping=1.0e-2):
"""
RMSProp with gradient clipping.
    :param grad_clipping: maximal norm of the gradient; if the norm of the actual gradient exceeds this value, it is rescaled.
:return: updates
"""
grads = get_or_compute_grads(loss_or_grads, params)
updates = OrderedDict()
grads = total_norm_constraint(grads, max_norm=grad_clipping, epsilon=epsilon)
# Using theano constant to prevent upcasting of float32
one = T.constant(1)
for param, grad in zip(params, grads):
value = param.get_value(borrow=True)
accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=param.broadcastable)
accu_new = rho * accu + (one - rho) * grad ** 2
updates[accu] = accu_new
updates[param] = param - (learning_rate * grad /
T.sqrt(accu_new + epsilon))
return updates
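# Minimal sketch of driving careful_rmsprop on a toy softmax model. It assumes
# this module already imports numpy as np, theano, theano.tensor as T, and the
# Lasagne-style helpers (get_or_compute_grads, total_norm_constraint, OrderedDict)
# used by the function above; the model itself is illustrative only.
def _careful_rmsprop_example():
    x = T.matrix('x')
    y = T.ivector('y')
    W = theano.shared(np.zeros((20, 5), dtype=theano.config.floatX), name='W')
    b = theano.shared(np.zeros(5, dtype=theano.config.floatX), name='b')
    p_y = T.nnet.softmax(T.dot(x, W) + b)
    loss = -T.mean(T.log(p_y)[T.arange(y.shape[0]), y])
    updates = careful_rmsprop(loss, [W, b], learning_rate=1e-2, grad_clipping=1.0)
    # Compile one training step; gradients are norm-clipped before the RMSProp update.
    return theano.function([x, y], loss, updates=updates)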
def adadelta(loss, params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
grad_shared_flat, flat_grad, unflat_grads = flat_unflat_grads(loss, params)
grad_updates = [(grad_shared_flat, flat_grad)]
one = T.constant(1)
param_updates = list()
for p, g in zip(params, unflat_grads):
value = p.get_value(borrow=True)
accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=p.broadcastable)
delta_accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=p.broadcastable)
accu_new = rho * accu + (one - rho) * g ** 2
update = g * T.sqrt(delta_accu + epsilon) / T.sqrt(accu_new + epsilon)
delta_accu_new = rho * delta_accu + (one - rho) * update ** 2
param_updates += [(accu, accu_new)]
param_updates += [(p, p - learning_rate * update)]
param_updates += [(delta_accu, delta_accu_new)]
return grad_updates, param_updates, grad_shared_flat
def adam(loss, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
grad_shared_flat, flat_grad, unflat_grads = flat_unflat_grads(loss, params)
grad_updates = [(grad_shared_flat, flat_grad)]
t_prev = theano.shared(np.array(0, dtype=theano.config.floatX))
one = T.constant(1)
t = t_prev + one
a_t = learning_rate * T.sqrt(one - beta2 ** t) / (one - beta1 ** t)
param_updates = list()
for p, g in zip(params, unflat_grads):
value = p.get_value(borrow=True)
m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=p.broadcastable)
v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=p.broadcastable)
m_t = beta1 * m_prev + (one - beta1) * g
v_t = beta2 * v_prev + (one - beta2) * g ** 2
step = a_t * m_t / (T.sqrt(v_t) + epsilon)
param_updates += [(m_prev, m_t), (v_prev, v_t), (p, p - step)]
param_updates += [(t_prev, t)]
return grad_updates, param_updates, grad_shared_flat
def adamax(loss, params, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8):
grad_shared_flat, flat_grad, unflat_grads = flat_unflat_grads(loss, params)
grad_updates = [(grad_shared_flat, flat_grad)]
t_prev = theano.shared(np.array(0, dtype=theano.config.floatX))
one = T.constant(1)
t = t_prev + one
a_t = learning_rate / (one - beta1 ** t)
param_updates = list()
for p, g in zip(params, unflat_grads):
value = p.get_value(borrow=True)
m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=p.broadcastable)
u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=p.broadcastable)
m_t = beta1 * m_prev + (one - beta1) * g
u_t = T.maximum(beta2 * u_prev, abs(g))
step = a_t * m_t / (u_t + epsilon)
param_updates += [(m_prev, m_t), (u_prev, u_t), (p, p - step)]
param_updates += [(t_prev, t)]
return grad_updates, param_updates, grad_shared_flat
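# Hedged usage sketch for the flat-gradient optimizers above (adadelta, adam,
# adamax): each returns gradient-caching updates, parameter updates, and the
# flat shared gradient, so a step is typically split into two compiled phases.
# It assumes the helper flat_unflat_grads used above is defined in this module;
# `x`, `y`, `loss`, `params`, and the batches are hypothetical.
#
#     grad_updates, param_updates, grad_flat = adam(loss, params)
#     compute_grads = theano.function([x, y], loss, updates=grad_updates)
#     apply_step = theano.function([], updates=param_updates)
#     loss_val = compute_grads(x_batch, y_batch)   # phase 1: cache the flat gradient
#     apply_step()                                 # phase 2: Adam parameter step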
def build_model(model_):
global fn_predict, fn_record
global g_ozer, g_mdl
g_ozer = dict(simple=VanillaSGD, adam=AdamSGD)[OZER]()
g_ozer.lr = LEARN_RATE
s_x = T.tensor4('x')
s_y = T.ivector('y')
s_pdpo = T.scalar()
s_out = model_(s_x, s_pdpo)
s_y_onehot = T.extra_ops.to_one_hot(s_y, len(g_dataset.label_map))
s_loss = T.mean(-s_y_onehot*T.log(s_out + 1e-3))
s_accr = T.mean( T.switch(
T.eq(T.argmax(s_out, axis=1), T.argmax(s_y_onehot, axis=1)), 1, 0))
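    # At prediction time, `givens` substitutes a constant 0.0 for the dropout
    # probability s_pdpo, so the same graph is compiled without dropout.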
no_dropout = [(s_pdpo, T.constant(0., dtype=th.config.floatX))]
fn_predict = th.function(
[s_x, s_y],
{'pred':s_out, 'accr':s_accr, 'loss':s_loss},
givens=no_dropout, profile=PROFILE)
rec_fetches = {
'x': s_x, 'y': s_y,
'pred': s_out}
rec_fetches.update(g_mdl.params_di)
fn_record = th.function(
[s_x, s_y], rec_fetches, givens=no_dropout, profile=PROFILE)
g_ozer.compile(
[s_x, s_y],
s_loss,
g_mdl.params_di.values(),
fetches_={'pred': s_out, 'loss': s_loss, 'accr': s_accr},
givens_=[(s_pdpo, T.constant(TRAIN_PDPO, dtype=th.config.floatX))],
profile_=PROFILE)
def get_updates(self, learning_rate, params, grads, lr_scalers):
"""Compute the parameters' updates.
"""
t_prev = theano.shared(floatX(0.))
updates = OrderedDict()
# Using theano constant to prevent upcasting of float32
one = T.constant(1)
t = t_prev + 1
a_t = learning_rate*T.sqrt(one-self.beta2**t)/(one-self.beta1**t)
for param, g_t in zip(params, grads):
value = param.get_value(borrow=True)
m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=param.broadcastable)
v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=param.broadcastable)
m_t = self.beta1*m_prev + (one-self.beta1)*g_t
v_t = self.beta2*v_prev + (one-self.beta2)*g_t**2
step = a_t*m_t/(T.sqrt(v_t) + self.epsilon)
updates[m_prev] = m_t
updates[v_prev] = v_t
new_param = param - step
if self.max_colm_norm and param.name in ["W", "w"]:
new_param_final = norm_constraint(tensor_var=new_param,
max_norm=self.max_norm)
else:
new_param_final = new_param
updates[param] = new_param_final
updates[t_prev] = t
return updates
def get_updates(self, learning_rate, params, grads, lr_scalers):
"""Compute the parameters' updates.
"""
t_prev = theano.shared(floatX(0.))
updates = OrderedDict()
# Using theano constant to prevent upcasting of float32
one = T.constant(1)
t = t_prev + 1
a_t = learning_rate/(one-self.beta1**t)
for param, g_t in zip(params, grads):
value = param.get_value(borrow=True)
m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=param.broadcastable)
u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=param.broadcastable)
m_t = self.beta1*m_prev + (one-self.beta1)*g_t
u_t = T.maximum(self.beta2*u_prev, abs(g_t))
step = a_t*m_t/(u_t + self.epsilon)
updates[m_prev] = m_t
updates[u_prev] = u_t
new_param = param - step
if self.max_colm_norm and param.name in ["W", "w"]:
new_param_final = norm_constraint(tensor_var=new_param,
max_norm=self.max_norm)
else:
new_param_final = new_param
updates[param] = new_param_final
updates[t_prev] = t
return updates
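# Note on the two get_updates methods above: they implement Adam and Adamax
# steps respectively. When max_colm_norm is set, the updated "W"/"w" matrices are
# additionally passed through norm_constraint, which (in its Lasagne-style form)
# rescales them so their per-column norms stay below max_norm.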
def dropout_from_layer(rng, layer_output, p):
"""
    p: float. The probability of dropping a unit.
"""
srng = theano.tensor.shared_randomstreams.RandomStreams(
rng.randint(99999))
one = T.constant(1)
retain_prob = one - p
mask = srng.binomial(n=1, p=retain_prob, size=layer_output.shape,
dtype=layer_output.dtype)
output = layer_output * mask
return output
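# Hedged sketch: applying dropout_from_layer to an activation tensor. With
# p=0.5 roughly half the units are zeroed on each call; the layers below
# optionally rescale their input by 1/(1-p) so expected activations match at
# test time. Assumes numpy as np, theano, and theano.tensor as T are imported
# in this module.
def _dropout_example():
    rng = np.random.RandomState(1234)
    h = T.matrix('h')                                  # some hidden-layer output
    h_dropped = dropout_from_layer(rng, h, p=0.5)
    f = theano.function([h], h_dropped)
    return f(np.ones((2, 4), dtype=theano.config.floatX))   # entries are 0. or 1.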
def __init__(self, rng, input, dropout_rate, rescale):
"""
    rescale: Boolean. If True, rescale the input by 1 / (1 - dropout_rate); only meaningful when applying dropout.
"""
if rescale:
one = T.constant(1)
retain_prob = one - dropout_rate
input /= retain_prob
super(DropoutIdentityHiddenLayer, self).__init__(rng=rng, input=input)
if dropout_rate > 0.:
self.output = dropout_from_layer(rng, self.output, p=dropout_rate)
def __init__(self, rng, input, n_in, n_out, dropout_rate, rescale,
W=None, b=None, b_v=0., activation=None):
"""
    rescale: Boolean. If True, rescale the input by 1 / (1 - dropout_rate); only meaningful when applying dropout.
"""
if rescale:
one = T.constant(1)
retain_prob = one - dropout_rate
input /= retain_prob
super(DropoutHiddenLayer, self).__init__(
input=input, n_in=n_in, n_out=n_out, W=W, b=b,
activation=activation, rng=rng)
if dropout_rate > 0.:
self.output = dropout_from_layer(rng, self.output, p=dropout_rate)
def step_infer(self, *params):
model = self.model
params = list(params)
rs = params[:model.n_layers]
qs = params[model.n_layers:2*model.n_layers]
y = params[2*model.n_layers]
params = params[1+2*model.n_layers:]
prior_params = model.get_prior_params(*params)
hs = []
new_qs = []
for l, (q, r) in enumerate(zip(qs, rs)):
h = (r <= q[None, :, :]).astype(floatX)
hs.append(h)
ys = [y[None, :, :]] + hs[:-1]
p_ys = [model.p_y_given_h(h, l, *params) for l, h in enumerate(hs)]
log_ph = -model.prior.step_neg_log_prob(hs[-1], *prior_params)
log_py_h = T.constant(0.).astype(floatX)
log_qh = T.constant(0.).astype(floatX)
for l in xrange(model.n_layers):
log_py_h += -model.conditionals[l].neg_log_prob(ys[l], p_ys[l])
log_qh += -model.posteriors[l].neg_log_prob(hs[l], qs[l][None, :, :])
log_p = log_py_h + log_ph - log_qh
w_tilde = get_w_tilde(log_p)
cost = -log_p.mean()
for q, h in zip(qs, hs):
q_ = (w_tilde[:, :, None] * h).sum(axis=0)
new_qs.append(self.inference_rate * q_ + (1 - self.inference_rate) * q)
return tuple(new_qs) + (cost,)
def params_infer(self):
return [T.constant(self.momentum).astype(floatX)]
def entropy(self):
return T.constant(0.).astype(floatX)
def get_L2_weight_cost(self, gamma, layers=None):
if layers is None:
layers = range(self.n_layers)
cost = T.constant(0.).astype(floatX)
for l in layers:
W = self.__dict__['W%d' % l]
cost += gamma * (W ** 2).sum()
return cost
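# Hedged usage sketch (assumes a model instance exposing n_layers and weight
# attributes W0, W1, ... as the method above expects; nll_cost is hypothetical):
#
#     cost = nll_cost + model.get_L2_weight_cost(gamma=1e-4)           # all layers
#     cost = nll_cost + model.get_L2_weight_cost(1e-4, layers=[0, 1])  # first two only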
def __init__(self, ntimes=False, n=TT.constant(0)):
"""
:type ntimes: bool
:param ntimes: If the last state needs to be repeated `n` times
:type n: int, theano constant, None
:param n: how many times the last state is repeated
"""
self.ntimes = ntimes
self.n = n
super(LastState, self).__init__(0, 0, None)
def const(value):
return TT.constant(numpy.asarray(value, dtype=theano.config.floatX))
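# Hedged note: const() wraps a Python/numpy scalar as a theano constant with
# dtype theano.config.floatX, e.g. const(0.5), which avoids accidental float64
# upcasting when the value is combined with float32 tensors.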