def dot_2d(k, M, b=None, g=None):
# k: (nb_samples, memory_width)
# M: (nb_samples, memory_dim, memory_width)
# norms of keys and memories
# k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5 # (nb_samples,)
# M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5 # (nb_samples, memory_dim,)
k = k[:, None, :] # (nb_samples, 1, memory_width)
value = k * M
if b is not None:
b = b[:, None, :]
        value *= b  # value: (nb_samples, memory_dim, memory_width)
if g is not None:
g = g[None, None, :]
value *= g
sim = T.sum(value, axis=2)
return sim
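A hedged usage sketch (variable names below are illustrative; it assumes `theano`, `theano.tensor as T`, and `numpy as np`, as the snippet itself does): compile `dot_2d` and check that it returns one similarity per memory slot.

import numpy as np
import theano
import theano.tensor as T

k_var = T.matrix('k')      # (nb_samples, memory_width)
M_var = T.tensor3('M')     # (nb_samples, memory_dim, memory_width)
sim_fn = theano.function([k_var, M_var], dot_2d(k_var, M_var))

k_val = np.random.randn(4, 8).astype(theano.config.floatX)
M_val = np.random.randn(4, 6, 8).astype(theano.config.floatX)
print(sim_fn(k_val, M_val).shape)   # (4, 6): one dot product per memory slot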
def op_cosine_c(
s_xr_, s_xi_, s_yr_, s_yi_, axis_=-1, keepdims_=True, eps_=1e-7):
'''
cosine between two complex vectors, uses standard complex inner product
Args:
s_xr_: real part of x
s_xi_: imag part of x
s_yr_: real part of y
s_yi_: imag part of y
eps_: small number to prevent divide by zero
'''
s_nrm = s_xr_*s_yr_ + s_xi_*s_yi_
s_nx = T.sum(T.sqr(s_xr_) + T.sqr(s_xi_), axis=axis_, keepdims=keepdims_)
s_ny = T.sum(T.sqr(s_yr_) + T.sqr(s_yi_), axis=axis_, keepdims=keepdims_)
return T.sum(s_nrm, axis=axis_, keepdims=keepdims_) / T.sqrt(s_nx * s_ny + eps_)
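A quick numeric sanity check, written as an illustrative sketch (assumes `theano`, `T`, `np`): the result should match the real part of the usual complex cosine similarity computed with NumPy, up to the `eps_` term in the denominator.

import numpy as np
import theano
import theano.tensor as T

xr, xi, yr, yi = T.vector('xr'), T.vector('xi'), T.vector('yr'), T.vector('yi')
cos_fn = theano.function(
    [xr, xi, yr, yi],
    op_cosine_c(xr, xi, yr, yi, axis_=-1, keepdims_=False))

a = np.random.randn(16) + 1j * np.random.randn(16)
b = np.random.randn(16) + 1j * np.random.randn(16)
ref = np.real(np.vdot(b, a)) / (np.linalg.norm(a) * np.linalg.norm(b))
out = cos_fn(*[v.astype(theano.config.floatX)
               for v in (a.real, a.imag, b.real, b.imag)])
print(out, ref)   # the two values should agree closely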
def op_ortho_loss(s_x_, axes_=(-2, -1), ndim_=None):
    '''
    orthogonal matrix loss,
    used to regularize a parameter toward an orthogonal/unitary matrix
    Args:
        s_x_: (batch of) matrices
        axes_: tuple of two integers specifying which axes form the matrix,
            defaults to the last two axes
        ndim_: optionally give the matrix size explicitly, i.e. the matrices are (ndim_ x ndim_)
    '''
if ndim_ is None:
ax = axes_[0]
ndim = T.shape(s_x_)[ax]
else:
ndim = ndim_
    tpat = list(range(s_x_.ndim))  # transpose pattern over the tensor's axes (not the matrix size)
bpat = ['x'] * s_x_.ndim
tpat[axes_[0]], tpat[axes_[1]] = tpat[axes_[1]], tpat[axes_[0]]
bpat[axes_[0]] = 0
bpat[axes_[1]] = 1
s_y = T.dot(s_x_.transpose(*tpat), s_x_)
return T.sqr(s_y - T.eye(ndim).dimshuffle(*bpat))
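An illustrative usage sketch (assumes `theano`, `T`, `np`; the 5x5 size and the `.sum()` reduction are my choices, not the source's): for an orthogonal input such as the identity, the loss is essentially zero.

import numpy as np
import theano
import theano.tensor as T

s_x = T.matrix('x')
loss_fn = theano.function([s_x], op_ortho_loss(s_x, ndim_=5).sum())

eye = np.eye(5, dtype=theano.config.floatX)
rnd = np.random.randn(5, 5).astype(theano.config.floatX)
print(loss_fn(eye))   # ~0.0, since eye.T @ eye == eye
print(loss_fn(rnd))   # > 0 for a generic (non-orthogonal) matrix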
def create_esgd_updates(updates, params, gparams, gsums, xsums, lr, eps, gamma, momentum):
has_momentum = momentum.get_value() > 0.0
samples = [ default_mrng.normal(size=p.shape, avg=0, std=1,
dtype=theano.config.floatX) for p in params ]
HVs = T.Lop(gparams, params, samples)
i = theano.shared(np.float64(0.0).astype(theano.config.floatX))
i_t = i + 1.0
omg_t = 1.0 - gamma**i_t
for p, g, m, D, Hv in zip(params, gparams, gsums, xsums, HVs):
if is_subtensor_op(p):
raise Exception("ESGD subtensor update not implemented!")
else:
D_t = D * gamma + T.sqr(Hv) * (1.0-gamma)
if has_momentum:
m_t = m*momentum + g
updates[m] = m_t
else:
m_t = g
g_t = m_t / ( T.sqrt(D_t/omg_t + eps) )
#g_t = m_t / ( T.sqrt(D_t + eps) )
updates[D] = D_t
updates[p] = p - lr*g_t
updates[i] = i_t
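`create_esgd_updates` implements equilibrated SGD: `T.Lop(gparams, params, samples)` gives Hessian-vector products H·v for Gaussian v, and the running average of `(Hv)^2` estimates the squared diagonal preconditioner. A self-contained toy sketch of that L-operator trick (the quadratic loss and matrix A are mine, purely illustrative):

import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX
A = np.array([[3., 1.], [1., 2.]], dtype=floatX)
x = theano.shared(np.array([1., -1.], dtype=floatX), name='x')

loss = 0.5 * T.dot(x, T.dot(T.constant(A), x))   # Hessian of this loss is exactly A
g = T.grad(loss, x)
v = T.vector('v')
Hv = T.Lop(g, x, v)                              # same call pattern as HVs above
hv_fn = theano.function([v], Hv)
print(hv_fn(np.array([1., 0.], dtype=floatX)))   # ~[3., 1.], the first column of A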
def Adam(cost, params, learning_rate=0.0002, b1=0.1, b2=0.001, e=1e-8):
updates = OrderedDict()
grads = T.grad(cost, params)
i = theano.shared(np.asarray(0., dtype=theano.config.floatX))
i_t = i + 1.
fix1 = 1. - (1. - b1)**i_t
fix2 = 1. - (1. - b2)**i_t
lr_t = learning_rate * (T.sqrt(fix2) / fix1)
for p, g in zip(params, grads):
m = theano.shared(p.get_value() * 0.)
v = theano.shared(p.get_value() * 0.)
m_t = (b1 * g) + ((1. - b1) * m)
v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
g_t = m_t / (T.sqrt(v_t) + e)
p_t = p - (lr_t * g_t)
updates[m] = m_t
updates[v] = v_t
updates[p] = p_t
updates[i] = i_t
return updates
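A hedged usage sketch for the `Adam` helper above (it assumes the module-level imports the snippet relies on: `OrderedDict`, `numpy as np`, `theano`, `T`). Note that `b1=0.1` and `b2=0.001` play the role of `1 - beta1` and `1 - beta2` from the Adam paper, since the running averages are multiplied by `(1. - b1)` and `(1. - b2)`. The returned update dictionary plugs straight into `theano.function`:

import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')
x = T.matrix('x')
y = T.vector('y')
p = T.nnet.sigmoid(T.dot(x, w))
cost = T.nnet.binary_crossentropy(p, y).mean()

train = theano.function([x, y], cost, updates=Adam(cost, [w]))

X = np.random.randn(256, 3).astype(theano.config.floatX)
Y = (X[:, 0] > 0).astype(theano.config.floatX)
for _ in range(500):
    c = train(X, Y)
print(c)   # should have drifted below the initial ~0.693 (= ln 2)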
def adam(cost, params, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
updates = []
grads = T.grad(cost, params)
i = theano.shared(np.dtype(theano.config.floatX).type(1))
i_t = i + 1.
fix1 = 1. - (1. - b1)**i_t
fix2 = 1. - (1. - b2)**i_t
lr_t = lr * (T.sqrt(fix2) / fix1)
for p, g in zip(params, grads):
        g = T.clip(g, -grad_clip, grad_clip)  # NOTE: grad_clip is not a parameter here; it must be defined at module scope
m = theano.shared(p.get_value() * 0.)
v = theano.shared(p.get_value() * 0.)
m_t = (b1 * g) + ((1. - b1) * m)
v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
g_t = m_t / (T.sqrt(v_t) + e)
p_t = p - (lr_t * g_t)
updates.append((m, m_t))
updates.append((v, v_t))
updates.append((p, p_t))
updates.append((i, i_t))
return updates
def adam(cost, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, **kwargs):
"""Adam Gradient Descent
Scale learning rates by Adaptive moment estimation
References
----------
.. [1] https://arxiv.org/pdf/1412.6980v8.pdf
"""
gparams = T.grad(cost, params)
updates = OrderedDict()
t = shared_variable(to_float_X(0.))
t_t = 1. + t
l_r_t = learning_rate * T.sqrt(1. - beta2 ** t_t) / (1. - beta1 ** t_t)
for param, gparam in zip(params, gparams):
m = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)
v = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)
m_t = beta1 * m + (1. - beta1) * gparam
v_t = beta2 * v + (1. - beta2) * T.sqr(gparam)
updates[m] = m_t
updates[v] = v_t
updates[param] = param - l_r_t * m_t / (T.sqrt(v_t) + epsilon)
updates[t] = t_t
return updates
def __call__(self, c01b):
"""
.. todo::
WRITEME
"""
half = self.n // 2
sq = T.sqr(c01b)
ch, r, c, b = c01b.shape
extra_channels = T.alloc(0., ch + 2*half, r, c, b)
sq = T.set_subtensor(extra_channels[half:half+ch,:,:,:], sq)
scale = self.k
for i in xrange(self.n):
scale += self.alpha * sq[i:i+ch,:,:,:]
scale = scale ** self.beta
return c01b / scale
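This is the pylearn2-style cross-channel Local Response Normalization over a `c01b` (channels, rows, columns, batch) tensor. A standalone sketch of the same computation as a free function (the hyper-parameter values are illustrative defaults, not taken from the source):

import numpy as np
import theano
import theano.tensor as T

def lrn_c01b(c01b, n=5, k=2., alpha=1e-4, beta=0.75):
    # identical arithmetic to __call__ above, with the hyper-parameters as arguments
    half = n // 2
    sq = T.sqr(c01b)
    ch, r, c, b = c01b.shape
    extra = T.alloc(0., ch + 2 * half, r, c, b)
    sq = T.set_subtensor(extra[half:half + ch, :, :, :], sq)
    scale = k
    for i in range(n):
        scale += alpha * sq[i:i + ch, :, :, :]
    return c01b / scale ** beta

x = T.tensor4('c01b')
f = theano.function([x], lrn_c01b(x))
out = f(np.random.randn(16, 8, 8, 4).astype(theano.config.floatX))
print(out.shape)   # (16, 8, 8, 4): shape is preserved, channels locally normalized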
Source: rcnn_class.py, from the project Recurrent-Convolutional-Neural-Network by monisjaved.
def get_cost(self, X, Y, X_sizes):
"""
Calculates cost for each values in mini batch, also
regularizes all the input parameters and then returns
final cost function as theano variable
"""
cost_fn, _ = theano.scan(
fn=self.get_likelihood,
sequences=[X, Y, X_sizes]
)
cost_fn = cost_fn.mean()
cost_fn += self.reg_lambda * T.sqr(self.W_c_r).sum() / 2.
cost_fn += self.reg_lambda * T.sqr(self.W_c_l).sum() / 2.
cost_fn += self.reg_lambda * T.sqr(self.W_conv).sum() / 2.
cost_fn += self.reg_lambda * T.sqr(self.W_output).sum() / 2.
cost_fn += self.reg_lambda * T.sqr(self.b_output).sum() / 2.
# Regularizing word embedding
cost_fn += self.reg_lambda * T.sqr(self.vector_dict).sum() / 2
return cost_fn
def define_loss(self):
        # Inverse since those that have a smaller distance are the most probable.
        score = TT.sum(self.e1[self.rows, :] * self.r1[self.cols, :] * self.e1[self.tubes, :], 1) \
            + TT.sum(self.e2[self.rows, :] * self.r1[self.cols, :] * self.e2[self.tubes, :], 1) \
            + TT.sum(self.e1[self.rows, :] * self.r2[self.cols, :] * self.e2[self.tubes, :], 1) \
            - TT.sum(self.e2[self.rows, :] * self.r2[self.cols, :] * self.e1[self.tubes, :], 1)
        self.pred_func = TT.nnet.sigmoid(score)
        self.loss = TT.nnet.softplus(-self.ys * score).mean()
        self.regul_func = TT.sqr(self.e1[self.rows, :]).mean() \
            + TT.sqr(self.e2[self.rows, :]).mean() \
            + TT.sqr(self.e1[self.tubes, :]).mean() \
            + TT.sqr(self.e2[self.tubes, :]).mean() \
            + TT.sqr(self.r1[self.cols, :]).mean() \
            + TT.sqr(self.r2[self.cols, :]).mean()
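The four-term score is exactly the ComplEx-style trilinear form Re⟨r, e_s, conj(e_o)⟩, with `(e1, e2)` and `(r1, r2)` the real and imaginary parts of the entity and relation embeddings. A NumPy check of that identity (illustrative, not part of the source):

import numpy as np

rng = np.random.RandomState(0)
e1s, e2s = rng.randn(10), rng.randn(10)   # real/imag parts of the subject embedding
e1o, e2o = rng.randn(10), rng.randn(10)   # real/imag parts of the object embedding
r1, r2 = rng.randn(10), rng.randn(10)     # real/imag parts of the relation embedding

score_terms = (np.sum(e1s * r1 * e1o) + np.sum(e2s * r1 * e2o)
               + np.sum(e1s * r2 * e2o) - np.sum(e2s * r2 * e1o))
score_complex = np.real(np.sum((r1 + 1j * r2) * (e1s + 1j * e2s) * np.conj(e1o + 1j * e2o)))
print(np.allclose(score_terms, score_complex))   # True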
def fit(self, weights, o_error, tpo ):
gradients = T.grad(o_error ,weights)
updates = []
for c, v, w, g in zip(self.t_cache, self.t_velocity, weights,gradients):
            new_velocity = tpo["momentum_rate"] * v - tpo["learn_rate"] * g
            new_cache = tpo["decay_rate"] * c + (1 - tpo["decay_rate"]) * T.sqr(g)
            new_weights = (w + new_velocity) - (g * tpo["learn_rate"]) / T.sqrt(new_cache + 0.1 ** 8)
updates.append((w, new_weights))
updates.append((v, new_velocity))
updates.append((c, new_cache))
return updates
def fit(self, weights, o_error, tpo):
updates = []
gradients = theano.grad(o_error, weights)
for c, w, g in zip(self.t_cache, weights, gradients):
new_cache = tpo["decay_rate"] * c + ( 1- tpo["decay_rate"]) * T.sqr(g)
new_weights = w - (g * tpo["learn_rate"]) / T.sqrt(new_cache + 0.1**8)
updates.append((w, new_weights))
updates.append((c, new_cache))
return updates
def build_vae_loss(input_var, l_z_mu, l_z_ls, l_x_mu_list, l_x_ls_list, l_x_list, l_x,
deterministic, binary, L):
layer_outputs = nn.layers.get_output([l_z_mu, l_z_ls] + l_x_mu_list + l_x_ls_list
+ l_x_list + [l_x], deterministic=deterministic)
z_mu = layer_outputs[0]
z_ls = layer_outputs[1]
x_mu = [] if binary else layer_outputs[2:2+L]
x_ls = [] if binary else layer_outputs[2+L:2+2*L]
x_list = layer_outputs[2:2+L] if binary else layer_outputs[2+2*L:2+3*L]
x = layer_outputs[-1]
kl_div = 0.5 * T.sum(1 + 2*z_ls - T.sqr(z_mu) - T.exp(2 * z_ls))
if binary:
logpxz = sum(nn.objectives.binary_crossentropy(x, input_var).sum()
for x in x_list) * (-1./L)
prediction = x_list[0] if deterministic else x
else:
logpxz = sum(log_likelihood(input_var.flatten(2), mu, ls)
for mu, ls in zip(x_mu, x_ls))/L
prediction = x_mu[0] if deterministic else T.sum(x_mu, axis=0)/L
loss = -1 * (logpxz + kl_div)
return loss, prediction
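The `kl_div` term is the negated closed-form KL divergence between the diagonal Gaussian N(mu, exp(ls)^2) produced by the encoder and the standard normal prior, so `loss = -(logpxz + kl_div)` is the negative ELBO. A NumPy-only sanity check (illustrative) comparing the closed form against a Monte-Carlo estimate for one latent dimension:

import numpy as np

rng = np.random.RandomState(0)
mu, ls = 0.7, -0.3                    # mean and log-std of q(z)
sigma = np.exp(ls)

# closed form matching the expression above: KL(q || N(0,1)) = -0.5*(1 + 2*ls - mu^2 - sigma^2)
kl_closed = -0.5 * (1. + 2. * ls - mu ** 2 - np.exp(2. * ls))

# Monte-Carlo estimate: E_q[log q(z) - log p(z)] with z ~ q
z = mu + sigma * rng.randn(200000)
log_q = -0.5 * np.log(2 * np.pi) - ls - 0.5 * ((z - mu) / sigma) ** 2
log_p = -0.5 * np.log(2 * np.pi) - 0.5 * z ** 2
print(kl_closed, np.mean(log_q - log_p))   # the two estimates should agree closely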
def sym_logdensity(self, x):
""" x is a matrix of column datapoints (VxB) V = n_visible, B = batch size """
def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma, activations_factor, p_prev, a_prev, x_prev):
a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
h = self.nonlinearity(a * activations_factor) # BxH
Alpha = T.nnet.softmax(T.dot(h, V_alpha) + T.shape_padleft(b_alpha)) # BxC
Mu = T.dot(h, V_mu) + T.shape_padleft(b_mu) # BxC
Sigma = T.exp((T.dot(h, V_sigma) + T.shape_padleft(b_sigma))) # BxC
p = p_prev + log_sum_exp(-constantX(0.5) * T.sqr((Mu - T.shape_padright(x, 1)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi)) + T.log(Alpha))
return (p, a, x)
# First element is different (it is predicted from the bias only)
a0 = T.zeros_like(T.dot(x.T, self.W)) # BxH
p0 = T.zeros_like(x[0])
x0 = T.ones_like(x[0])
([ps, _as, _xs], updates) = theano.scan(density_given_previous_a_and_x,
sequences=[x, self.W, self.V_alpha, self.b_alpha, self.V_mu, self.b_mu, self.V_sigma, self.b_sigma, self.activation_rescaling],
outputs_info=[p0, a0, x0])
return (ps[-1], updates)
def adam(params, grads, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
updates = OrderedDict()
i = theano.shared(np.float32(0))
i_t = i + 1.
for p, g in zip(params, grads):
v = build_shared_zeros(p.get_value(True).shape)
r = build_shared_zeros(p.get_value(True).shape)
v_t = (b1 * v) + (1. - b1) * g
r_t = (b2 * r) + (1. - b2) * T.sqr(g)
r_hat = lr / (T.sqrt(r_t / (1 - b2 ** i_t)) + e)
v_hat = v / (1 - b1 ** i_t)
p_t = p - r_hat * v_hat
updates[v] = v_t
updates[r] = r_t
updates[p] = p_t
updates[i] = i_t
return updates
def gradients_to_updates(self, params, grads):
updates = OrderedDict()
for pp, gg in zip(params, grads):
value = pp.get_value(borrow=True)
self.accu = theano.shared(np.zeros(value.shape, dtype=theano.config.floatX), 'adadelta_accu_'+pp.name)
self.delta_accu = theano.shared(np.zeros(value.shape, dtype=theano.config.floatX), 'adadelta_delta_accu_'+pp.name)
self.params.append(self.accu)
self.params.append(self.delta_accu)
self.accu.tags = ['optimizer_param']
self.delta_accu.tags = ['optimizer_param']
accu_new = self.rho * self.accu + (1 - self.rho) * T.sqr(gg)
updates[self.accu] = accu_new
ud = gg * (T.sqrt(self.delta_accu) + 1e-7) / (T.sqrt(accu_new) + 1e-7)
updates[pp] = pp - self.lr * ud
delta_accu_new = self.rho * self.delta_accu + (1 - self.rho) * T.sqr(ud)
updates[self.delta_accu] = delta_accu_new
return updates
def RMSProp(self, learning_rate=0.01, decay=0.9, epsilon=1.0 / 100.):
"""
RMSProp of Tieleman et al.
:param learning_rate: learning rate
:param decay: decay rate of gradient history
:param epsilon: gradient clip
:return: update
"""
for param_i, grad_i in zip(self.params, self.grads):
# Accumulate gradient
msg = theano.shared(numpy.zeros(param_i.get_value().shape, dtype=theano.config.floatX))
self.shared.append(msg)
new_mean_squared_grad = (decay * msg + (1 - decay) * T.sqr(grad_i))
# Compute update
rms_grad_t = T.sqrt(new_mean_squared_grad)
rms_grad_t = T.maximum(rms_grad_t, epsilon)
delta_x_t = -learning_rate * grad_i / rms_grad_t
# Apply update
self.updates.append((param_i, param_i + delta_x_t))
self.updates.append((msg, new_mean_squared_grad))
return self.updates
def gradient_descent(self, loss):
"""Momentum GD with gradient clipping."""
grad = T.grad(loss, self.params)
        self.momentum_velocity_ = [0.] * len(grad)  # plain Python floats, not shared variables: the velocity only accumulates across repeated calls to this graph builder
grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grad)))
updates = OrderedDict()
not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
scaling_den = T.maximum(5.0, grad_norm)
for n, (param, grad) in enumerate(zip(self.params, grad)):
grad = T.switch(not_finite, 0.1 * param,
grad * (5.0 / scaling_den))
velocity = self.momentum_velocity_[n]
update_step = self.momentum * velocity - self.learning_rate * grad
self.momentum_velocity_[n] = update_step
updates[param] = param + update_step
return updates
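The `T.switch` / `scaling_den` pattern above clips by global gradient norm: the whole gradient is rescaled so its L2 norm never exceeds 5.0, and non-finite gradients are replaced by a small pull toward zero (`0.1 * param`). A standalone sketch of just the norm-clipping rule (the threshold 5.0 is kept from the source; everything else is illustrative):

import numpy as np
import theano
import theano.tensor as T

g = T.vector('g')
grad_norm = T.sqrt(T.sum(T.sqr(g)))
clipped = g * (5.0 / T.maximum(5.0, grad_norm))   # same scaling rule as above
clip_fn = theano.function([g], clipped)

small = np.array([1., 2.], dtype=theano.config.floatX)
big = np.array([30., 40.], dtype=theano.config.floatX)   # norm 50
print(clip_fn(small))                  # unchanged: norm already <= 5
print(np.linalg.norm(clip_fn(big)))    # ~5.0 after rescaling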
def Adam(self, params, cost, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
updates = []
grads = T.grad(cost, params)
i = theano.shared(as_floatX(0.))
i_t = i + 1.
fix1 = 1. - (1. - b1)**i_t
fix2 = 1. - (1. - b2)**i_t
lr_t = lr * (T.sqrt(fix2) / fix1)
for p, g in zip(params, grads):
m = theano.shared(p.get_value() * 0.)
v = theano.shared(p.get_value() * 0.)
m_t = (b1 * g) + ((1. - b1) * m)
v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
g_t = m_t / (T.sqrt(v_t) + e)
p_t = p - (lr_t * g_t)
updates.append((m, m_t))
updates.append((v, v_t))
updates.append((p, p_t))
updates.append((i, i_t))
return updates
def Adam(grads, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
updates = []
varlist = []
i = sharedX(0.)
i_t = i + 1.
fix1 = 1. - (1. - b1)**i_t
fix2 = 1. - (1. - b2)**i_t
lr_t = lr * (T.sqrt(fix2) / fix1)
for p, g in grads.items():
m = sharedX(p.get_value() * 0., name=p.name + '_adam_optimizer_m')
v = sharedX(p.get_value() * 0., name=p.name + '_adam_optimizer_v')
m_t = (b1 * g) + ((1. - b1) * m)
v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
g_t = m_t / (T.sqrt(v_t) + e)
p_t = p - (lr_t * g_t)
updates.append((m, m_t))
updates.append((v, v_t))
updates.append((p, p_t))
varlist.append(m)
varlist.append(v)
updates.append((i, i_t))
return updates, varlist
def Adagrad(grads, lr):
updates = OrderedDict()
for param in grads.keys():
# sum_square_grad := \sum g^2
sum_square_grad = sharedX(param.get_value() * 0.)
if param.name is not None:
sum_square_grad.name = 'sum_square_grad_' + param.name
# Accumulate gradient
new_sum_squared_grad = sum_square_grad + T.sqr(grads[param])
# Compute update
delta_x_t = (- lr / T.sqrt(numpy.float32(1e-5) + new_sum_squared_grad)) * grads[param]
# Apply update
updates[sum_square_grad] = new_sum_squared_grad
updates[param] = param + delta_x_t
return updates
def get_adam_updates(f, params, lr=10., b1=0.9, b2=0.999, e=1e-8, dec=5e-3, norm_grads=False):
"""Generate updates to optimize using the Adam optimizer with linear learning rate decay."""
t = theano.shared(0)
ms = [theano.shared(np.zeros(param.shape.eval(), dtype=floatX), borrow=True) for param in params]
vs = [theano.shared(np.zeros(param.shape.eval(), dtype=floatX), borrow=True) for param in params]
gs = T.grad(f, params)
if norm_grads:
gs = [g / (T.sum(T.abs_(g)) + 1e-8) for g in gs]
t_u = (t, t + 1)
m_us = [(m, b1 * m + (1. - b1) * g) for m, g in zip(ms, gs)]
v_us = [(v, b2 * v + (1. - b2) * T.sqr(g)) for v, g in zip(vs, gs)]
t_u_f = T.cast(t_u[1], floatX)
lr_hat = (lr / (1. + t_u_f * dec)) * T.sqrt(1. - T.pow(b2, t_u_f)) / (1. - T.pow(b1, t_u_f))
param_us = [(param, param - lr_hat * m_u[1] / (T.sqrt(v_u[1]) + e)) for m_u, v_u, param in zip(m_us, v_us, params)]
return m_us + v_us + param_us + [t_u]