def Adagrad(grads, lr):
    updates = OrderedDict()
    for param in grads.keys():
        # sum_square_grad := \sum g^2
        sum_square_grad = sharedX(param.get_value() * 0.)
        if param.name is not None:
            sum_square_grad.name = 'sum_square_grad_' + param.name
        # Accumulate gradient
        new_sum_squared_grad = sum_square_grad + T.sqr(grads[param])
        # Compute update
        delta_x_t = (- lr / T.sqrt(numpy.float32(1e-5) + new_sum_squared_grad)) * grads[param]
        # Apply update
        updates[sum_square_grad] = new_sum_squared_grad
        updates[param] = param + delta_x_t
    return updates
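The Adagrad helper above relies on module-level imports and a `sharedX` helper that are not shown. The sketch below is an illustrative usage, not part of the original source: it assumes a common floatX-casting definition of `sharedX` (as in pylearn2) and fits a small vector parameter toward a fixed target; all names here are hypothetical. Pasted in the same module, the imports also provide the names the helper itself uses.

import numpy
import theano
import theano.tensor as T
from collections import OrderedDict

def sharedX(value, name=None, borrow=False):
    # assumed helper: floatX-casting wrapper around theano.shared
    return theano.shared(numpy.asarray(value, dtype=theano.config.floatX),
                         name=name, borrow=borrow)

# Illustrative use of Adagrad: drive a 3-vector towards a fixed target.
w = sharedX(numpy.zeros(3), name='w')
target = T.constant(numpy.asarray([1., 2., 3.], dtype=theano.config.floatX))
cost = T.sum(T.sqr(w - target))
grad_dict = OrderedDict([(w, T.grad(cost, w))])
train_step = theano.function([], cost, updates=Adagrad(grad_dict, lr=0.5))
for _ in range(200):
    train_step()
print(w.get_value())   # close to [1., 2., 3.]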
def op_l2norm(s_x_, eps_=1e-6):
    return T.sqrt(eps_ + T.sum(T.sqr(s_x_)))
def op_cosine(s_u_, s_v_, flatten_=True, eps_=1e-6):
    if flatten_:
        s_u = s_u_.flatten()
        s_v = s_v_.flatten()
        return T.dot(s_u, s_v) / T.sqrt(eps_ + T.sum(T.sqr(s_u)) * T.sum(T.sqr(s_v)))
    else:
        s_u = s_u_
        s_v = s_v_
        # cosine similarity along the last axis
        return T.sum(s_u * s_v, axis=-1) / T.sqrt(eps_ + T.sum(T.sqr(s_u), axis=-1) * T.sum(T.sqr(s_v), axis=-1))
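A quick numeric sanity check of op_l2norm and op_cosine above (illustrative sketch, assuming theano.tensor is imported as T; not part of the original source):

import numpy
import theano
import theano.tensor as T

u = T.vector('u')
v = T.vector('v')
check = theano.function([u, v], [op_l2norm(v), op_cosine(u, v)])
a = numpy.asarray([3., 4.], dtype=theano.config.floatX)
norm_a, cos_aa = check(a, a)
print(norm_a)   # ~5.0
print(cos_aa)   # ~1.0 (cosine of a vector with itself)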
def gradient_clipping(gradients, max_norm=5.0):
    global_grad_norm = tensor.sqrt(sum(map(lambda x: tensor.sqr(x).sum(), gradients)))
    multiplier = tensor.switch(global_grad_norm < max_norm, 1.0, max_norm / global_grad_norm)
    return [g * multiplier for g in gradients]
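gradient_clipping above assumes `from theano import tensor`. Below is a minimal, hypothetical check (not from the original source) that the clipped global norm never exceeds max_norm:

import numpy
import theano
from theano import tensor

x = tensor.vector('x')
w = theano.shared(numpy.asarray([1., 1., 1.], dtype=theano.config.floatX), name='w')
cost = tensor.sum(tensor.sqr(tensor.dot(x, w)))
grads = tensor.grad(cost, [w])
clipped = gradient_clipping(grads, max_norm=1.0)
norm_fn = theano.function([x], tensor.sqrt(sum(tensor.sqr(g).sum() for g in clipped)))
print(norm_fn(numpy.asarray([10., 10., 10.], dtype=theano.config.floatX)))  # ~1.0 (clipped down to max_norm)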
def RMSProp(self, learning_rate=0.01, decay=0.9, epsilon=1.0 / 100.):
    """
    RMSProp of Tieleman et al.
    :param learning_rate: learning rate
    :param decay: decay rate of the squared-gradient running average
    :param epsilon: lower bound on the RMS of the gradient, for numerical stability
    :return: list of update pairs
    """
    updates = []
    for param_i, grad_i in zip(self.params, self.grads):
        # Accumulate the running average of squared gradients
        msg = theano.shared(numpy.zeros(param_i.get_value().shape, dtype=theano.config.floatX))
        new_mean_squared_grad = (decay * msg + (1 - decay) * T.sqr(grad_i))
        # Compute update
        rms_grad_t = T.sqrt(new_mean_squared_grad)
        rms_grad_t = T.maximum(rms_grad_t, epsilon)
        delta_x_t = -learning_rate * grad_i / rms_grad_t
        # Apply update
        updates.append((param_i, param_i + delta_x_t))
        updates.append((msg, new_mean_squared_grad))
    return updates
def mean_squared_error(y_true, y_pred):
    return T.sqr(y_pred - y_true).mean(axis=-1)
def mean_squared_logarithmic_error(y_true, y_pred):
    # `epsilon` and `np` (numpy) are assumed to be module-level names in the original source
    return T.sqr(T.log(T.clip(y_pred, epsilon, np.inf) + 1.) - T.log(T.clip(y_true, epsilon, np.inf) + 1.)).mean(axis=-1)
def squared_hinge(y_true, y_pred):
    return T.sqr(T.maximum(1. - y_true * y_pred, 0.)).mean(axis=-1)
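A quick numeric check of mean_squared_error and squared_hinge above (illustrative only, not from the original source; mean_squared_logarithmic_error is skipped here because it depends on the module-level epsilon):

import numpy
import theano
import theano.tensor as T

yt = T.matrix('y_true')
yp = T.matrix('y_pred')
mse = theano.function([yt, yp], mean_squared_error(yt, yp))
hinge = theano.function([yt, yp], squared_hinge(yt, yp))
ones = numpy.ones((2, 3), dtype=theano.config.floatX)
print(mse(numpy.zeros((2, 3), dtype=theano.config.floatX), ones))   # [1. 1.]
print(hinge(ones, ones))                                            # [0. 0.]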
def cosine_sim2d(k, M):
    # k: (nb_samples, memory_width)
    # M: (nb_samples, memory_dim, memory_width)
    # norms of keys and memories
    k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5   # (nb_samples,)
    M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5   # (nb_samples, memory_dim)
    k = k[:, None, :]                            # (nb_samples, 1, memory_width)
    k_norm = k_norm[:, None]                     # (nb_samples, 1)
    sim = T.sum(k * M, axis=2)                   # (nb_samples, memory_dim)
    sim /= k_norm * M_norm                       # (nb_samples, memory_dim)
    return sim
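An illustrative shape check of cosine_sim2d (assuming theano.tensor is imported as T; the variable names below are hypothetical):

import numpy
import theano
import theano.tensor as T

k_var = T.matrix('k')     # (nb_samples, memory_width)
M_var = T.tensor3('M')    # (nb_samples, memory_dim, memory_width)
sim_fn = theano.function([k_var, M_var], cosine_sim2d(k_var, M_var))
k_val = numpy.random.randn(2, 4).astype(theano.config.floatX)
M_val = numpy.random.randn(2, 3, 4).astype(theano.config.floatX)
print(sim_fn(k_val, M_val).shape)   # (2, 3): one similarity per memory slot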
def op_sqr_c(s_xr_, s_xi_):
    '''
    elemwise complex square
    '''
    return T.sqr(s_xr_) - T.sqr(s_xi_), 2 * s_xr_ * s_xi_
def op_norm2(s_x_, axis_=-1, use_mean_=False, keepdims_=True):
    '''
    Square of the L2 norm
    Args:
        s_x_: input (batch of) vector
        axis_: int or tuple of ints
        use_mean_: use the mean of squares instead of the sum
    '''
    op_sum = T.sum if not use_mean_ else T.mean
    return op_sum(T.sqr(s_x_), axis=axis_, keepdims=keepdims_)
def op_norm2_c(s_xr_, s_xi_, axis_=-1, use_mean_=False, keepdims_=True):
    '''
    Complex squared L2 norm
    '''
    op_sum = T.sum if not use_mean_ else T.mean
    return op_sum(T.sqr(s_xr_) + T.sqr(s_xi_), axis=axis_, keepdims=keepdims_)
def op_cosine(s_x_, s_y_, axis_=-1, keepdims_=True, eps_=1e-7):
    '''
    cosine between two vectors
    '''
    s_prod = s_x_ * s_y_
    s_nx = T.sum(T.sqr(s_x_), axis=axis_, keepdims=keepdims_)
    s_ny = T.sum(T.sqr(s_y_), axis=axis_, keepdims=keepdims_)
    return T.sum(s_prod, axis=axis_, keepdims=keepdims_) / T.sqrt(s_nx * s_ny + eps_)
def op_sqr_cosine(s_x_, s_y_, axis_=-1, keepdims_=True, eps_=1e-7):
    '''
    squared cosine;
    useful when the sqrt is not needed
    '''
    s_prod = s_x_ * s_y_
    s_nx = T.sum(T.sqr(s_x_), axis=axis_, keepdims=keepdims_)
    s_ny = T.sum(T.sqr(s_y_), axis=axis_, keepdims=keepdims_)
    return T.sqr(T.sum(s_prod, axis=axis_, keepdims=keepdims_)) / (s_nx * s_ny + eps_)
def op_unitary_loss(s_re_, s_im_, axes_=None, size_=None):
    '''
    unitary-matrix loss on the real/imaginary parts,
    used to regularize a parameter towards a unitary matrix
    Args:
        s_re_: real part, square matrix
        s_im_: imaginary part, square matrix
        size_: specify the args to be (size_ x size_) matrices
        axes_: tuple of two integers, specifying which axes form the matrix;
            defaults to the last two axes
    '''
    if axes_ is None:
        axes_ = (-2, -1)
    if size_ is None:
        ax = axes_[0]
        size = T.shape(s_re_)[ax]
    else:
        size = size_
    assert s_re_.ndim == s_im_.ndim
    tpat = list(range(s_re_.ndim))
    bpat = ['x'] * s_re_.ndim
    tpat[axes_[0]], tpat[axes_[1]] = tpat[axes_[1]], tpat[axes_[0]]
    bpat[axes_[0]] = 0
    bpat[axes_[1]] = 1
    # real part of W^H W, with W = s_re_ + i * s_im_
    s_y_re_ = T.dot(s_re_.transpose(*tpat), s_re_) + T.dot(s_im_.transpose(*tpat), s_im_)
    # imaginary part of W^H W
    s_tmp = T.dot(s_re_.transpose(*tpat), s_im_)
    s_y_im_ = s_tmp - s_tmp.transpose(*tpat)
    # penalize deviation of W^H W from the identity
    return T.mean(T.sqr(s_y_re_ - T.eye(size).dimshuffle(*bpat)) + T.sqr(s_y_im_))
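A sanity check of op_unitary_loss (illustrative sketch, not from the original source): for a real orthogonal matrix with zero imaginary part the loss should be close to zero, while scaling the matrix breaks unitarity and gives a clearly positive loss.

import numpy
import theano
import theano.tensor as T

s_re = T.matrix('re')
s_im = T.matrix('im')
loss_fn = theano.function([s_re, s_im], op_unitary_loss(s_re, s_im))
q, _ = numpy.linalg.qr(numpy.random.randn(4, 4))
q = q.astype(theano.config.floatX)
zeros = numpy.zeros((4, 4), dtype=theano.config.floatX)
print(loss_fn(q, zeros))             # ~0 for an orthogonal (hence unitary) matrix
print(loss_fn(2 * q, zeros) > 0.1)   # True: scaling breaks unitarity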
def get_updates(self, learning_rate, params, grads, lr_scalers):
    """Compute the AdaDelta updates of the model's parameters.
    param_t := param_(t-1) + AdaDelta_update_t
    """
    if self._first_time:
        self.sum_square_grad = [
            sharedX_mtx(
                param.get_value() * 0.,
                name='sum_square_grad_' + param.name,
                borrow=True) for param in params]
        self._first_time = False
    updates = []
    for (param, grad, sum_square_grad, lr_sc) in zip(
            params, grads, self.sum_square_grad, lr_scalers):
        # Accumulate the sum of squared gradients: sum_t g_t^2
        new_sum_square_grad = sum_square_grad + T.sqr(grad)
        # The update: delta_x_t
        lr_scaled = learning_rate * lr_sc
        epsilon = lr_scaled
        sqrt_sum_grad_t = T.sqrt(new_sum_square_grad)
        delta_x_t = - (epsilon / sqrt_sum_grad_t) * grad
        # Update the param
        new_param = param + delta_x_t
        # Send for the update
        updates.append((sum_square_grad, new_sum_square_grad))
        if self.max_colm_norm and param.name in ["W", "w"]:
            new_param_final = norm_constraint(tensor_var=new_param,
                                              max_norm=self.max_norm)
        else:
            new_param_final = new_param
        updates.append((param, new_param_final))
    return updates
def get_updates(self, learning_rate, params, grads, lr_scalers):
    """Compute the parameters' updates.
    """
    if self._first_time:
        self.mean_square_grads = [
            sharedX_mtx(
                param.get_value() * 0.,
                name='mean_square_grad_' + param.name,
                borrow=True) for param in params]
        self._first_time = False
    updates = []
    for (param, grad, mean_square_grad, lr_sc) in zip(
            params, grads, self.mean_square_grads, lr_scalers):
        new_mean_square_grad = (
            self.decay * mean_square_grad + (1 - self.decay) * T.sqr(grad))
        # The update
        rms_grad_t = T.sqrt(new_mean_square_grad)
        rms_grad_t = T.maximum(rms_grad_t, self.epsilon)
        lr_scaled = learning_rate * lr_sc
        delta_x_t = - lr_scaled * grad / rms_grad_t
        new_param = param + delta_x_t
        # Updates
        if self.max_colm_norm and param.name in ["W", "w"]:
            new_param_final = norm_constraint(tensor_var=new_param,
                                              max_norm=self.max_norm)
        else:
            new_param_final = new_param
        updates.append((param, new_param_final))
        updates.append((mean_square_grad, new_mean_square_grad))
    return updates
def localResponseNormalizationCrossChannel(incoming, alpha=1e-4,
                                           k=2, beta=0.75, n=5):
    """
    Implement local response normalization across channels as described
    in "ImageNet Classification with Deep Convolutional Neural Networks",
    A. Krizhevsky et al., sec. 3.3.
    Reference code:
    https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/normalization.py
    https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/expr/normalize.py
    Parameters:
        incoming: the feature maps (output of the convolution layer).
        alpha: float scalar
        k: float scalar
        beta: float scalar
        n: integer, number of adjacent channels; must be odd.
    """
    if n % 2 == 0:
        raise NotImplementedError("Works only with odd n")
    input_shape = incoming.shape
    half_n = n // 2
    input_sqr = T.sqr(incoming)
    b, ch, r, c = input_shape
    extra_channels = T.alloc(0., b, ch + 2 * half_n, r, c)
    input_sqr = T.set_subtensor(extra_channels[:, half_n:half_n + ch, :, :],
                                input_sqr)
    scale = k
    for i in range(n):
        scale += alpha * input_sqr[:, i:i + ch, :, :]
    scale = scale ** beta
    return incoming / scale
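An illustrative shape-preservation check of the normalization above (the names below are hypothetical, not part of the original source):

import numpy
import theano
import theano.tensor as T

feat = T.tensor4('feat')   # (batch, channels, rows, cols)
lrn_fn = theano.function([feat], localResponseNormalizationCrossChannel(feat))
maps = numpy.random.rand(2, 8, 5, 5).astype(theano.config.floatX)
out = lrn_fn(maps)
print(out.shape)                                      # (2, 8, 5, 5): shape is preserved
print(numpy.all(numpy.abs(out) <= numpy.abs(maps)))   # True: the denominator is at least k**beta > 1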
def contractive_penality(self, h, linear_hid, contraction_level=0.0,
                         batch_size=-1):
    if batch_size == -1 or batch_size == 0:
        raise Exception("invalid batch size.")
    grad = T.grad(h.sum(), linear_hid)
    jacob = T.dot(T.sqr(grad), T.sqr(self.hidden.W.sum(axis=0)))
    frob_norm_jacob = T.sum(jacob) / batch_size
    contract_pen = contraction_level * frob_norm_jacob
    return contract_pen
def get_net_cost(model, cost_type, eye=True):
    """Get the train cost of the network."""
    cost = None
    if eye:
        # NOTE: both squared terms use the same columns (37 and 46) as written.
        d_eyes = (
            (model.trg[:, 37] - model.trg[:, 46])**2 +
            (model.trg[:, 37] - model.trg[:, 46])**2).T
        if cost_type == CostType.MeanSquared:
            cost = T.mean(
                T.sqr(model.output_dropout - model.trg), axis=1) / d_eyes
        elif cost_type == CostType.CrossEntropy:
            cost = T.mean(
                T.nnet.binary_crossentropy(
                    model.output_dropout, model.trg), axis=1)
        else:
            raise ValueError("unknown cost type.")
    else:
        if cost_type == CostType.MeanSquared:
            cost = T.mean(
                T.sqr(model.output_dropout - model.trg), axis=1)
        elif cost_type == CostType.CrossEntropy:
            cost = T.mean(
                T.nnet.binary_crossentropy(
                    model.output_dropout, model.trg), axis=1)
        else:
            raise ValueError("unknown cost type.")
    if model.l1 != 0.:
        cost += model.l1
    if model.l2 != 0.:
        cost += model.l2
    return cost