from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T
# Assumed imports: get_or_compute_grads and total_norm_constraint are taken to
# come from Lasagne, whose update functions these snippets mirror.
from lasagne.updates import get_or_compute_grads, total_norm_constraint


def deepmind_rmsprop(loss_or_grads, params, learning_rate=0.00025,
                     rho=0.95, epsilon=0.01):
    """Centered RMSProp as used by DeepMind's DQN agent."""
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        # Moving average of the gradient.
        acc_grad = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=param.broadcastable)
        acc_grad_new = rho * acc_grad + (1 - rho) * grad
        # Moving average of the squared gradient.
        acc_rms = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                broadcastable=param.broadcastable)
        acc_rms_new = rho * acc_rms + (1 - rho) * grad ** 2

        updates[acc_grad] = acc_grad_new
        updates[acc_rms] = acc_rms_new
        # Step scaled by the centered RMS of recent gradients.
        updates[param] = (param - learning_rate *
                          (grad /
                           T.sqrt(acc_rms_new - acc_grad_new ** 2 + epsilon)))
    return updates

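
# --- Usage sketch (not part of the original snippet) ---
# A minimal example of wiring deepmind_rmsprop into a Theano training
# function, assuming Lasagne is available. The toy network, variable names,
# and data below are illustrative assumptions, not the original author's code.
import lasagne

input_var = T.matrix('inputs')
target_var = T.matrix('targets')
network = lasagne.layers.DenseLayer(
    lasagne.layers.InputLayer((None, 4), input_var=input_var),
    num_units=1, nonlinearity=None)

prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.squared_error(prediction, target_var).mean()
params = lasagne.layers.get_all_params(network, trainable=True)

# The defaults above (0.00025, 0.95, 0.01) are the DQN-style settings.
updates = deepmind_rmsprop(loss, params)
train_fn = theano.function([input_var, target_var], loss, updates=updates)

x = np.random.randn(32, 4).astype(theano.config.floatX)
y = np.random.randn(32, 1).astype(theano.config.floatX)
print(train_fn(x, y))
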
def careful_rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9,
                    epsilon=1e-6, grad_clipping=1.0e-2):
    """
    RMSProp with gradient clipping.

    :param grad_clipping: maximal norm of the gradient; if the norm of the
        actual gradient exceeds this value, the gradient is rescaled.
    :return: updates
    """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    # Rescale the gradients so their joint norm is at most `grad_clipping`.
    grads = total_norm_constraint(grads, max_norm=grad_clipping, epsilon=epsilon)

    # Using a theano constant to prevent upcasting of float32.
    one = T.constant(1)
    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (learning_rate * grad /
                                  T.sqrt(accu_new + epsilon))
    return updates

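
# --- Note on the clipping step (illustrative sketch) ---
# total_norm_constraint rescales the whole list of gradients so that their
# joint L2 norm is at most max_norm, leaving smaller gradients untouched.
# Below is a rough stand-alone equivalent, written as a sketch under that
# assumption; `clip_by_total_norm` is a hypothetical helper name, not part of
# the original snippet or of Lasagne's public API.
def clip_by_total_norm(grads, max_norm, epsilon=1e-7):
    # Joint L2 norm over all gradient tensors.
    norm = T.sqrt(sum(T.sum(g ** 2) for g in grads))
    # Shrink only when the norm exceeds max_norm.
    scale = T.clip(norm, 0, max_norm) / (epsilon + norm)
    return [g * scale for g in grads]
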
def hard_rmsprop(loss_or_grads, params, learning_rate=1.0e-2, epsilon=1e-6):
    """
    Not an actual RMSProp: it just normalizes the gradient so that its norm
    equals the `learning_rate` parameter. Don't use unless you have to.

    :param loss_or_grads: loss to minimize
    :param params: params to optimize
    :param learning_rate: norm of the resulting update step
    :param epsilon: small number for numerical stability
    :return: updates
    """
    grads = get_or_compute_grads(loss_or_grads, params)

    # Global norm over all parameter gradients.
    gnorm = T.sqrt(sum(T.sum(g ** 2) for g in grads) + epsilon)
    grads = [g / gnorm for g in grads]

    updates = OrderedDict()
    for param, grad in zip(params, grads):
        updates[param] = param - learning_rate * grad
    return updates

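
# --- Quick numeric check (illustrative, plain NumPy) ---
# hard_rmsprop only rescales the gradient so that the combined step has norm
# equal to learning_rate. The numbers below are made up for the illustration.
_g = [np.array([3.0, 4.0]), np.array([12.0])]        # two toy "gradients"
_lr = 1.0e-2
_gnorm = np.sqrt(sum(np.sum(g ** 2) for g in _g))     # sqrt(9 + 16 + 144) = 13
_steps = [_lr * g / _gnorm for g in _g]
print(np.sqrt(sum(np.sum(s ** 2) for s in _steps)))   # ~0.01, i.e. learning_rate
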
def cruel_rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9, epsilon=1e-6,
                  grad_clipping=1.0e-2, param_clipping=1.0e-2):
    """
    A version of careful RMSProp for the Wasserstein GAN.

    :param epsilon: small number for numerical stability.
    :param grad_clipping: maximal norm of the gradient; if the norm of the
        actual gradient exceeds this value, the gradient is rescaled.
    :param param_clipping: after each update all params are clipped to
        [-`param_clipping`, `param_clipping`].
    :return: updates
    """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    # Rescale the gradients so their joint norm is at most `grad_clipping`.
    grads = total_norm_constraint(grads, max_norm=grad_clipping, epsilon=epsilon)

    # Using a theano constant to prevent upcasting of float32.
    one = T.constant(1)
    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new

        updated = param - (learning_rate * grad / T.sqrt(accu_new + epsilon))
        if param_clipping is not None:
            # Weight clipping as in the WGAN critic.
            updates[param] = T.clip(updated, -param_clipping, param_clipping)
        else:
            updates[param] = updated
    return updates

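
# --- Usage sketch for a WGAN-style critic (not part of the original snippet) ---
# A toy linear critic w·x, just to show how cruel_rmsprop wires weight
# clipping into the updates. The critic, the variable names, and the data
# shapes are illustrative assumptions made for this example.
x_real = T.matrix('x_real')
x_fake = T.matrix('x_fake')
w = theano.shared(np.zeros((4, 1), dtype=theano.config.floatX), name='w')

# WGAN-style critic loss: push scores up on real data, down on fake data.
critic_loss = T.mean(T.dot(x_fake, w)) - T.mean(T.dot(x_real, w))

critic_updates = cruel_rmsprop(critic_loss, [w], param_clipping=1.0e-2)
critic_train = theano.function([x_real, x_fake], critic_loss,
                               updates=critic_updates)
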
def deepmind_rmsprop(loss_or_grads, params, learning_rate,
                     rho, epsilon):
    """RMSProp updates [1]_.

    Scale learning rates by dividing with a moving root mean square (RMS) of
    the gradients, centered by the moving average of the gradient.

    Parameters
    ----------
    loss_or_grads : symbolic expression or list of expressions
        A scalar loss expression, or a list of gradient expressions
    params : list of shared variables
        The variables to generate update expressions for
    learning_rate : float or symbolic scalar
        The learning rate controlling the size of update steps
    rho : float or symbolic scalar
        Gradient moving average decay factor
    epsilon : float or symbolic scalar
        Small value added for numerical stability

    Returns
    -------
    OrderedDict
        A dictionary mapping each parameter to its update expression

    Notes
    -----
    `rho` should be between 0 and 1. A value of `rho` close to 1 will decay
    the moving averages slowly and a value close to 0 will decay them fast.

    This variant keeps moving averages of both the gradient and the squared
    gradient. Using the step size :math:`\\eta` and a decay factor
    :math:`\\rho`, the update for a parameter :math:`\\theta` with gradient
    :math:`g_t` is calculated as:

    .. math::
       m_t &= \\rho m_{t-1} + (1 - \\rho) g_t\\\\
       r_t &= \\rho r_{t-1} + (1 - \\rho) g_t^2\\\\
       \\theta_t &= \\theta_{t-1} - \\frac{\\eta\\, g_t}{\\sqrt{r_t - m_t^2 + \\epsilon}}

    References
    ----------
    .. [1] Tieleman, T. and Hinton, G. (2012):
           Neural Networks for Machine Learning, Lecture 6.5 - rmsprop.
           Coursera. http://www.youtube.com/watch?v=O3sxAc4hxZU (formula @5:20)
    """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        # Moving average of the gradient.
        acc_grad = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=param.broadcastable)
        acc_grad_new = rho * acc_grad + (1 - rho) * grad
        # Moving average of the squared gradient.
        acc_rms = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                broadcastable=param.broadcastable)
        acc_rms_new = rho * acc_rms + (1 - rho) * grad ** 2

        updates[acc_grad] = acc_grad_new
        updates[acc_rms] = acc_rms_new
        # Step scaled by the centered RMS of recent gradients.
        updates[param] = (param - learning_rate *
                          (grad /
                           T.sqrt(acc_rms_new - acc_grad_new ** 2 + epsilon)))
    return updates

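
# --- Why the denominator is "centered" (illustrative NumPy check) ---
# The update divides by sqrt(acc_rms - acc_grad**2 + epsilon). Keeping a
# moving average of the gradient as well as of its square means the term
# under the square root tracks the *variance* of recent gradients rather
# than their raw second moment. The toy distribution and constants below
# are made up for the illustration.
_rng = np.random.RandomState(0)
_g = _rng.normal(loc=0.5, scale=2.0, size=100000)   # noisy gradient samples

_rho = 0.95
_m, _r, _vals = 0.0, 0.0, []
for _g_t in _g:
    _m = _rho * _m + (1 - _rho) * _g_t            # acc_grad
    _r = _rho * _r + (1 - _rho) * _g_t ** 2       # acc_rms
    _vals.append(_r - _m ** 2)

# Close to Var(g) = 4.0 (up to a small EMA bias), not to E[g^2] = 4.25.
print(np.mean(_vals[1000:]))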