from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T
# Gradient helper provided by Lasagne: returns the list unchanged when given
# gradient expressions, or computes theano.grad(loss, params) for a scalar loss.
from lasagne.updates import get_or_compute_grads


def deepmind_rmsprop(loss_or_grads, params, learning_rate,
                     rho, epsilon):
"""RMSProp updates [1]_.
Scale learning rates by dividing with the moving average of the root mean
squared (RMS) gradients.
Parameters
----------
loss_or_grads : symbolic expression or list of expressions
A scalar loss expression, or a list of gradient expressions
params : list of shared variables
The variables to generate update expressions for
learning_rate : float or symbolic scalar
The learning rate controlling the size of update steps
rho : float or symbolic scalar
Gradient moving average decay factor
epsilon : float or symbolic scalar
Small value added for numerical stability
Returns
-------
OrderedDict
A dictionary mapping each parameter to its update expression
Notes
-----
`rho` should be between 0 and 1. A value of `rho` close to 1 will decay the
moving average slowly and a value close to 0 will decay the moving average
fast.
Using the step size :math:`\\eta` and a decay factor :math:`\\rho` the
learning rate :math:`\\eta_t` is calculated as:
.. math::
r_t &= \\rho r_{t-1} + (1-\\rho)*g^2\\\\
\\eta_t &= \\frac{\\eta}{\\sqrt{r_t + \\epsilon}}
References
----------
.. [1] Tieleman, T. and Hinton, G. (2012):
Neural Networks for Machine Learning, Lecture 6.5 - rmsprop.
Coursera. http://www.youtube.com/watch?v=O3sxAc4hxZU (formula @5:20)
"""
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        # Moving average of the gradient (first moment).
        acc_grad = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=param.broadcastable)
        acc_grad_new = rho * acc_grad + (1 - rho) * grad
        # Moving average of the squared gradient (second moment).
        acc_rms = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                broadcastable=param.broadcastable)
        acc_rms_new = rho * acc_rms + (1 - rho) * grad ** 2

        updates[acc_grad] = acc_grad_new
        updates[acc_rms] = acc_rms_new

        # Scale the gradient by the centered RMS (an estimate of the
        # gradient's standard deviation) before taking the step.
        updates[param] = (param - learning_rate *
                          (grad /
                           T.sqrt(acc_rms_new - acc_grad_new ** 2 + epsilon)))

    return updates
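

# --- Usage sketch (not from the source) ------------------------------------
# A minimal, hypothetical example of wiring these updates into a Theano
# training function with a small Lasagne network. The network, loss and
# hyperparameter values below are illustrative assumptions, not part of the
# original code.
import lasagne

x = T.matrix('x')
y = T.ivector('y')
network = lasagne.layers.DenseLayer(
    lasagne.layers.InputLayer(shape=(None, 20), input_var=x),
    num_units=3, nonlinearity=lasagne.nonlinearities.softmax)
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, y).mean()
params = lasagne.layers.get_all_params(network, trainable=True)

# Placeholder hyperparameters; tune them for your task.
updates = deepmind_rmsprop(loss, params, learning_rate=0.00025,
                           rho=0.95, epsilon=0.01)
train_fn = theano.function([x, y], loss, updates=updates)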