import numpy as np
import theano
import theano.tensor as T


def adamax(loss, params, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # Flatten the gradients into a single shared buffer. `flat_unflat_grads`
    # (defined elsewhere) presumably returns the shared flat buffer, the
    # flattened gradient expression, and per-parameter views of the buffer.
    grad_shared_flat, flat_grad, unflat_grads = flat_unflat_grads(loss, params)
    grad_updates = [(grad_shared_flat, flat_grad)]

    # Time step t, shared across all parameters.
    t_prev = theano.shared(np.array(0, dtype=theano.config.floatX))
    one = T.constant(1)
    t = t_prev + one
    # Bias-corrected step size: learning_rate / (1 - beta1^t).
    a_t = learning_rate / (one - beta1 ** t)

    param_updates = list()
    for p, g in zip(params, unflat_grads):
        value = p.get_value(borrow=True)
        # First-moment estimate m and infinity-norm estimate u, one per parameter.
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=p.broadcastable)
        u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=p.broadcastable)

        m_t = beta1 * m_prev + (one - beta1) * g
        u_t = T.maximum(beta2 * u_prev, abs(g))
        step = a_t * m_t / (u_t + epsilon)

        param_updates += [(m_prev, m_t), (u_prev, u_t), (p, p - step)]
    param_updates += [(t_prev, t)]

    return grad_updates, param_updates, grad_shared_flat
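
The function only builds symbolic update lists, splitting gradient computation from the parameter update. A minimal usage sketch follows, assuming `loss`, `params`, and the input variables `x`, `y` are Theano expressions defined elsewhere, and that `flat_unflat_grads` behaves as described in the comments above; the two-function split shown here is one possible way to consume the returned updates, not the only one.

# Build the symbolic Adamax updates.
grad_updates, param_updates, grad_shared_flat = adamax(loss, params)

# First function: compute gradients for a batch and store them in the
# shared flat buffer.
compute_grads = theano.function([x, y], loss, updates=grad_updates)

# Second function: apply the Adamax step using the stored gradients.
apply_updates = theano.function([], [], updates=param_updates)

for x_batch, y_batch in batches:
    batch_loss = compute_grads(x_batch, y_batch)
    apply_updates()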