from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T

# grad_base and _adastep are assumed to be defined elsewhere in the same module.
def adastep(
        inputs, loss, params, outputs=(),
        max_iter=8, rho=0.9, momentum=None,
        initial_learning_rate=1.0e-3, max_learning_rate=1.0,
        max_delta=1.0e-1, eps=1.0e-6):
    # Compiled helpers for caching inputs/gradients, evaluating the loss,
    # and writing updated parameter values back to the shared variables.
    cache_inputs, cache_grads, get_loss, set_params = grad_base(
        inputs, loss, params, outputs, norm_gradients=False, momentum=momentum
    )

    one = T.constant(1.0, dtype='float32')

    # Shared scalar holding the exponentially smoothed step size.
    v = theano.shared(np.float32(initial_learning_rate), name='v')

    new_v = T.fscalar()

    # v <- rho * v + (1 - rho) * new_v: moving average of the accepted step size.
    upd_v = OrderedDict()
    upd_v[v] = v * rho + new_v * (one - rho)

    update_v = theano.function([new_v], v, updates=upd_v, no_default_updates=True)
    get_v = theano.function([], v, no_default_updates=True)

    # The search/update loop itself is implemented by _adastep.
    return _adastep(
        cache_inputs, cache_grads, get_loss, set_params,
        get_v, update_v,
        max_iter=max_iter,
        max_learning_rate=max_learning_rate,
        max_delta=max_delta,
        eps=eps
    )
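A minimal usage sketch follows. It assumes the surrounding module also provides grad_base and _adastep, and that the object returned by adastep is a callable that performs one update when given a batch of inputs; the toy linear model and data below are illustrative only, not part of the original code.

# Hypothetical usage example; the model, data, and the assumed call
# signature of the returned train step are not confirmed by this section.
import numpy as np
import theano
import theano.tensor as T

X = T.fmatrix('X')
y = T.fvector('y')
W = theano.shared(np.zeros(5, dtype='float32'), name='W')
b = theano.shared(np.float32(0.0), name='b')
loss = T.mean((T.dot(X, W) + b - y) ** 2)

train = adastep([X, y], loss, [W, b], max_iter=8, rho=0.9)

xs = np.random.randn(64, 5).astype('float32')
ys = np.random.randn(64).astype('float32')
train(xs, ys)  # assumed: one AdaStep update on this batch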