from collections import OrderedDict

import theano.tensor as T
from theano.ifelse import ifelse

def AdaMax2(w, objective, alpha=.01, beta1=.1, beta2=.001, n_accum=2):
    # AdaMax with gradient accumulation: gradients are summed over n_accum
    # minibatches and the parameters are updated once every n_accum steps.
    # Convention in this file: beta1/beta2 weight the *new* term, i.e.
    # mom1 = (1-beta1)*mom1 + beta1*g (the reverse of the Adam paper).
    print('AdaMax2', 'alpha:', alpha, 'beta1:', beta1, 'beta2:', beta2, 'n_accum:', n_accum)
    g = T.grad(objective.sum(), w, disconnected_inputs='warn')
    new = OrderedDict()
    it = G.sharedf(0.)  # iteration counter; G.sharedf is the project's helper for floatX shared variables
    new[it] = it + 1
    reset = T.eq(T.mod(new[it], n_accum), 0)             # first step of a new accumulation window
    update = T.eq(T.mod(new[it], n_accum), n_accum - 1)  # last step of the window: apply the update
    for i in range(len(w)):
        mom1 = G.sharedf(w[i].get_value() * 0.)   # first-moment (momentum) estimate
        _max = G.sharedf(w[i].get_value() * 0.)   # exponentially weighted infinity norm
        g_sum = G.sharedf(w[i].get_value() * 0.)  # accumulated gradient over the window
        #g[i] = T.switch(T.isnan(g[i]), T.zeros_like(g[i]), g[i])  # optionally zero out NaNs
        new[g_sum] = ifelse(reset, g[i], g_sum + g[i])
        new[mom1] = ifelse(update, (1 - beta1) * mom1 + beta1 * new[g_sum], mom1)
        new[_max] = ifelse(update, T.maximum((1 - beta2) * _max, abs(new[g_sum]) + 1e-8), _max)
        # Ascent step: the objective is treated as a quantity to maximize
        new[w[i]] = ifelse(update, w[i] + alpha * new[mom1] / new[_max], w[i])
    return new
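# --- Usage sketch (illustrative, not from the original source) ---
# Assumptions: `G.sharedf(x)` is equivalent to wrapping x in theano.shared
# with floatX dtype, and `objective` is a per-datapoint quantity to be
# maximized. The toy linear model and data below are hypothetical.
import numpy as np
import theano

class G(object):
    # Minimal stand-in for the project's utility module (assumption)
    @staticmethod
    def sharedf(x):
        return theano.shared(np.asarray(x, dtype=theano.config.floatX))

if __name__ == '__main__':
    floatX = theano.config.floatX
    x = T.matrix('x')
    W = theano.shared(np.zeros((5, 3), dtype=floatX), name='W')
    b = theano.shared(np.zeros(3, dtype=floatX), name='b')
    # Toy objective per datapoint: maximize the negative squared error
    objective = -T.sum(T.sqr(T.dot(x, W) + b - 1.), axis=1)
    updates = AdaMax2([W, b], objective, alpha=1e-3, n_accum=4)
    step = theano.function([x], objective.sum(), updates=updates)
    for _ in range(200):  # parameters change once every n_accum calls
        step(np.random.randn(16, 5).astype(floatX))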
# AdaMax that keeps a running average of the parameters
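# A hypothetical sketch of such a variant (the original function is not shown
# in this excerpt): reuse the AdaMax2 updates and additionally keep a
# Polyak-style exponential moving average of each parameter, which can be read
# out at test time. The name `AdaMax2Avg` and the `beta3` rate are assumptions,
# following this file's "weight of the new term" convention.
def AdaMax2Avg(w, objective, alpha=.01, beta1=.1, beta2=.001, beta3=.01, n_accum=2):
    new = AdaMax2(w, objective, alpha, beta1, beta2, n_accum)
    w_avg = []
    for wi in w:
        avg = G.sharedf(wi.get_value())                 # EMA copy, initialized at the current value
        new[avg] = (1 - beta3) * avg + beta3 * new[wi]  # track the updated parameter
        w_avg.append(avg)
    return new, w_avg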