def AdaMaxAvg2(ws, objective, alpha=.01, beta1=.1, beta2=.001, beta3=0.01, n_accum=1):
if n_accum == 1:
return AdaMaxAvg(ws, objective, alpha, beta1, beta2, beta3)
print 'AdaMax_Avg2', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2,'beta3:',beta3,'n_accum:',n_accum
gs = G.ndict.T_grad(objective.sum(), ws, disconnected_inputs='raise')
new = OrderedDict()
from theano.ifelse import ifelse
it = G.sharedf(0.)
new[it] = it + 1
reset = T.eq(T.mod(it,n_accum), 0)
update = T.eq(T.mod(it,n_accum), n_accum-1)
ws_avg = []
for j in range(len(ws)):
w_avg = {}
for i in ws[j]:
_w = ws[j][i]
_g = gs[j][i]
#_g = T.switch(T.isnan(_g),T.zeros_like(_g),_g) #remove NaN's
mom1 = G.sharedf(_w.get_value() * 0.)
_max = G.sharedf(_w.get_value() * 0.)
w_avg[i] = G.sharedf(_w.get_value())
g_sum = G.sharedf(_w.get_value() * 0.)
new[g_sum] = ifelse(reset, _g, g_sum + _g)
new[mom1] = ifelse(update, (1-beta1) * mom1 + beta1 * new[g_sum], mom1)
new[_max] = ifelse(update, T.maximum((1-beta2)*_max, abs(new[g_sum]) + 1e-8), _max)
new[_w] = ifelse(update, _w + alpha * new[mom1] / new[_max], _w)
new[w_avg[i]] = ifelse(update, beta3 * new[_w] + (1.-beta3) * w_avg[i], w_avg[i])
ws_avg += [w_avg]
return new, ws_avg
评论列表
文章目录