# Adam with gradient-norm clipping and a non-finite-gradient fallback, written
# for Theano. Assumes Lasagne is available for `total_norm_constraint` and
# `floatX`; the update rule itself follows Lasagne's adam().
from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T
from lasagne import utils
from lasagne.updates import total_norm_constraint


def adam(self, cost, params, learning_rate=0.001, beta1=0.9,
         beta2=0.999, epsilon=1e-8):
    all_grads = T.grad(cost=cost, wrt=params)
    # Rescale the gradients so their total norm does not exceed 10.
    all_grads = total_norm_constraint(all_grads, 10)
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), all_grads)))
    # Flag NaN/Inf gradients so they can be replaced below.
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    t_prev = theano.shared(utils.floatX(0.))
    updates = OrderedDict()
    t = t_prev + 1
    # Bias-corrected step size: a_t = lr * sqrt(1 - beta2^t) / (1 - beta1^t)
    a_t = learning_rate * T.sqrt(1 - beta2**t) / (1 - beta1**t)
    for param, g_t in zip(params, all_grads):
        # If the gradient is not finite, fall back to a small multiple of the
        # parameter itself instead of the unusable gradient.
        g_t = T.switch(not_finite, 0.1 * param, g_t)
        value = param.get_value(borrow=True)
        # First- and second-moment accumulators, one pair per parameter.
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        m_t = beta1 * m_prev + (1 - beta1) * g_t
        v_t = beta2 * v_prev + (1 - beta2) * g_t**2
        step = a_t * m_t / (T.sqrt(v_t) + epsilon)
        updates[m_prev] = m_t
        updates[v_prev] = v_t
        updates[param] = param - step
    updates[t_prev] = t
    return updates
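
The returned OrderedDict maps every shared variable (the parameters, the moment accumulators, and the step counter) to its updated expression, so it can be passed directly to theano.function. A minimal usage sketch follows, assuming a softmax-regression cost; the names X, y, W, and b are illustrative and not from the original, and adam() is called as a plain function here even though it takes a self argument.

import numpy as np
import theano
import theano.tensor as T

X = T.matrix('X')
y = T.ivector('y')
W = theano.shared(np.zeros((20, 3), dtype=theano.config.floatX), name='W')
b = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='b')

# Simple softmax classifier as a stand-in cost.
p_y = T.nnet.softmax(T.dot(X, W) + b)
cost = T.nnet.categorical_crossentropy(p_y, y).mean()

# `self` is unused in the snippet above, so pass None when calling it standalone;
# inside a model class it would be invoked as self.adam(cost, params).
updates = adam(None, cost, [W, b])
train_fn = theano.function([X, y], cost, updates=updates)

# Each call performs one Adam step on W and b:
# loss = train_fn(x_batch, y_batch)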