import os
from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T
import lasagne
from lasagne.nonlinearities import relu
from lasagne.updates import total_norm_constraint


def robust_adam(loss, params, learning_rate, beta1=0.9, beta2=0.999, epsilon=1.0e-8):
    # Convert NaNs to zeros.
    def clear_nan(x):
        return T.switch(T.isnan(x), np.float32(0.0), x)

    new = OrderedDict()
    pg = zip(params, lasagne.updates.get_or_compute_grads(loss, params))
    t = theano.shared(lasagne.utils.floatX(0.))
    new[t] = t + 1.0
    # Bias-corrected step size for the current iteration.
    coef = learning_rate * T.sqrt(1.0 - beta2**new[t]) / (1.0 - beta1**new[t])
    for p, g in pg:
        value = p.get_value(borrow=True)
        m = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=p.broadcastable)
        v = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=p.broadcastable)
        new[m] = clear_nan(beta1 * m + (1.0 - beta1) * g)
        new[v] = clear_nan(beta2 * v + (1.0 - beta2) * g**2)
        new[p] = clear_nan(p - coef * new[m] / (T.sqrt(new[v]) + epsilon))
    return new
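
# A minimal usage sketch for `robust_adam` on a toy softmax classifier; the
# network, variable names, and learning rate below are illustrative only and
# not part of the original code.
def _robust_adam_usage_sketch():
    input_var = T.matrix('x')
    target_var = T.ivector('y')
    net = lasagne.layers.InputLayer((None, 100), input_var=input_var)
    net = lasagne.layers.DenseLayer(net, num_units=10,
                                    nonlinearity=lasagne.nonlinearities.softmax)
    prediction = lasagne.layers.get_output(net)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var).mean()
    params = lasagne.layers.get_all_params(net, trainable=True)
    updates = robust_adam(loss, params, learning_rate=1e-3)
    # Each call to the returned function performs one NaN-protected Adam step.
    return theano.function([input_var, target_var], loss, updates=updates)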
def load_model(net, layer='fc8'):
    # `utils.PickleLoad` and `model_dir` come from the surrounding project; the
    # pickle holds pretrained parameter values for the Caffe reference model.
    model_values = utils.PickleLoad(os.path.join(model_dir, 'caffe_reference_%s.pkl' % layer))
    lasagne.layers.set_all_param_values(net[layer], model_values)
class Deconv2DLayer(lasagne.layers.Layer):
    # Transposed ("deconvolution") 2D layer; only the constructor appeared in the
    # original snippet, and the base class is inferred from the add_param /
    # output_shape usage.
    def __init__(self, incoming, target_shape, filter_size, stride=(2, 2),
                 W=lasagne.init.Normal(0.05), b=lasagne.init.Constant(0.),
                 nonlinearity=relu, **kwargs):
        super(Deconv2DLayer, self).__init__(incoming, **kwargs)
        self.target_shape = target_shape
        self.nonlinearity = (lasagne.nonlinearities.identity
                             if nonlinearity is None else nonlinearity)
        self.filter_size = lasagne.utils.as_tuple(filter_size, 2)
        self.stride = lasagne.utils.as_tuple(stride, 2)
        self.W_shape = (incoming.output_shape[1], target_shape[1],
                        self.filter_size[0], self.filter_size[1])
        self.W = self.add_param(W, self.W_shape, name="W")
        if b is not None:
            self.b = self.add_param(b, (target_shape[1],), name="b")
        else:
            self.b = None
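
# A minimal instantiation sketch for `Deconv2DLayer`; the shapes below are
# illustrative, and the sketch assumes the rest of the class (its
# get_output_shape_for / get_output_for methods) is defined in the original file.
def _deconv2d_usage_sketch():
    l_in = lasagne.layers.InputLayer((None, 128, 8, 8))
    # Upsample 8x8 feature maps to 16x16 while reducing channels from 128 to 64.
    l_up = Deconv2DLayer(l_in, target_shape=(None, 64, 16, 16),
                         filter_size=(5, 5), stride=(2, 2))
    return l_up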
def adam(self, cost, params, learning_rate=0.001, beta1=0.9,
         beta2=0.999, epsilon=1e-8):
    # Method from a model class in the original project: Adam with gradient
    # clipping to a global norm of 10 and a fallback for non-finite gradients.
    # (`utils.floatX` comes from the project's imports, e.g. lasagne.utils.floatX.)
    all_grads = T.grad(cost=cost, wrt=params)
    all_grads = total_norm_constraint(all_grads, 10)
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), all_grads)))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    t_prev = theano.shared(utils.floatX(0.))
    updates = OrderedDict()
    t = t_prev + 1
    # Bias-corrected learning rate for step t.
    a_t = learning_rate * T.sqrt(1 - beta2**t) / (1 - beta1**t)
    for param, g_t in zip(params, all_grads):
        # Replace non-finite gradients with a small multiple of the parameter.
        g_t = T.switch(not_finite, 0.1 * param, g_t)
        value = param.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        m_t = beta1 * m_prev + (1 - beta1) * g_t
        v_t = beta2 * v_prev + (1 - beta2) * g_t**2
        step = a_t * m_t / (T.sqrt(v_t) + epsilon)
        updates[m_prev] = m_t
        updates[v_prev] = v_t
        updates[param] = param - step
    updates[t_prev] = t
    return updates
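
# A small sketch of the NaN/Inf fallback used in `adam` above: when the global
# gradient norm is not finite, each gradient is replaced by 0.1 * param instead
# of taking a corrupted step. The function and variable names here are
# illustrative, not part of the original code.
def _nonfinite_fallback_sketch():
    g = T.vector('g')
    param = T.vector('p')
    grad_norm = T.sqrt(T.sqr(g).sum())
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    safe_g = T.switch(not_finite, 0.1 * param, g)
    # e.g. f([nan, 1.0], [2.0, 4.0]) -> [0.2, 0.4]; f([1.0, 1.0], [2.0, 4.0]) -> [1.0, 1.0]
    return theano.function([g, param], safe_g, allow_input_downcast=True)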
def sorted_values(m):#{{{
    # Values of `m` in key order; an OrderedDict keeps its own insertion order.
    if isinstance(m, OrderedDict):
        return m.values()
    else:
        a = sorted(m.keys())
        return [m[k] for k in a]#}}}
#
#def enlarge(a,n):#{{{
#    if n<0:
#        return a[::-n,::-n,:]
#    elif n>0:
#        return np.repeat(np.repeat(a,n,0),n,1)#}}}
#def tile(network,width,height):#{{{
#    network = lasagne.layers.ConcatLayer((network,)*width,axis=2)
#    network = lasagne.layers.ConcatLayer((network,)*height,axis=3)
#    return network#}}}
#
#def imnorm(x):#{{{
#    M=np.max(x)
#    m=np.min(x)
#    l=M-m
#    if l==0:
#        l=1.0
#    res=((x-m)*1.0/l*255.0).astype('uint8')
#    return res#}}}
#def im256(x):#{{{
#    M=1.0
#    m=0.0
#    l=M-m
#    if l==0:
#        l=1.0
#    res=((x-m)*1.0/l*255.0).astype('uint8')
#    return res#}}}
#
#def smooth_abs(x):#{{{
#    return (x*x+utils.floatX(1e-8))**utils.floatX(0.5);#}}}
#
def inputlayer_oneslike(layer, scale=1.0):#{{{
    # Constant input layer shaped like `layer`, filled with `scale`.
    # `ZeroLayer` and `floatX` are helpers defined elsewhere in this project.
    shape = lasagne.layers.get_output_shape(layer)
    res = ZeroLayer(shape, input_var=T.ones(shape, dtype=floatX)*utils.floatX(scale))
    return res#}}}
def inputlayer_ones(shape, scale=1.0):#{{{
    # Same as above, but built from an explicit, fully specified shape.
    return ZeroLayer(shape, input_var=T.ones(shape, dtype=floatX)*utils.floatX(scale))#}}}
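
# A minimal usage sketch for `inputlayer_ones`, assuming `ZeroLayer` behaves like
# a lasagne InputLayer whose data is fixed via `input_var`. The shape must be
# fully specified (T.ones cannot take None entries); all names below are
# illustrative, not from the original code.
def _ones_input_sketch():#{{{
    ones = inputlayer_ones((1, 3, 32, 32), scale=0.5)
    net = lasagne.layers.Conv2DLayer(ones, num_filters=8, filter_size=(3, 3))
    return net#}}}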