def sgd_optimizer(model, lr=0.001, momentum=0.9):
    """Compile Theano train/validation functions for SGD with momentum.

    Args:
        model: object exposing ``params`` (shared variables), ``costs``
            (the first entry is the one differentiated) and ``inputs``.
        lr: learning rate; stored in a shared variable cast to ``floatX``.
        momentum: smoothing factor, must satisfy ``0 <= momentum < 1``.

    Returns:
        ``(train_func, valid_func)``. Both evaluate ``model.costs`` on
        ``model.inputs``; only ``train_func`` applies the updates.

    Raises:
        ValueError: if ``momentum`` lies outside ``[0, 1)``.
    """
    # Validate first, with an explicit exception: an ``assert`` is stripped
    # under ``python -O`` and previously ran only after a shared variable
    # had already been allocated.
    if not (0 <= momentum < 1):
        raise ValueError("momentum must be in [0, 1), got %r" % (momentum,))

    lr = theano.shared(np.array(lr).astype(theano.config.floatX))

    # the updates of SGD with momentum
    updates = []
    grads = T.grad(model.costs[0], model.params)
    for param, grad in zip(model.params, grads):
        # Running average of the gradient, same shape as the parameter and
        # initialised to zeros.
        param_update = theano.shared(param.get_value() * 0.)
        # The parameter step is expressed with the current (pre-update)
        # running average.
        updates.append((param, param - lr * param_update))
        updates.append(
            (param_update, momentum * param_update + (1. - momentum) * grad))

    train_func = theano.function(model.inputs, model.costs, updates=updates)
    valid_func = theano.function(model.inputs, model.costs)
    return train_func, valid_func
python类grad()的实例源码
def e_step(self, epsilon, q, y, *params):
    """Build the symbolic cost and the gradient wrt ``q`` for one E-step.

    A sample ``h`` is drawn from the prior given ``epsilon`` and ``q``, the
    conditional is evaluated on it, and a KL term between posterior and
    prior is added. The KL term comes from the prior directly when
    ``prior.has_kl`` is set, otherwise it is estimated from the log
    probabilities of the sample.

    Returns:
        ``(cost, grad)`` where ``cost`` is the mean conditional energy and
        ``grad`` is the gradient of (energy + KL) with respect to ``q``,
        with ``y`` and ``params`` held constant.
    """
    net = self.model
    prior_params = net.get_prior_params(*params)

    h = net.prior.step_sample(epsilon, q)
    py = net.p_y_given_h(h, *params)

    # y gets a new leading axis before being scored against py.
    log_py_h = -net.conditional.neg_log_prob(y[None, :, :], py)

    if not net.prior.has_kl:
        log_ph = -net.prior.neg_log_prob(h)
        log_qh = -net.posterior.neg_log_prob(h, q[None, :, :])
        KL_q_p = (log_qh - log_ph).mean(axis=0)
    else:
        KL_q_p = net.prior.step_kl_divergence(q, *prior_params)

    y_energy = -log_py_h.mean(axis=0)
    objective = (y_energy + KL_q_p).mean(axis=0)

    # Only q is optimised here; the data and model parameters are frozen.
    frozen = [y] + list(params)
    grad = theano.grad(objective, wrt=q, consider_constant=frozen)

    # The reported cost deliberately excludes the KL term.
    return y_energy.mean(), grad
def update_opt(self, f, target, inputs, reg_coeff):
    """Set up the deferred Hessian-vector-product function for ``f``.

    Stores ``target`` and ``reg_coeff`` on the instance and assigns
    ``self.opt_fun`` an ``ext.lazydict`` whose ``f_Hx_plain`` entry is a
    thunk; compilation only happens when that thunk is called.

    Args:
        f: symbolic expression that is differentiated twice.
        target: object whose ``get_params(trainable=True)`` returns the
            variables to differentiate with respect to.
        inputs: symbolic inputs of ``f``. Any iterable is accepted; it is
            converted to a tuple.
        reg_coeff: stored as ``self.reg_coeff``; not otherwise used here.
    """
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)

    constraint_grads = theano.grad(
        f, wrt=params, disconnected_inputs='warn')
    # One placeholder per parameter: the vector the Hessian is applied to.
    xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])

    # ``xs`` is a tuple, so a list-valued ``inputs`` used to fail with
    # ``TypeError`` on ``inputs + xs``. Normalise to a tuple first.
    all_inputs = tuple(inputs) + xs

    def Hx_plain():
        # Differentiating sum_i <grad_i, x_i> wrt the parameters yields the
        # Hessian-vector product, one piece per parameter.
        Hx_plain_splits = TT.grad(
            TT.sum([TT.sum(g * x)
                    for g, x in zip(constraint_grads, xs)]),
            wrt=params,
            disconnected_inputs='warn'
        )
        return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

    self.opt_fun = ext.lazydict(
        f_Hx_plain=lambda: ext.compile_function(
            inputs=all_inputs,
            outputs=Hx_plain(),
            log_name="f_Hx_plain",
        ),
    )
def create_updates(loss, network, opt, learning_rate, momentum, beta1, beta2):
    """Build the parameter updates for ``network`` under the chosen optimizer.

    Args:
        loss: symbolic scalar loss to differentiate.
        network: Lasagne layer (or layers) whose trainable parameters are
            updated.
        opt: ``'adam'`` or ``'momentum'`` (Nesterov momentum).
        learning_rate: step size passed to the optimizer.
        momentum: momentum coefficient; only used when ``opt == 'momentum'``.
        beta1: Adam ``beta1``; only used when ``opt == 'adam'``.
        beta2: Adam ``beta2``; only used when ``opt == 'adam'``.

    Returns:
        The updates produced by ``adam`` or ``nesterov_momentum``.

    Raises:
        ValueError: if ``opt`` names an unsupported algorithm.
    """
    # Reject an unsupported optimizer before any gradient graph is built.
    if opt not in ('adam', 'momentum'):
        raise ValueError('unknown optimization algorithm: %s' % opt)

    params = lasagne.layers.get_all_params(network, trainable=True)
    grads = theano.grad(loss, params)
    # NOTE: no gradient-norm constraint is applied to the gradients.

    if opt == 'adam':
        return adam(grads, params=params, learning_rate=learning_rate,
                    beta1=beta1, beta2=beta2)
    return nesterov_momentum(grads, params=params,
                             learning_rate=learning_rate, momentum=momentum)
conjugate_gradient_optimizer.py 文件源码
项目:rllabplusplus
作者: shaneshixiang
项目源码
文件源码
阅读 20
收藏 0
点赞 0
评论 0
def update_opt(self, f, target, inputs, reg_coeff):
    """Set up the deferred Hessian-vector-product function for ``f``.

    Stores ``target`` and ``reg_coeff`` on the instance and assigns
    ``self.opt_fun`` an ``ext.lazydict`` whose ``f_Hx_plain`` entry is a
    thunk; compilation only happens when that thunk is called.

    Args:
        f: symbolic expression that is differentiated twice.
        target: object whose ``get_params(trainable=True)`` returns the
            variables to differentiate with respect to.
        inputs: symbolic inputs of ``f``. Any iterable is accepted; it is
            converted to a tuple.
        reg_coeff: stored as ``self.reg_coeff``; not otherwise used here.
    """
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)

    constraint_grads = theano.grad(
        f, wrt=params, disconnected_inputs='warn')
    # One placeholder per parameter: the vector the Hessian is applied to.
    xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])

    # ``xs`` is a tuple, so a list-valued ``inputs`` used to fail with
    # ``TypeError`` on ``inputs + xs``. Normalise to a tuple first.
    all_inputs = tuple(inputs) + xs

    def Hx_plain():
        # Differentiating sum_i <grad_i, x_i> wrt the parameters yields the
        # Hessian-vector product, one piece per parameter.
        Hx_plain_splits = TT.grad(
            TT.sum([TT.sum(g * x)
                    for g, x in zip(constraint_grads, xs)]),
            wrt=params,
            disconnected_inputs='warn'
        )
        return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

    self.opt_fun = ext.lazydict(
        f_Hx_plain=lambda: ext.compile_function(
            inputs=all_inputs,
            outputs=Hx_plain(),
            log_name="f_Hx_plain",
        ),
    )
def fit(self, weights, o_error, tpo):
    """Build update pairs that combine a velocity step with an RMS-scaled step.

    For every weight this keeps a velocity (from ``self.t_velocity``) and a
    running average of squared gradients (from ``self.t_cache``). The rates
    are read from ``tpo`` under the keys ``"momentum_rate"``,
    ``"learn_rate"`` and ``"decay_rate"``.

    Returns:
        A list of ``(variable, new_value)`` pairs in the order weight,
        velocity, cache for each weight.
    """
    gradients = T.grad(o_error, weights)
    updates = []
    state = zip(self.t_cache, self.t_velocity, weights, gradients)
    for cache, velocity, weight, gradient in state:
        # velocity <- momentum_rate * velocity - learn_rate * gradient
        momentum_part = T.mul(tpo["momentum_rate"], velocity)
        gradient_part = T.mul(tpo["learn_rate"], gradient)
        next_velocity = T.sub(momentum_part, gradient_part)

        # cache <- decay * cache + (1 - decay) * gradient**2
        kept = T.mul(tpo["decay_rate"], cache)
        fresh = T.mul(T.sub(1, tpo["decay_rate"]), T.sqr(gradient))
        next_cache = T.add(kept, fresh)

        # weight <- weight + velocity - learn_rate * gradient / sqrt(cache + eps)
        rms = T.sqrt(T.add(next_cache, 0.1**8))
        scaled_step = T.true_div(T.mul(gradient, tpo["learn_rate"]), rms)
        next_weight = T.sub(T.add(weight, next_velocity), scaled_step)

        updates.extend([
            (weight, next_weight),
            (velocity, next_velocity),
            (cache, next_cache),
        ])
    return updates
###### Nesterov momentum
########################################
def fit(self, weights, o_error, tpo):
    """Build update pairs that scale each gradient by a running RMS.

    ``self.t_cache`` holds, per weight, the running average of squared
    gradients; ``tpo`` supplies ``"decay_rate"`` and ``"learn_rate"``.

    Returns:
        A list of ``(variable, new_value)`` pairs in the order weight,
        cache for each weight.
    """
    gradients = theano.grad(o_error, weights)
    updates = []
    for cache, weight, gradient in zip(self.t_cache, weights, gradients):
        next_cache = (tpo["decay_rate"] * cache
                      + (1 - tpo["decay_rate"]) * T.sqr(gradient))
        # The small constant keeps the division finite when the cache is 0.
        scaled_step = (gradient * tpo["learn_rate"]) / T.sqrt(next_cache + 0.1**8)
        updates += [(weight, weight - scaled_step), (cache, next_cache)]
    return updates
###### ADADELTA
########################################
def fit(self, weights, o_error, tpo):
    """Build update pairs for gradient descent with a velocity term.

    ``self.t_velocity`` holds one velocity per weight; ``tpo`` supplies
    ``"momentum_rate"`` and ``"learn_rate"``.

    Returns:
        A list of ``(variable, new_value)`` pairs in the order weight,
        velocity for each weight.
    """
    gradients = theano.grad(o_error, weights)
    updates = []
    for velocity, weight, gradient in zip(self.t_velocity, weights, gradients):
        next_velocity = (tpo["momentum_rate"] * velocity
                         - tpo["learn_rate"] * gradient)
        updates += [
            (weight, weight + next_velocity),
            (velocity, next_velocity),
        ]
    return updates
###### Vanilla SGD
########################################
test_conv2d_model_tensorflow_ordering.py 文件源码
项目:deeplift
作者: kundajelab
项目源码
文件源码
阅读 30
收藏 0
点赞 0
评论 0
def test_convert_conv2d_model_compute_scores(self):
    # Converted sequential model: contributions must equal gradient * input.
    if self.keras_version <= 0.2:
        # Nothing is checked for Keras versions up to and including 0.2.
        return

    converted = kc.convert_sequential_model(model=self.keras_model)
    contribs_func = converted.get_target_contribs_func(
        find_scores_layer_idx=0,
        target_layer_idx=-2)

    actual = contribs_func(task_idx=0,
                           input_data_list=[self.inp],
                           batch_size=10,
                           progress_update=None)
    # when biases are 0 and ref is 0, deeplift is the same as grad*inp
    expected = self.grad_func(self.inp) * self.inp
    np.testing.assert_almost_equal(actual, expected, decimal=6)
def test_convert_conv1d_model_compute_scores(self):
    # Converted graph model with two inputs: contributions must equal
    # gradient * input for each of them.
    # Explicit comparison kept on purpose: only a value equal to False skips.
    if self.run_graph_tests == False:
        return

    converted = kc.convert_graph_model(
        model=self.keras_model,
        nonlinear_mxts_mode=NonlinearMxtsMode.Rescale)
    contribs_func = converted.get_target_contribs_func(
        find_scores_layer_name=["inp1", "inp2"],
        pre_activation_target_layer_name="output_preact")

    grads_inp1, grads_inp2 = self.grad_func(self.inp1, self.inp2)

    data = {'inp1': self.inp1, 'inp2': self.inp2}
    references = {'inp1': np.zeros_like(self.inp1),
                  'inp2': np.zeros_like(self.inp2)}
    actual = np.array(contribs_func(task_idx=0,
                                    input_data_list=data,
                                    input_references_list=references,
                                    batch_size=10,
                                    progress_update=None))
    # when biases are 0 and ref is 0, deeplift is the same as grad*inp
    expected = np.array([grads_inp1 * self.inp1,
                         grads_inp2 * self.inp2])
    np.testing.assert_almost_equal(actual, expected, decimal=6)
def build_bprop_graph(self):
    """Create the ``GradientDescent`` algorithm and store it on the instance.

    The costs are the keys of ``self.link_here('costs')``. Either every
    cost is a ``ParametersLink`` (gradients are then computed here, cost by
    cost) or none is (the costs are summed and the gradient computation is
    left to ``GradientDescent``). Mixing both kinds is rejected.
    """
    optimizer = self.get_optimizer()

    costs = self.link_here('costs').keys()
    linked = [isinstance(c, ParametersLink) for c in costs]

    if not any(linked):
        # Plain costs: hand over a single summed cost, no explicit gradients.
        cost = sum(costs)
        grads = None
    else:
        assert all(linked), (
            "Some costs have parameters associated "
            "to them and others don't. All costs need to be binded.")
        grads = OrderedDict()
        for linked_cost in costs:
            # NOTE(review): keys come from `.parameters` while the gradient
            # is taken wrt `.params` - confirm both name the same variables
            # in the same order.
            grads.update(zip(
                linked_cost.parameters,
                theano.grad(linked_cost.model_var, linked_cost.params)))
        cost = None

    self.algorithm = GradientDescent(
        cost=cost,
        gradients=grads,
        parameters=self.model_parameters,
        step_rule=optimizer)
def update_opt(self, f, target, inputs, reg_coeff):
    """Set up the deferred Hessian-vector-product function for ``f``.

    Stores ``target`` and ``reg_coeff`` on the instance and assigns
    ``self.opt_fun`` an ``ext.lazydict`` whose ``f_Hx_plain`` entry is a
    thunk; compilation only happens when that thunk is called.

    Args:
        f: symbolic expression that is differentiated twice.
        target: object whose ``get_params(trainable=True)`` returns the
            variables to differentiate with respect to.
        inputs: symbolic inputs of ``f``. Any iterable is accepted; it is
            converted to a tuple.
        reg_coeff: stored as ``self.reg_coeff``; not otherwise used here.
    """
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)

    constraint_grads = theano.grad(
        f, wrt=params, disconnected_inputs='warn')
    # One placeholder per parameter: the vector the Hessian is applied to.
    xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])

    # ``xs`` is a tuple, so a list-valued ``inputs`` used to fail with
    # ``TypeError`` on ``inputs + xs``. Normalise to a tuple first.
    all_inputs = tuple(inputs) + xs

    def Hx_plain():
        # Differentiating sum_i <grad_i, x_i> wrt the parameters yields the
        # Hessian-vector product, one piece per parameter.
        Hx_plain_splits = TT.grad(
            TT.sum([TT.sum(g * x)
                    for g, x in zip(constraint_grads, xs)]),
            wrt=params,
            disconnected_inputs='warn'
        )
        return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

    self.opt_fun = ext.lazydict(
        f_Hx_plain=lambda: ext.compile_function(
            inputs=all_inputs,
            outputs=Hx_plain(),
            log_name="f_Hx_plain",
        ),
    )
def update_opt(self, f, target, inputs, reg_coeff):
    """Set up the deferred Hessian-vector-product function for ``f``.

    Stores ``target`` and ``reg_coeff`` on the instance and assigns
    ``self.opt_fun`` an ``ext.lazydict`` whose ``f_Hx_plain`` entry is a
    thunk; compilation only happens when that thunk is called.

    Args:
        f: symbolic expression that is differentiated twice.
        target: object whose ``get_params(trainable=True)`` returns the
            variables to differentiate with respect to.
        inputs: symbolic inputs of ``f``. Any iterable is accepted; it is
            converted to a tuple.
        reg_coeff: stored as ``self.reg_coeff``; not otherwise used here.
    """
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)

    constraint_grads = theano.grad(
        f, wrt=params, disconnected_inputs='warn')
    # One placeholder per parameter: the vector the Hessian is applied to.
    xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])

    # ``xs`` is a tuple, so a list-valued ``inputs`` used to fail with
    # ``TypeError`` on ``inputs + xs``. Normalise to a tuple first.
    all_inputs = tuple(inputs) + xs

    def Hx_plain():
        # Differentiating sum_i <grad_i, x_i> wrt the parameters yields the
        # Hessian-vector product, one piece per parameter.
        Hx_plain_splits = TT.grad(
            TT.sum([TT.sum(g * x)
                    for g, x in zip(constraint_grads, xs)]),
            wrt=params,
            disconnected_inputs='warn'
        )
        return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

    self.opt_fun = ext.lazydict(
        f_Hx_plain=lambda: ext.compile_function(
            inputs=all_inputs,
            outputs=Hx_plain(),
            log_name="f_Hx_plain",
        ),
    )
def test_retNone1(self):
    """An op whose grad() returns None must be rejected with TypeError."""
    class retNone(gof.op.Op):
        __props__ = ()

        def make_node(self):
            node_inputs = [theano.tensor.vector()]
            node_outputs = [theano.tensor.vector()]
            return gof.Apply(self, node_inputs, node_outputs)

        def grad(self, inp, grads):
            x, = inp
            gz, = grads
            # No return statement: the method yields None on purpose.

    node = retNone().make_node()
    self.assertRaises(
        TypeError, grad_sources_inputs, [(node.out, one)], None)
def test_1in_1out(self):
    """The gradient returned by a 1-input/1-output op is used as-is."""
    gval = theano.tensor.matrix()

    class O(gof.op.Op):
        __props__ = ()

        def make_node(self):
            node_inputs = [theano.tensor.matrix()]
            node_outputs = [theano.tensor.matrix()]
            return gof.Apply(self, node_inputs, node_outputs)

        def grad(self, inp, grads):
            return (gval,)

    node = O().make_node()
    grad_map = grad_sources_inputs([(node.outputs[0], one)], None)
    # Identity, not equality: the very same variable must come back.
    self.assertTrue(grad_map[node.inputs[0]] is gval)
def test_1in_Nout(self):
    """The gradient returned by a 1-input/2-output op is used as-is."""
    gval = theano.tensor.matrix()

    class O(gof.op.Op):
        __props__ = ()

        def make_node(self):
            node_inputs = [theano.tensor.matrix()]
            node_outputs = [theano.tensor.scalar(), theano.tensor.scalar()]
            return gof.Apply(self, node_inputs, node_outputs)

        def grad(self, inp, grads):
            # The unpacking doubles as a check on the argument counts.
            x, = inp
            gz1, gz2 = grads
            return (gval,)

    node = O().make_node()
    grad_map = grad_sources_inputs([(node.outputs[0], one)], None)
    self.assertTrue(grad_map[node.inputs[0]] is gval)
def test_Nin_1out(self):
    """Each input of a 2-input/1-output op receives its own gradient."""
    gval0 = theano.tensor.scalar()
    gval1 = theano.tensor.scalar()

    class O(gof.op.Op):
        __props__ = ()

        def make_node(self):
            node_inputs = [theano.tensor.scalar(), theano.tensor.scalar()]
            node_outputs = [theano.tensor.matrix()]
            return gof.Apply(self, node_inputs, node_outputs)

        def grad(self, inp, grads):
            # The unpacking doubles as a check on the argument counts.
            x0, x1 = inp
            gz, = grads
            return (gval0, gval1)

    node = O().make_node()
    grad_map = grad_sources_inputs([(node.outputs[0], one)], None)
    first_input, second_input = node.inputs[0], node.inputs[1]
    self.assertTrue(grad_map[first_input] is gval0)
    self.assertTrue(grad_map[second_input] is gval1)
def test_Nin_Nout(self):
    """Each input of a 2-input/2-output op receives its own gradient."""
    gval0 = theano.tensor.matrix()
    gval1 = theano.tensor.matrix()

    class O(gof.op.Op):
        __props__ = ()

        def make_node(self):
            node_inputs = [theano.tensor.matrix(), theano.tensor.matrix()]
            node_outputs = [theano.tensor.matrix(), theano.tensor.matrix()]
            return gof.Apply(self, node_inputs, node_outputs)

        def grad(self, inp, grads):
            return (gval0, gval1)

    node = O().make_node()
    grad_map = grad_sources_inputs([(node.outputs[0], one)], None)
    first_input, second_input = node.inputs[0], node.inputs[1]
    self.assertTrue(grad_map[first_input] is gval0)
    self.assertTrue(grad_map[second_input] is gval1)
def test_unimplemented_grad_grad(self):
    # An op that declares its gradient as not implemented must make
    # theano.gradient.grad raise a TypeError.
    class DummyOp(gof.Op):
        __props__ = ()

        def make_node(self, x):
            return gof.Apply(self, [x], [x.type()])

        def grad(self, inputs, output_grads):
            placeholder = theano.gradient.grad_not_implemented(
                self, 0, inputs[0])
            return [placeholder]

    scalar_in = theano.tensor.scalar()
    op_out = DummyOp()(scalar_in)
    self.assertRaises(TypeError, theano.gradient.grad, op_out, scalar_in)
def test_downcast_dtype(self):
    # The gradient of a cost wrt a float32 variable must stay float32
    # instead of being upcast to float64.
    # fscalar is float32 whatever the value of floatX.
    x = theano.tensor.fscalar('x')
    doubled = x * 2
    z = theano.tensor.lscalar('z')
    total = doubled + z

    grad_x, grad_doubled, grad_z, grad_total = theano.grad(
        total, [x, doubled, z, total])

    # For the intermediate and integer variables the float width is not
    # specified (it may depend on floatX), so either one is accepted.
    float_dtypes = ('float32', 'float64')
    for unspecified in (grad_total, grad_z, grad_doubled):
        assert unspecified.dtype in float_dtypes

    # The output gradient handed to op.grad is downcast to float32, so the
    # gradient wrt x has to be float32 as well.
    assert grad_x.dtype == 'float32'
def test_grad_constant(self):
    # A variable listed in consider_constant and a genuine Constant must
    # be treated the same way by the gradient.
    x = theano.tensor.scalar()
    y = theano.tensor.scalar()
    sum_with_var = x + y
    sum_with_const = one + y

    grad_var = theano.tensor.grad(sum_with_var, x, consider_constant=[x])
    grad_const = theano.tensor.grad(sum_with_const, one)

    compiled = theano.function([x, y], [grad_var, grad_const])
    value_var, value_const = compiled(1, .5)

    if np.allclose(value_var, value_const):
        return
    raise AssertionError(
        "Gradient using consider constant is " + str(value_var)
        + " but gradient with respect to the same Constant is "
        + str(value_const))
def test_dxdx():
    # The gradient of a scalar with respect to itself is 1.
    # An integer is used deliberately: people keep changing this gradient
    # to 0 for integers, but under our reading of the Op contract it
    # should be 1. If you feel this test needs changing you are probably
    # modifying the Op contract and should get the approval of several
    # people on theano-dev first.
    x = theano.tensor.iscalar()
    self_grad = theano.tensor.grad(x, x)
    value = self_grad.eval({x: 12})
    assert np.allclose(value, 1.)
def test_undefined_cost_grad():
    # Declaring the cost non-differentiable through known_grads must be
    # honoured by the rest of the system.
    # Ops built around minigraphs (OpFromGraph, scan) rely on this to
    # implement Op.grad by forwarding ograds to known_grads.
    x = theano.tensor.iscalar()
    y = theano.tensor.iscalar()
    cost = x + y
    assert cost.dtype in theano.tensor.discrete_dtypes

    undefined = {cost: NullType()()}
    try:
        theano.tensor.grad(cost, [x, y], known_grads=undefined)
    except theano.gradient.NullTypeGradError:
        pass
    else:
        raise AssertionError("An undefined gradient has been ignored.")
def test_disconnected_cost_grad():
    # Declaring the cost disconnected through known_grads must be
    # honoured by the rest of the system.
    # Ops built around minigraphs (OpFromGraph, scan) rely on this to
    # implement Op.grad by forwarding ograds to known_grads.
    x = theano.tensor.iscalar()
    y = theano.tensor.iscalar()
    cost = x + y
    assert cost.dtype in theano.tensor.discrete_dtypes

    disconnected = {cost: gradient.DisconnectedType()()}
    try:
        theano.tensor.grad(cost, [x, y],
                           known_grads=disconnected,
                           disconnected_inputs='raise')
    except theano.gradient.DisconnectedInputError:
        pass
    else:
        raise AssertionError("A disconnected gradient has been ignored.")
def test_grad(self):
    # Terms wrapped in consider_constant must act as constants when the
    # gradient is taken.
    tt = theano.tensor
    values = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)
    x = tt.matrix('x')

    # (expression, expected gradient of expression.sum() wrt x)
    cases = [
        (x * gradient.consider_constant(x), x),
        (x * gradient.consider_constant(tt.exp(x)), tt.exp(x)),
        (gradient.consider_constant(x), tt.constant(0.)),
        (x**2 * gradient.consider_constant(x), 2 * x**2),
    ]

    for expression, expected in cases:
        symbolic_grad = gradient.grad(expression.sum(), x)
        # gradient according to theano
        theano_fn = theano.function(
            [x], symbolic_grad, on_unused_input='ignore')
        # desired gradient
        expected_fn = theano.function(
            [x], expected, on_unused_input='ignore')
        assert np.allclose(theano_fn(values), expected_fn(values))
def test_grad(self):
    # Terms wrapped in zero_grad must act as constants when the gradient
    # is taken.
    tt = theano.tensor
    values = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)
    x = tt.matrix('x')

    # (expression, expected gradient of expression.sum() wrt x)
    cases = [
        (x * gradient.zero_grad(x), x),
        (x * gradient.zero_grad(tt.exp(x)), tt.exp(x)),
        (gradient.zero_grad(x), tt.constant(0.)),
        (x**2 * gradient.zero_grad(x), 2 * x**2),
    ]

    for expression, expected in cases:
        symbolic_grad = gradient.grad(expression.sum(), x)
        # gradient according to theano
        theano_fn = theano.function(
            [x], symbolic_grad, on_unused_input='ignore')
        # desired gradient
        expected_fn = theano.function(
            [x], expected, on_unused_input='ignore')
        assert np.allclose(theano_fn(values), expected_fn(values))
def test_csm_grad(self):
    # Gradient of a densified CSM matrix wrt its data vector, checked for
    # both the csr and the csc layouts.
    for sparsetype in ('csr', 'csc'):
        x = tensor.vector()
        y = tensor.ivector()
        z = tensor.ivector()
        s = tensor.ivector()

        # sp.csr_matrix or sp.csc_matrix, looked up by name.
        make_sparse = getattr(sp, sparsetype + '_matrix')
        spm = make_sparse(random_lil((300, 400), config.floatX, 5))

        dense_total = dense_from_sparse(CSM(sparsetype)(x, y, z, s)).sum()
        out = tensor.grad(dense_total, x)

        symbolic_inputs = [x, y, z, s]
        numeric_inputs = [spm.data, spm.indices, spm.indptr, spm.shape]
        self._compile_and_check(symbolic_inputs,
                                [out],
                                numeric_inputs,
                                (CSMGrad, CSMGradC))
def test_other_grad_tests(self):
    # Gradient of Prod along each axis on inputs that contain zeros,
    # compared against hand-written values.
    x = theano.tensor.dmatrix()
    x_val1 = numpy.array([[1, 2, 3], [0, 5, 6], [0, 0, 9]],
                         dtype='float32')
    x_val2 = numpy.array([[1, 2, 0], [0, 5, 6], [7, 8, 9], [9, 10, 0]],
                         dtype='float32')
    rng = numpy.random.RandomState(43)

    # Product along axis 1.
    p = Prod(axis=1)
    grad_p = theano.tensor.grad(p(x).sum(), x)
    grad_fn = theano.function([x], grad_p, mode=self.mode)

    expected_val1 = [[6., 3., 2.], [30., 0., 0.], [0., 0., 0.]]
    assert numpy.allclose(grad_fn(x_val1), expected_val1)

    expected_val2 = [[0., 0., 2.], [30., 0., 0.],
                     [72., 63., 56.], [0., 0., 90.]]
    assert numpy.allclose(grad_fn(x_val2), expected_val2)

    # Product along axis 0.
    p_axis0 = Prod(axis=0)
    grad_p_axis0 = theano.tensor.grad(p_axis0(x).sum(), x)
    grad_fn_axis0 = theano.function([x], grad_p_axis0, mode=self.mode)

    expected_axis0 = [[0., 400., 0.], [63., 160., 0.],
                      [0., 100., 0.], [0., 80., 0.]]
    assert numpy.allclose(grad_fn_axis0(x_val2), expected_axis0)

    tensor.verify_grad(p, [x_val1], rng=rng, mode=self.mode)
def test_gt_grad():
    """Regression test taken from a user report.

    Elemwise.grad used to return an expression too complicated for
    get_scalar_constant_value to recognise as 0, so gradient.grad
    reported it as an invalid gradient of an integer. Building the
    gradient below must simply succeed.
    """
    floatX = config.floatX
    tt = theano.tensor

    input_ = tt.vector(dtype=floatX)
    uniform_draws = numpy.random.RandomState(1234).uniform(
        low=-1, high=1, size=(2, 2))
    W = theano.shared(value=numpy.asarray(uniform_draws, dtype=floatX),
                      name='weights')

    correct_score = tt.dot(input_, W)
    wrong_input = tt.vector(dtype=floatX)
    # Same graph as correct_score, fed by a different input vector.
    wrong_score = theano.clone(correct_score, {input_: wrong_input})

    # Hinge loss
    margins = tt.ones_like(correct_score) - correct_score + wrong_score
    cost = (margins * (margins > 0)).sum()

    tt.grad(cost, input_)
def test_grad_2d_inc_set_subtensor(self):
    # Gradients of inc_subtensor / set_subtensor on a 2-d slice, for
    # several combinations of target shape and increment shape.
    shape_pairs = [
        [(2, 3), (2, 2)],
        [(3, 2), (2, 2)],
        [(3, 2), (1, 2)],
        [(3, 2), (2,)],
    ]
    for n_shape, m_shape in shape_pairs:
        for op in (inc_subtensor, set_subtensor):
            subi = 2
            data = numpy.asarray(rand(*n_shape), dtype=self.dtype)
            n = self.shared(data)
            z = scal.constant(subi)
            m = matrix('m', dtype=self.dtype)
            mv = numpy.asarray(rand(*m_shape), dtype=self.dtype)

            t = op(n[:z, :z], m)
            # Building the symbolic gradients is part of the check even
            # though the results are not inspected.
            gn, gm = theano.tensor.grad(theano.tensor.sum(t), [n, m])

            # Numerical check wrt the increment, then wrt the target.
            utt.verify_grad(lambda m_val: op(n[:z, :z], m_val), [mv])
            utt.verify_grad(lambda nn: op(nn[:z, :z], mv), [data])