def _GradMom(op, v, out_grad, batch_size, mom=2):
    """Dispatch to the GradMom function that matches the type of ``op``.

    The selected function is called inside a name scope derived from the
    operation name (``op.name + "_grad_mom"``).

    Inputs:
      :op: A tensorflow operation of type "MatMul", "Conv2D" or "Add".
      :v: The read-tensor of the trainable variable consumed by this operation.
      :out_grad: The tensor containing the gradient w.r.t. the output of
          the op (as computed by ``tf.gradients``).
      :batch_size: Batch size ``m`` (constant integer or scalar int tf.Tensor)
      :mom: Integer moment desired (defaults to 2).

    Returns the result of the type-specific function. Raises ``ValueError``
    for any other operation type."""
    with tf.name_scope(op.name+"_grad_mom"):
        kind = op.type
        # Reject unsupported operation types up front.
        if kind not in ("MatMul", "Conv2D", "Add"):
            raise ValueError("Don't know how to compute gradient moment for "
                             "variable {}, consumed by operation of type {}".format(
                                 v.name, kind))
        if kind == "MatMul":
            handler = _MatMulGradMom
        elif kind == "Conv2D":
            handler = _Conv2DGradMom
        else:
            handler = _AddGradMom
        return handler(op, v, out_grad, batch_size, mom)
python类gradients()的实例源码
gradient_moment.py 文件源码
项目:probabilistic_line_search
作者: ProbabilisticNumerics
项目源码
文件源码
阅读 31
收藏 0
点赞 0
评论 0
gradient_moment.py 文件源码
项目:probabilistic_line_search
作者: ProbabilisticNumerics
项目源码
文件源码
阅读 31
收藏 0
点赞 0
评论 0
def _MatMulGradMom(op, W, out_grad, batch_size, mom=2):
    """Gradient moment of a weight matrix that enters a MatMul operation.

    The operation is assumed to be ``Z=tf.matmul(A, W)`` with ``W`` a d1xd2
    weight matrix and ``A`` the nxd1 activations of the previous layer (n being
    the batch size). ``out_grad`` is the gradient w.r.t. ``Z``, as computed by
    ``tf.gradients()``. Transposes in the MatMul operation are not allowed.

    Inputs:
      :op: The MatMul operation
      :W: The weight matrix (the tensor, not the variable)
      :out_grad: The tensor of gradient w.r.t. to the output of the op
      :batch_size: Batch size n (constant integer or scalar int tf.Tensor)
      :mom: Integer moment desired (defaults to 2)

    Returns ``batch_size * matmul(A**mom, out_grad**mom, transpose_a=True)``
    with element-wise powers."""
    assert op.type == "MatMul"
    transposed_a = op.get_attr("transpose_a")
    transposed_b = op.get_attr("transpose_b")
    # W must be the second MatMul input and neither input may be transposed.
    assert W is op.inputs[1] and not (transposed_a or transposed_b)
    activations = op.inputs[0]
    grad_to_mom = tf.pow(out_grad, mom)
    act_to_mom = tf.pow(activations, mom)
    product = tf.matmul(act_to_mom, grad_to_mom, transpose_a=True)
    # NOTE(review): tf.mul was renamed tf.multiply in TensorFlow 1.0 -- confirm
    # which TensorFlow version this file targets.
    return tf.mul(batch_size, product)
def testUsage(self):
    """Variables created under the stop_gradient custom getter get no gradient.

    Builds a ``snt.Linear`` module under a variable scope whose custom getter
    is ``snt.custom_getters.stop_gradient`` and checks that ``tf.gradients``
    returns ``None`` for both of its trainable variables.
    """
    # NOTE(review): the source lost its indentation; only the module
    # construction is assumed to sit inside the variable scope -- confirm.
    with tf.variable_scope("", custom_getter=snt.custom_getters.stop_gradient):
        lin1 = snt.Linear(10, name="linear1")

    inputs = tf.placeholder(tf.float32, [10, 10])
    outputs = lin1(inputs)

    trainable = tf.trainable_variables()
    trainable_names = [var.name for var in trainable]

    self.assertEqual(2, len(trainable))
    for expected_name in ("linear1/w:0", "linear1/b:0"):
        self.assertIn(expected_name, trainable_names)

    grad_by_name = dict(zip(trainable_names, tf.gradients(outputs, trainable)))

    for expected_name in ("linear1/w:0", "linear1/b:0"):
        self.assertEqual(None, grad_by_name[expected_name])
def testOpClip(self):
    """Checks that ``snt.clip_gradient`` clips only the gradient w.r.t. its input.

    The forward value must equal the input, the gradient w.r.t. the op output
    must be unclipped, and the gradient w.r.t. the op input must lie in [2, 3].
    """
    x = tf.placeholder(tf.float32, shape=[2, 1])
    clipped = snt.clip_gradient(x, 2, 3)
    z = tf.reduce_sum(clipped * clipped)
    grad_wrt_output = tf.gradients(z, clipped)[0]
    grad_wrt_input = tf.gradients(z, x)[0]

    x_np = np.array([[0.5], [2]])
    fetches = [clipped, grad_wrt_output, grad_wrt_input]
    with self.test_session() as sess:
        y_np, dzdy_np, dzdx_np = sess.run(fetches, feed_dict={x: x_np})

        self.assertAllEqual(y_np, x_np)
        # We do not expect the gradients with respect to the output to be clipped.
        self.assertAllEqual(dzdy_np, np.array([[1], [4]]))
        # We expect the gradients with respect to the input to be clipped [2, 3].
        self.assertAllEqual(dzdx_np, np.array([[2], [3]]))
def testOpScale(self, x_, scale):
    """Checks op type, value and gradient of ``snt.scale_gradient`` for one scale.

    Args:
      x_: Value fed for the input placeholder.
      scale: Gradient scale passed to ``snt.scale_gradient``.
    """
    x = tf.placeholder(tf.float32, [1])
    y = snt.scale_gradient(x * x, scale)
    dydx = tf.gradients([y], [x])[0]

    if scale == 0.0:
        # A zero scale is expected to stop the gradient, so nothing is run.
        self.assertEqual(y.op.type, "StopGradient")
        self.assertIs(dydx, None)
        return

    expected_type = "Identity" if scale == 1.0 else "ScaleGradient_float32"
    self.assertEqual(y.op.type, expected_type)

    with self.test_session() as sess:
        dydx_, y_ = sess.run([dydx, y], feed_dict={x: [x_]})

        self.assertAlmostEqual(dydx_[0], 2 * scale * x_, places=6)
        self.assertAlmostEqual(y_[0], x_ ** 2, places=6)
def testTwoOps(self):
    """Tests that the op can be instantiated twice with appropriate results.

    Implementations with inappropriate global registration of gradients will
    fail this test.
    """
    x = tf.placeholder(tf.float32, [1])
    y = x * x
    # Apply the same gradient scaling twice in a row.
    for _ in range(2):
        y = snt.scale_gradient(y, 0.1)
    dydx = tf.gradients([y], [x])[0]

    with self.test_session() as sess:
        dydx_, y_ = sess.run([dydx, y], feed_dict={x: [3.0]})

        self.assertAlmostEqual(dydx_[0], 2 * 0.1**2 * 3.0, places=6)
        self.assertAlmostEqual(y_[0], 3.0 ** 2, places=6)
def build_graph(self, kl_first_fixed, weights):
    """Build the second-order product of ``kl_first_fixed`` with a tangent vector.

    The gradient of ``kl_first_fixed.node`` w.r.t. every flattened weight is
    multiplied with the matching slice of a flat tangent placeholder and
    summed. Those sums are differentiated again w.r.t. the weights, and the
    results are flattened and concatenated into one vector.

    The tangent placeholder is stored as ``self.ph_tangent``. The returned
    tensor is called ``fvp`` in the original code (presumably "Fisher-vector
    product" -- confirm with the caller).
    """
    flat_weights = list(utils.Utils.flatten(weights.node))
    first_grads = tf.gradients(kl_first_fixed.node, flat_weights)
    tangent = graph.Placeholder(np.float32, shape=(None,))

    products = []
    offset = 0
    for grad in first_grads:
        # Number of elements this gradient occupies in the flat tangent.
        count = np.prod(grad.shape.as_list())
        flat_grad = tf.reshape(grad, [-1])
        tangent_piece = tangent.node[offset:offset + count]
        products.append(tf.reduce_sum(flat_grad * tangent_piece))
        offset += count

    second_grads = tf.gradients(products, flat_weights)
    flattened = [tf.reshape(grad, [-1]) for grad in second_grads]
    fvp = tf.concat(flattened, axis=0)

    self.ph_tangent = tangent
    return fvp
def build_graph(self, weights, loss=None, optimizer=None, norm=False, batch_size=None, grad_ys=None):
    """Build the gradient-computation and/or gradient-application sub-graphs.

    When ``loss`` is given, sets ``self.global_norm`` (global norm before
    clipping) and ``self.calculate`` (gradients reconstructed into the
    structure of ``weights.node``). Gradients are passed through
    ``tf.check_numerics``, divided by ``batch_size`` when it is given, and
    clipped by global norm when ``norm`` is truthy.

    When ``optimizer`` is given, sets ``self.ph_gradients`` (placeholders
    built from ``weights``) and ``self.apply`` (the optimizer's
    ``apply_gradients`` op fed from those placeholders).
    """
    if loss is not None:
        target = loss.node
        flat_weights = list(utils.Utils.flatten(weights.node))
        raw_grads = tf.gradients(target, flat_weights, grad_ys)
        grads = [
            tf.check_numerics(grad, 'gradient_%d' % index)
            for index, grad in enumerate(raw_grads)
        ]
        if batch_size is not None:
            grads = [grad / float(batch_size) for grad in grads]
        # store gradients global norm before clipping
        self.global_norm = tf.global_norm(grads)
        # clip gradients after global norm has been stored
        if norm:
            grads, _ = tf.clip_by_global_norm(grads, norm)
        self.calculate = graph.TfNode(utils.Utils.reconstruct(grads, weights.node))

    if optimizer is not None:
        self.ph_gradients = graph.Placeholders(weights)
        grads_and_weights = utils.Utils.izip(self.ph_gradients.checked, weights.node)
        self.apply = graph.TfNode(optimizer.node.apply_gradients(grads_and_weights))
def hessian_vec_fw(ys, xs, vs, grads=None):
    """Implements Hessian vector product using forward on backward AD.

    Args:
      ys: Loss function.
      xs: Weights, list of tensors.
      vs: List of tensors to multiply, for each weight tensor.
      grads: Optional precomputed gradients of ys w.r.t. xs. When None they
        are computed here with ``tf.gradients``.

    Returns:
      Hv: Hessian vector product, same size, same shape as xs.

    Raises:
      ValueError: If xs is a list and vs does not have the same length.
    """
    # Validate the input. isinstance (rather than an exact type comparison)
    # also covers list subclasses.
    if isinstance(xs, list) and len(vs) != len(xs):
        raise ValueError("xs and vs must have the same length.")

    if grads is None:
        grads = tf.gradients(ys, xs, gate_gradients=True)
    # Forward-mode pass over the backward-mode gradients.
    return forward_gradients(grads, xs, vs, gate_gradients=True)
def hessian_vec_bk(ys, xs, vs, grads=None):
    """Implements Hessian vector product using backward on backward AD.

    Args:
      ys: Loss function.
      xs: Weights, list of tensors.
      vs: List of tensors to multiply, for each weight tensor.
      grads: Optional precomputed gradients of ys w.r.t. xs. When None they
        are computed here with ``tf.gradients``.

    Returns:
      Hv: Hessian vector product, same size, same shape as xs.

    Raises:
      ValueError: If xs is a list and vs does not have the same length.
    """
    # Validate the input. isinstance (rather than an exact type comparison)
    # also covers list subclasses.
    if isinstance(xs, list) and len(vs) != len(xs):
        raise ValueError("xs and vs must have the same length.")

    if grads is None:
        grads = tf.gradients(ys, xs, gate_gradients=True)
    # Second backward pass, with vs supplied as the incoming gradients.
    return tf.gradients(grads, xs, vs, gate_gradients=True)
def fisher_vec_fw(ys, xs, vs):
    """Implements Fisher vector product using backward and forward AD.

    Args:
      ys: Loss function or output variables.
      xs: Weights, list of tensors.
      vs: List of tensors to multiply, for each weight tensor.

    Returns:
      J'Jv: Fisher vector product.

    Raises:
      ValueError: If xs is a list and vs does not have the same length.
    """
    # Validate the input. isinstance (rather than an exact type comparison)
    # also covers list subclasses.
    if isinstance(xs, list) and len(vs) != len(xs):
        raise ValueError("xs and vs must have the same length.")

    # Forward pass gives Jv; feeding it back as grad_ys gives J'Jv.
    jv = forward_gradients(ys, xs, vs, gate_gradients=True)
    jjv = tf.gradients(ys, xs, jv, gate_gradients=True)
    return jjv
def gauss_newton_vec(ys, zs, xs, vs):
    """Implements Gauss-Newton vector product.

    Args:
      ys: Loss function.
      zs: Before output layer (input to softmax).
      xs: Weights, list of tensors.
      vs: List of perturbation vector for each weight tensor.

    Returns:
      A pair ``(J'HJv, HJv)``: the Gauss-Newton vector product and the
      intermediate product that was fed back as ``grad_ys`` to obtain it.

    Raises:
      ValueError: If xs is a list and vs does not have the same length.
    """
    # Validate the input. isinstance (rather than an exact type comparison)
    # also covers list subclasses.
    if isinstance(xs, list) and len(vs) != len(xs):
        raise ValueError("xs and vs must have the same length.")

    grads_z = tf.gradients(ys, zs, gate_gradients=True)
    hjv = forward_gradients(grads_z, xs, vs, gate_gradients=True)
    jhjv = tf.gradients(zs, xs, hjv, gate_gradients=True)
    return jhjv, hjv
def fisher_vec_z(ys, xs, vs):
    """Implements JJ'v, where v is on the output space.

    Args:
      ys: Loss function or output variables.
      xs: Weights, list of tensors.
      vs: List of tensors to multiply, one for each tensor in ys (they are
        passed to ``tf.gradients`` as ``grad_ys``).

    Returns:
      JJ'v: Fisher vector product on the output space.

    Raises:
      ValueError: If ys is a list and vs does not have the same length.
    """
    # Validate the input. isinstance (rather than an exact type comparison)
    # also covers list subclasses.
    if isinstance(ys, list) and len(vs) != len(ys):
        raise ValueError("ys and vs must have the same length.")

    # Backward pass gives J'v; the forward pass then maps it to JJ'v.
    jv = tf.gradients(ys, xs, vs, gate_gradients=True)
    jjv = forward_gradients(ys, xs, jv, gate_gradients=True)
    return jjv
def gauss_newton_vec_z(ys, zs, xs, vs):
    """Implements HJJ'v, where v is on the output space.

    Args:
      ys: Loss function or output variables.
      zs: Before output layer (input to softmax).
      xs: Weights, list of tensors.
      vs: List of tensors to multiply, one for each tensor in zs (they are
        passed to ``tf.gradients`` as ``grad_ys`` for zs).

    Returns:
      HJJ'v: Gauss-Newton vector product on the output space.

    Raises:
      ValueError: If zs is a list and vs does not have the same length.
    """
    # Validate the input. isinstance (rather than an exact type comparison)
    # also covers list subclasses.
    if isinstance(zs, list) and len(vs) != len(zs):
        raise ValueError("zs and vs must have the same length.")

    grads_z = tf.gradients(ys, zs, gate_gradients=True)
    jv = tf.gradients(zs, xs, vs, gate_gradients=True)
    hjjv = forward_gradients(grads_z, xs, jv, gate_gradients=True)
    return hjjv
def test_hessian_quadratic(self):
    """Forward- and backward-mode Hessian vector products agree on a quadratic.

    The loss is ``sum((x @ (w2 + r * v2)) ** 2)`` with ``r`` a variable
    initialised to 0. The two products are compared with a relative tolerance
    of 1e-5.
    """
    rng = np.random.RandomState(0)
    dtype = tf.float64

    def _random_constant(shape, name):
        # Uniform samples in [-1, 1), drawn in call order from the seeded generator.
        return tf.constant(rng.uniform(-1.0, 1.0, shape), dtype=dtype, name=name)

    with tf.Graph().as_default():
        r = tf.Variable(0.0, dtype=dtype)
        x = _random_constant([2, 27], "x")
        w2 = _random_constant([27, 1], "w2")
        v2 = _random_constant([27, 1], "v2")
        w2v = tf.add(w2, tf.multiply(r, v2))
        h2 = tf.matmul(x, w2v)
        y2 = tf.reduce_sum(h2 * h2)

        grad_w = tf.gradients(y2, w2)
        hv_fw = hessian_vec_fw(y2, [w2v], [v2])
        hv_bk = hessian_vec_bk(y2, [w2], [v2])

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            # The gradient is evaluated but its value is not checked.
            grad_w = sess.run(grad_w)
            forward_value = sess.run(hv_fw)
            backward_value = sess.run(hv_bk)
            np.testing.assert_allclose(forward_value, backward_value, rtol=1e-5)
def main():
    """Render one feature channel of the network loaded from 'data/vgg_face.mat'.

    Builds the graph on a mean-subtracted input placeholder, starts from a
    random 224x224x3 image and passes channel 140 of layer 'conv5_3' to
    ``render_naive``; the result is handed to ``showarray``.
    """
    session = tf.Session()
    input_ph = tf.placeholder(np.float32, name='input')  # define the input tensor
    mean_pixel = 117.0
    centered = tf.expand_dims(input_ph - mean_pixel, 0)

    # Build the inference graph
    net = tmp.vggface16.load('data/vgg_face.mat', centered)

    start_image = np.random.uniform(size=(224, 224, 3)) + 117.0

    # Picking some internal layer. Note that we use outputs before applying the ReLU nonlinearity
    # to have non-zero gradients for features with negative initial activations.
    layer_name = 'conv5_3'
    channel_index = 140  # picking some feature channel to visualize
    objective = net[layer_name][:, :, :, channel_index]

    rendered = render_naive(session, input_ph, objective, start_image)
    showarray(rendered)
def grad_supervised(self, prob, labels):
    """
    Build the mean cross-entropy loss and its gradients w.r.t. all trainable
    variables.

    return:
        loss = 1 / M * sum_i_{1..M} cross_entropy_loss(groundtruth, a_T)
        grads = grad(loss, params), one entry per variable returned by
                tf.trainable_variables(); variables for which tf.gradients
                returns None get a zero tensor of the variable's shape
    inputs:
        prob   = passed as the ``logits`` argument of
                 sparse_softmax_cross_entropy_with_logits
        labels = (n_batch,), cast to int64 here
        [tensor variable]
    """
    labels = tf.cast(labels, tf.int64)
    # Keyword arguments: TensorFlow >= 1.0 rejects positional use of this op,
    # while the older (logits, labels) signature accepts the same keywords.
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=prob, labels=labels, name='cross_entropy_per_example')
    loss = tf.reduce_mean(cross_entropy, name='cross_entropy')

    tvars = tf.trainable_variables()
    # `is None` is an identity test and does not depend on how tensors
    # implement ==; zip replaces the Python-2-only xrange index loop.
    grads = [
        tf.zeros(shape=var.get_shape()) if grad is None else grad
        for grad, var in zip(tf.gradients(loss, tvars), tvars)
    ]
    return loss, grads
def __init__(self, model):
    '''
    Build a tensor holding the gradient of the model output w.r.t. the
    embedding output, summed over axis 2.

    :param model: Keras model.
    This code makes a bunch of assumptions about the model:
    - Model has single input
    - Embedding is the first layer
    - Model output is a scalar (logistic regression)

    Stores ``model``, ``input_tensor`` and ``grad_sum_tensor`` on the instance.
    '''
    self.model = model
    self.input_tensor = model.input

    # Re-apply the layers one by one so the embedding output is available
    # as a tensor to differentiate against.
    embedding_tensor = model.layers[0](self.input_tensor)
    output_tensor = embedding_tensor
    for later_layer in model.layers[1:]:
        output_tensor = later_layer(output_tensor)

    [grad_tensor] = tf.gradients(output_tensor, [embedding_tensor])
    # NOTE(review): reduction_indices is the legacy spelling of `axis` in
    # tf.reduce_sum -- confirm which TensorFlow version this targets.
    self.grad_sum_tensor = tf.reduce_sum(grad_tensor, reduction_indices=2)
def _add_train_op(self):
    """Sets self._train_op, op to run for training.

    Also sets ``self._lr_rate`` (exponentially decayed learning rate, bounded
    below by ``hps.min_lr``) and adds scalar summaries for the global
    gradient norm and the learning rate.
    """
    hps = self._hps
    floor_lr = hps.min_lr  # min_lr_rate.
    decayed_lr = tf.train.exponential_decay(hps.lr, self.global_step, 30000, 0.98)
    self._lr_rate = tf.maximum(floor_lr, decayed_lr)

    trainable = tf.trainable_variables()
    # Gradients are computed and clipped on the device chosen by _get_gpu.
    with tf.device(self._get_gpu(self._num_gpus-1)):
        raw_grads = tf.gradients(self._loss, trainable)
        grads, global_norm = tf.clip_by_global_norm(raw_grads, hps.max_grad_norm)
    tf.summary.scalar('global_norm', global_norm)

    optimizer = tf.train.GradientDescentOptimizer(self._lr_rate)
    tf.summary.scalar('learning rate', self._lr_rate)
    grads_and_vars = zip(grads, trainable)
    self._train_op = optimizer.apply_gradients(
        grads_and_vars, global_step=self.global_step, name='train_step')
def init_ops_for_training(self, critic):
    """Create ``self.train_op``, the gradient-descent update for the actor.

    The actor's gradients are those of its output action w.r.t. its trainable
    variables, seeded with the negated gradients supplied by
    ``critic.q_gradients_wrt_actions()``.
    """
    # actors gradients are the gradients for it's output w.r.t it's vars using initial
    # gradients provided by critic. this requires that critic was init'd with an
    # input_action = actor.output_action (which is natural anyway)
    # we wrap the optimiser in namespace since we don't want this as part of copy to
    # target networks.
    # note that we negate the gradients from critic since we are trying to maximise
    # the q values (not minimise like a loss)
    with tf.variable_scope("optimiser"):
        action = self.output_action
        model_vars = self.trainable_model_vars()
        # NOTE(review): tf.neg was renamed tf.negative in TensorFlow 1.0 --
        # confirm which TensorFlow version this file targets.
        seed_grads = tf.neg(critic.q_gradients_wrt_actions())
        actor_grads = tf.gradients(action, model_vars, seed_grads)
        grads_and_vars = zip(actor_grads, self.trainable_model_vars())
        # potentially clip and wrap with debugging
        grads_and_vars = util.clip_and_debug_gradients(grads_and_vars, opts)
        # apply
        optimiser = tf.train.GradientDescentOptimizer(opts.actor_learning_rate)
        self.train_op = optimiser.apply_gradients(grads_and_vars)