def grad(self, inputs, g):
# g[1:] is all integers, so their Jacobian in this op
# is 0. We thus don't need to worry about what their values
# are.
# If g[0] is disconnected, then this op doesn't contribute
# any gradient anywhere. But we know that at least one of
# g[1:] is connected, or this grad method wouldn't have been
# called, so we should report zeros.
(csm,) = inputs
if isinstance(g[0].type, DisconnectedType):
return [csm.zeros_like()]
data, indices, indptr, shape = csm_properties(csm)
return [CSM(csm.format)(g[0], indices, indptr, shape)]
# don't make this a function or it breaks some optimizations below
def perform(self, node, inputs, outputs):
(a_indices, a_indptr, b, g_ab) = inputs
(out,) = outputs
g_a_data = numpy.zeros(a_indices.shape, dtype=g_ab.dtype)
for i in xrange(len(a_indptr) - 1): # loop over rows
ind0 = a_indptr[i]
ind1 = a_indptr[i + 1]
# loop over values in that row (columns)
for j_idx in xrange(ind0, ind1):
j = a_indices[j_idx]
# grad is dot product of i-th row of gradient with j-th row of b
# Depending on the type of g_ab and b (sparse or dense),
# the following dot product can result in a scalar or
# a (1, 1) sparse matrix.
dot_val = numpy.dot(g_ab[i], b[j].T)
if isinstance(dot_val, scipy.sparse.spmatrix):
dot_val = dot_val[0, 0]
g_a_data[j_idx] = dot_val
out[0] = g_a_data
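
# A minimal NumPy/SciPy sketch of the row loop in perform() above: for every
# stored element (i, j) of the sparse operand, the gradient of its data value is
# the dot product of row i of the output gradient with row j of b. The names
# a_csr, b and g_ab are illustrative stand-ins, not part of the op's API.
import numpy
import scipy.sparse

a_csr = scipy.sparse.random(4, 5, density=0.4, format='csr', random_state=0)
b = numpy.random.RandomState(1).randn(5, 3)
g_ab = numpy.random.RandomState(2).randn(4, 3)   # upstream gradient on a_csr.dot(b)

g_a_data = numpy.zeros_like(a_csr.data)
for i in range(a_csr.shape[0]):                          # loop over rows
    for j_idx in range(a_csr.indptr[i], a_csr.indptr[i + 1]):
        j = a_csr.indices[j_idx]                         # column of the stored element
        g_a_data[j_idx] = numpy.dot(g_ab[i], b[j])

# Cross-check against the dense gradient of sum(g_ab * A.dot(b)) w.r.t. A,
# restricted to A's sparsity pattern.
dense_grad = g_ab.dot(b.T)
expected = numpy.array([dense_grad[i, a_csr.indices[k]]
                        for i in range(a_csr.shape[0])
                        for k in range(a_csr.indptr[i], a_csr.indptr[i + 1])])
assert numpy.allclose(g_a_data, expected)
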
def grad(self, inputs, g_outputs):
r"""The gradient function should return
.. math:: V\frac{\partial X^{-1}}{\partial X},
where :math:`V` corresponds to ``g_outputs`` and :math:`X` to
``inputs``. Using the `matrix cookbook
<http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=3274>`_,
one can deduce that the relation corresponds to
.. math:: -(X^{-1} \cdot V^{T} \cdot X^{-1})^T.
"""
x, = inputs
xi = self(x)
gz, = g_outputs
# TT.dot(gz.T,xi)
return [-matrix_dot(xi, gz.T, xi).T]
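
# A NumPy-only finite-difference check of the identity used above (a hedged
# sketch, independent of Theano): for cost = sum(V * inv(X)), the gradient
# w.r.t. X is -(inv(X).dot(V.T).dot(inv(X))).T, i.e. -matrix_dot(xi, gz.T, xi).T
# with xi = inv(X) and gz = V. X and V are illustrative names.
import numpy

rng = numpy.random.RandomState(0)
X = rng.randn(4, 4) + 4 * numpy.eye(4)     # well-conditioned test matrix
V = rng.randn(4, 4)                        # plays the role of g_outputs

xi = numpy.linalg.inv(X)
analytic = -(xi.dot(V.T).dot(xi)).T

numeric = numpy.zeros_like(X)
eps = 1e-6
for i in range(4):
    for j in range(4):
        Xp = X.copy(); Xp[i, j] += eps
        Xm = X.copy(); Xm[i, j] -= eps
        numeric[i, j] = (numpy.sum(V * numpy.linalg.inv(Xp)) -
                         numpy.sum(V * numpy.linalg.inv(Xm))) / (2 * eps)
assert numpy.allclose(analytic, numeric, atol=1e-5)
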
def grad(self, inp, cost_grad):
"""
Notes
-----
The gradient is currently implemented for matrices only.
"""
a, val = inp
grad = cost_grad[0]
if (a.dtype.startswith('complex')):
return [None, None]
elif a.ndim > 2:
raise NotImplementedError('%s: gradient is currently implemented'
' for matrices only' %
self.__class__.__name__)
wr_a = fill_diagonal(grad, 0) # valid for any number of dimensions
# diag is only valid for matrices
wr_val = theano.tensor.nlinalg.diag(grad).sum()
return [wr_a, wr_val]
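
# A hedged NumPy illustration of the two gradients returned above, using
# numpy.fill_diagonal as a stand-in for the op: for out = fill_diagonal(a, val)
# and cost = sum(G * out), the gradient w.r.t. a is G with its diagonal zeroed
# (the diagonal of out no longer depends on a) and the gradient w.r.t. val is
# the sum of G's diagonal. G, a and val are illustrative names.
import numpy

rng = numpy.random.RandomState(0)
a, G = rng.randn(3, 3), rng.randn(3, 3)
val = 0.5

def fill_diag(a, val):
    out = a.copy()
    numpy.fill_diagonal(out, val)
    return out

eps = 1e-6
num_dval = (numpy.sum(G * fill_diag(a, val + eps)) -
            numpy.sum(G * fill_diag(a, val - eps))) / (2 * eps)
assert numpy.allclose(num_dval, numpy.trace(G))         # wr_val above

E01 = numpy.zeros((3, 3)); E01[0, 1] = 1.0              # perturb one off-diagonal entry
num_da01 = (numpy.sum(G * fill_diag(a + eps * E01, val)) -
            numpy.sum(G * fill_diag(a - eps * E01, val))) / (2 * eps)
assert numpy.allclose(num_da01, G[0, 1])                # wr_a passes G through off-diagonal
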
def binary_crossentropy(output, target):
"""
Compute the crossentropy of binary random variables.
Output and target are each expectations of binary random
variables; target may be exactly 0 or 1 but output must
lie strictly between 0 and 1.
Notes
-----
We could use the x log y op to support output=0 and output=1.
The gradient would still be undefined though.
We do not sum; crossentropy is computed componentwise.
TODO : Rewrite as a scalar, and then broadcast to tensor.
"""
return -(target * tensor.log(output) + (1.0 - target) * tensor.log(1.0 - output))
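
# A plain NumPy mirror of the expression above together with its gradient
# w.r.t. output, -(target / output - (1 - target) / (1 - output)), checked
# against finite differences. This is a hedged sketch, not part of the API.
import numpy

def binary_crossentropy_np(output, target):
    # Componentwise crossentropy; no sum is taken, matching the note above.
    return -(target * numpy.log(output) +
             (1.0 - target) * numpy.log(1.0 - output))

output = numpy.array([0.1, 0.5, 0.9])
target = numpy.array([0.0, 1.0, 1.0])

grad_output = -(target / output - (1.0 - target) / (1.0 - output))

eps = 1e-6
numeric = (binary_crossentropy_np(output + eps, target) -
           binary_crossentropy_np(output - eps, target)) / (2 * eps)
assert numpy.allclose(grad_output, numeric, atol=1e-4)
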
def dnn_gradweight(img, topgrad,
kerns_shp,
border_mode='valid', subsample=(1, 1),
conv_mode='conv'):
"""
GPU convolution gradient with respect to weight using cuDNN from NVIDIA.
The memory layout to use is 'bc01', that is 'batch', 'channel',
'first dim', 'second dim' in that order.
FIXME parameters doc
:warning: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not
work with this Op.
"""
img = gpu_contiguous(img)
topgrad = gpu_contiguous(topgrad)
kerns_shp = theano.tensor.as_tensor_variable(kerns_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns_shp)
out = gpu_alloc_empty(*kerns_shp)
return GpuDnnConvGradW()(img, topgrad, out, desc)
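
# The cuDNN op above needs a GPU, so here is a hedged, CPU-only sketch of the
# identity it implements, for a single channel, 'valid' border mode, unit
# subsample and correlation (no kernel flipping): the gradient of a valid 2-D
# correlation w.r.t. its kernel is a valid correlation of the input with the
# output gradient. scipy.signal is used as a stand-in; img, kerns and topgrad
# are illustrative names.
import numpy
from scipy.signal import correlate2d

rng = numpy.random.RandomState(0)
img = rng.randn(6, 7)
kerns = rng.randn(3, 3)
topgrad = rng.randn(6 - 3 + 1, 7 - 3 + 1)   # gradient on the 'valid' output

# Gradient of sum(topgrad * correlate2d(img, kerns, 'valid')) w.r.t. kerns:
analytic = correlate2d(img, topgrad, mode='valid')

eps = 1e-6
numeric = numpy.zeros_like(kerns)
for u in range(3):
    for v in range(3):
        kp = kerns.copy(); kp[u, v] += eps
        km = kerns.copy(); km[u, v] -= eps
        numeric[u, v] = (numpy.sum(topgrad * correlate2d(img, kp, 'valid')) -
                         numpy.sum(topgrad * correlate2d(img, km, 'valid'))) / (2 * eps)
assert numpy.allclose(analytic, numeric, atol=1e-4)
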
def dnn_gradweight3d(img, topgrad,
kerns_shp,
border_mode='valid', subsample=(1, 1, 1),
conv_mode='conv'):
"""
GPU convolution gradient with respect to weight using cuDNN from NVIDIA.
The memory layout to use is 'bct01', that is 'batch', 'channel',
'first dim', 'second dim', 'third dim' in that order.
FIXME parameters doc
:warning: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not
work with this Op.
"""
img = gpu_contiguous(img)
topgrad = gpu_contiguous(topgrad)
kerns_shp = theano.tensor.as_tensor_variable(kerns_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns_shp)
out = gpu_alloc_empty(*kerns_shp)
return GpuDnnConv3dGradW()(img, topgrad, out, desc)
def dnn_gradinput(kerns, topgrad,
img_shp,
border_mode='valid', subsample=(1, 1),
conv_mode='conv'):
"""
GPU convolution gradient with respect to input using cuDNN from NVIDIA.
The memory layout to use is 'bc01', that is 'batch', 'channel',
'first dim', 'second dim' in that order.
FIXME parameters doc
:warning: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not
work with this Op.
"""
kerns = gpu_contiguous(kerns)
topgrad = gpu_contiguous(topgrad)
img_shp = theano.tensor.as_tensor_variable(img_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img_shp, kerns.shape)
out = gpu_alloc_empty(*img_shp)
return GpuDnnConvGradI()(kerns, topgrad, out, desc)
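
# The complementary CPU-only sketch for the input gradient, under the same
# assumptions as the dnn_gradweight sketch above (single channel, 'valid' mode,
# unit strides, correlation): the gradient of a valid 2-D correlation w.r.t. the
# image is the full convolution of the output gradient with the kernel.
import numpy
from scipy.signal import correlate2d, convolve2d

rng = numpy.random.RandomState(1)
img = rng.randn(5, 6)
kerns = rng.randn(3, 3)
topgrad = rng.randn(5 - 3 + 1, 6 - 3 + 1)   # gradient on the 'valid' output

# Gradient of sum(topgrad * correlate2d(img, kerns, 'valid')) w.r.t. img:
analytic = convolve2d(topgrad, kerns, mode='full')

eps = 1e-6
numeric = numpy.zeros_like(img)
for p in range(img.shape[0]):
    for q in range(img.shape[1]):
        ip = img.copy(); ip[p, q] += eps
        im = img.copy(); im[p, q] -= eps
        numeric[p, q] = (numpy.sum(topgrad * correlate2d(ip, kerns, 'valid')) -
                         numpy.sum(topgrad * correlate2d(im, kerns, 'valid'))) / (2 * eps)
assert numpy.allclose(analytic, numeric, atol=1e-4)
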
def dnn_gradinput3d(kerns, topgrad,
img_shp,
border_mode='valid', subsample=(1, 1, 1),
conv_mode='conv'):
"""
GPU convolution gradient with respect to input using cuDNN from NVIDIA.
The memory layout to use is 'bct01', that is 'batch', 'channel',
'first dim', 'second dim', 'third dim' in that order.
FIXME parameters doc
:warning: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not
work with this Op.
"""
kerns = gpu_contiguous(kerns)
topgrad = gpu_contiguous(topgrad)
img_shp = theano.tensor.as_tensor_variable(img_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img_shp, kerns.shape)
out = gpu_alloc_empty(*img_shp)
return GpuDnnConv3dGradI()(kerns, topgrad, out, desc)
def grad(self, inputs, gout):
(cond, ift, iff) = inputs
(gz,) = gout
first_part = switch(cond, gz, 0.)
second_part = switch(cond, 0., gz)
out = self(cond, ift, iff)
if out.type.dtype in discrete_types:
first_part = 0.
second_part = 0.
# cond does affect the elements of the output so it is connected.
# For the sake of making the gradient convenient we assume that
# condition + epsilon always triggers the same branch as condition
condition_grad = cond.zeros_like().astype(theano.config.floatX)
return (condition_grad, first_part, second_part)
def grad(self, inputs, gout):
(x, y) = inputs
(gz,) = gout
if gz.type in complex_types:
# max is currently defined for complex_types,
# but the gradient for complex is not.
raise NotImplementedError()
output = self(x, y)
if output.type in discrete_types:
return [x.zeros_like().astype(theano.config.floatX),
y.zeros_like().astype(theano.config.floatX)]
gx = eq(output, x) * gz
gy = eq(output, y) * gz
return (gx, gy)
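
# A small NumPy sketch of the routing done by eq(output, x) * gz above: the
# upstream gradient flows to whichever input attains the elementwise maximum,
# and to both inputs on ties. The arrays are illustrative.
import numpy

x = numpy.array([1.0, 5.0, 3.0, 2.0])
y = numpy.array([4.0, 2.0, 3.0, 0.0])
gz = numpy.array([10.0, 20.0, 30.0, 40.0])

output = numpy.maximum(x, y)
gx = (output == x) * gz      # gradient flows where x attains the maximum
gy = (output == y) * gz      # gradient flows where y attains the maximum

print(gx)   # [ 0. 20. 30. 40.]
print(gy)   # [10.  0. 30.  0.]   (the tie at index 2 sends gz to both inputs)
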
def grad(self, inputs, gout):
(x, y) = inputs
(gz,) = gout
if x.type in complex_types:
raise NotImplementedError()
# If the output of this op is discrete, then it
# is locally flat everywhere, so the gradient
# through it is 0.
# This is different from it not being connected
# to the output; x/y is still a function of x
# and y; it's just a step function.
if all(a.dtype in discrete_types for a in (x, y)):
return [x.zeros_like(), y.zeros_like()]
first_part = gz / y
if y.type in complex_types:
raise NotImplementedError()
second_part = -(gz * x) / (y * y)
return first_part, second_part
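
# The two expressions above are the quotient rule, d(x/y)/dx = 1/y and
# d(x/y)/dy = -x/y**2. A hedged NumPy finite-difference check:
import numpy

rng = numpy.random.RandomState(0)
x, y, gz = rng.randn(5), rng.randn(5) + 3.0, rng.randn(5)

first_part = gz / y                  # gz * d(x / y)/dx
second_part = -(gz * x) / (y * y)    # gz * d(x / y)/dy

eps = 1e-6
cost = lambda x_, y_: numpy.sum(gz * (x_ / y_))
num_dx = (cost(x + eps, y) - cost(x - eps, y)) / (2 * eps)
num_dy = (cost(x, y + eps) - cost(x, y - eps)) / (2 * eps)
assert numpy.allclose(num_dx, first_part.sum(), atol=1e-4)
assert numpy.allclose(num_dy, second_part.sum(), atol=1e-4)
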
def grad(self, inputs, gout):
(y, x) = inputs
(gz,) = gout
if gz.type in complex_types:
raise NotImplementedError()
else:
if self(x, y).type in discrete_types:
if x.type in discrete_types:
gx = x.zeros_like(dtype=theano.config.floatX)
else:
gx = x.zeros_like()
if y.type in discrete_types:
gy = y.zeros_like(dtype=theano.config.floatX)
else:
gy = y.zeros_like()
return [gx, gy]
# If the output is float, the gradient should flow,
# even if the inputs are ints
return [gz * x / (sqr(x) + sqr(y)),
gz * neg(y) / (sqr(x) + sqr(y))]
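
# A NumPy finite-difference check of the partial derivatives used above. Note
# the inputs are (y, x), so the first gradient is w.r.t. y:
# d/dy arctan2(y, x) = x / (x**2 + y**2) and d/dx arctan2(y, x) = -y / (x**2 + y**2).
import numpy

y, x = 0.7, -1.3
eps = 1e-6

dy_analytic = x / (x**2 + y**2)
dx_analytic = -y / (x**2 + y**2)

dy_numeric = (numpy.arctan2(y + eps, x) - numpy.arctan2(y - eps, x)) / (2 * eps)
dx_numeric = (numpy.arctan2(y, x + eps) - numpy.arctan2(y, x - eps)) / (2 * eps)

assert numpy.allclose(dy_analytic, dy_numeric, atol=1e-6)
assert numpy.allclose(dx_analytic, dx_numeric, atol=1e-6)
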
def grad_not_implemented(op, x_pos, x, comment=""):
"""
Return an un-computable symbolic variable of type `x.type`.
If any call to tensor.grad results in an expression containing this
un-computable variable, an exception (NotImplementedError) will be
raised indicating that the gradient on the
`x_pos`'th input of `op` has not been implemented. Likewise if
any call to theano.function involves this variable.
Optionally adds a comment to the exception explaining why this
gradient is not implemented.
"""
return (NullType((
"This variable is Null because the grad method for "
"input %s (%s) of the %s op is not implemented. %s"
) % (x_pos, x, op, comment)))()
def grad_undefined(op, x_pos, x, comment=""):
"""
Return an un-computable symbolic variable of type `x.type`.
If any call to tensor.grad results in an expression containing this
un-computable variable, an exception (GradUndefinedError) will be
raised indicating that the gradient on the
`x_pos`'th input of `op` is mathematically undefined. Likewise if
any call to theano.function involves this variable.
Optionally adds a comment to the exception explaining why this
gradient is not defined.
"""
return (NullType(
(
"This variable is Null because the grad method for "
"input %s (%s) of the %s op is mathematically undefined. %s"
) % (x_pos, x, op, comment)))()
def abs_rel_errors(self, g_pt):
"""Return the abs and rel error of gradient estimate `g_pt`
`g_pt` must be a list of ndarrays of the same length as self.gf,
otherwise a ValueError is raised.
Corresponding ndarrays in `g_pt` and `self.gf` must have the same
shape or ValueError is raised.
"""
if len(g_pt) != len(self.gf):
raise ValueError('argument has wrong number of elements',
len(g_pt))
errs = []
for i, (a, b) in enumerate(zip(g_pt, self.gf)):
if a.shape != b.shape:
raise ValueError('argument element %i has wrong shape %s' % (
i, str((a.shape, b.shape))))
errs.append(numeric_grad.abs_rel_err(a, b))
return errs
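
# A hedged sketch of the per-element error measure this relies on: roughly,
# numeric_grad.abs_rel_err computes abs_err = |a - b| and
# rel_err = abs_err / (|a| + |b|), with 0/0 treated as zero error. The exact
# epsilon handling in Theano may differ; this reimplementation is approximate.
import numpy

def abs_rel_err(a, b):
    abs_err = numpy.abs(a - b)
    denom = numpy.abs(a) + numpy.abs(b)
    rel_err = numpy.where(denom == 0, 0.0,
                          abs_err / numpy.where(denom == 0, 1.0, denom))
    return abs_err, rel_err

a = numpy.array([1.0, 0.0, 2.0])
b = numpy.array([1.1, 0.0, 1.8])
print(abs_rel_err(a, b))   # abs errors [0.1, 0.0, 0.2]; rel errors ~[0.048, 0.0, 0.053]
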
def zero_grad(x):
"""
Consider an expression constant when computing gradients.
The expression itself is unaffected, but when its gradient is
computed, or the gradient of another expression that this
expression is a subexpression of, it will be backpropagated
through with a value of zero. In other words, the gradient of
the expression is truncated to 0.
:param x: A Theano expression whose gradient should be truncated.
:return: The expression is returned unmodified, but its gradient
is now truncated to 0.
"""
return zero_grad_(x)
def disconnected_grad(x):
"""
Consider an expression constant when computing gradients,
while effectively not backpropagating through it.
The expression itself is unaffected, but when its gradient is
computed, or the gradient of another expression that this
expression is a subexpression of, it will not be backpropagated
through. This is effectively equivalent to truncating the gradient
expression to 0, but is executed faster than zero_grad(), which still
has to go through the underlying computational graph related to the
expression.
:param x: A Theano expression whose gradient should not be
backpropagated through.
:return: The expression is returned unmodified, but its gradient
is now effectively truncated to 0.
"""
return disconnected_grad_(x)
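
# A hedged usage sketch (requires Theano): both helpers make the gradient
# through x effectively zero, but disconnected_grad marks x as disconnected,
# so theano.grad needs disconnected_inputs='ignore' (or 'warn') to avoid
# raising. The behaviour described in the comments is how theano.grad is
# documented to work; treat it as an assumption here.
import numpy
import theano
import theano.tensor as T
from theano.gradient import zero_grad, disconnected_grad

x = T.vector('x')

# zero_grad: x stays connected, but its gradient is exactly zero.
g_zero = theano.grad((zero_grad(x) ** 2).sum(), x)
print(theano.function([x], g_zero)(numpy.ones(3, dtype=theano.config.floatX)))

# disconnected_grad: x is treated as disconnected, so tell theano.grad not to
# raise on disconnected inputs; the result is again a zero gradient.
g_disc = theano.grad((disconnected_grad(x) ** 2).sum(), x,
                     disconnected_inputs='ignore')
print(theano.function([x], g_disc)(numpy.ones(3, dtype=theano.config.floatX)))
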
def grad(self, inputs, ograds):
ref, values, ref_dim, val_dim = inputs[:4]
hash_struct = inputs[4:]
ograd = ograds[0]
ref_dim = get_scalar_constant_value(ref_dim)
val_dim = get_scalar_constant_value(val_dim)
def _conv(x):
return GaussianFilter()(ref, x, ref_dim, val_dim, *hash_struct)
# Since the kernels are separable and symmetric, the gradient w.r.t.
# input is just the same filtering applied to the output grads.
grad_i = _conv(ograd)
def _gradr(r_i, vals, og, *args):
return (og * (_conv(vals*r_i) - r_i*_conv(vals)) +
vals * (_conv(og*r_i) - r_i*_conv(og)))
grad_r, _ = theano.scan(fn=_gradr, sequences=[ref],
non_sequences=[values, ograd] + hash_struct,
outputs_info=None)
grad_r = grad_r.sum(axis=1, acc_dtype="float32")
grads = [DisconnectedType()() for i in range(len(inputs))]
grads[0] = grad_r
grads[1] = grad_i
return grads
def grad(self, inputs, ograds):
ref, values, ref_dim, val_dim = inputs[:4]
hash_struct = inputs[4:]
ograd = ograds[0]
ref_dim = get_scalar_constant_value(ref_dim)
val_dim = get_scalar_constant_value(val_dim)
def _conv(x):
return GpuGaussianFilter()(ref, x, ref_dim, val_dim, *hash_struct)
# Since the kernels are separable and symmetric, the gradient w.r.t.
# input is just the same filtering applied to the output grads.
grad_i = _conv(ograd)
def _gradr(r_i, vals, og, *args):
return (og * (_conv(vals*r_i) - r_i*_conv(vals)) +
vals * (_conv(og*r_i) - r_i*_conv(og)))
grad_r, _ = theano.scan(fn=_gradr, sequences=[ref],
non_sequences=[values, ograd] + hash_struct,
outputs_info=None)
grad_r = grad_r.sum(axis=1, acc_dtype="float32")
grads = [DisconnectedType()() for i in range(len(inputs))]
grads[0] = grad_r
grads[1] = grad_i
return grads
def grad(self, inp, grads):
img, ws, stride, pad = inp
grad, = grads
grad = gpu_contiguous(grad)
out = self(img, ws, stride, pad)
g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)
return g_out, theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()()
def L_op(self, inputs, outputs, output_grads):
desc, w, x, hx = inputs[:4]
cx = inputs[4] if len(inputs) == 5 else None
reserve, y, hy = outputs[:3]
_, dy, dhy = output_grads[:3]
dcy = output_grads[3] if len(output_grads) == 4 else None
# Since the op returns two outputs which contain essentially
# the same information, the user will most likely only use one
# of them. This leads to the situation that the other is
# considered "disconnected" by theano in the gradient.
# However we know that this isn't really the case so we fix it
# here.
# If all the ys are disconnected, then you get a boring
# gradient instead of an error. But in that case you
# shouldn't call this method anyway.
if isinstance(dy.type, DisconnectedType):
dy = as_gpuarray_variable(y.zeros_like(),
context_name=y.type.context_name)
if isinstance(dhy.type, DisconnectedType):
dhy = None
if dcy and isinstance(dcy.type, DisconnectedType):
dcy = None
dinputs = GpuDnnRNNGradInputs(rnn_mode=self.rnn_mode,
grad_h=(dhy is not None),
grad_c=(dcy is not None))(
desc, x, y, dy, dhy, dcy, w, hx, cx, reserve, return_list=True)
reserve2, dx, dhx = dinputs[:3]
dw = GpuDnnRNNGradWeights()(
desc, x, hx, y, reserve2, w)
res = [DisconnectedType()(), dw, dx, dhx]
if cx is not None:
res.append(dinputs[3]) # dcx
return res
def grad(self, inputs, output_grads):
gout, = output_grads
s = inputs[1]
gf = curfft_op(gout, s)
# Multiply the last dimension of the gradient by 2: those bins represent
# both positive and negative frequencies, except the first
# and last elements (for even transforms), which are unique.
idx = [slice(None)] * (gf.ndim - 2) \
+ [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(None)]
gf = T.set_subtensor(gf[idx], gf[idx] * 2)
return [gf, DisconnectedType()()]
def test_Rop_dot_bug_18Oct2013_Jeremiah(self):
# This test refers to a bug reported by Jeremiah Lowin on 18th Oct
# 2013. The bug occurs when, through a dot operation, there is only
# one differentiable path (i.e. there is no gradient wrt one of
# the inputs).
x = tensor.arange(20.0).reshape([1, 20])
v = theano.shared(numpy.ones([20]))
d = tensor.dot(x, v).sum()
tensor.Rop(tensor.grad(d, v), v, v)
def grad(self, inputs, g_outputs):
x, ind1, ind2 = inputs
gout, = g_outputs
return [get_item_2lists_grad(x, ind1, ind2, gout),
grad_undefined(self, 1, ind1, "No gradient for this input"),
grad_undefined(self, 2, ind2, "No gradient for this input")]
def dot(x, y):
"""
Operation for efficiently calculating the dot product when
one or both operands are sparse. Supported formats are CSC and CSR.
The output of the operation is dense.
Parameters
----------
x
Sparse or dense matrix variable.
y
Sparse or dense matrix variable.
Returns
-------
The dot product `x`.`y` in a dense format.
Notes
-----
The grad implemented is regular, i.e. not structured.
At least one of `x` or `y` must be a sparse matrix.
When the operation has the form dot(csr_matrix, dense),
the gradient of this operation can be computed inplace
by UsmmCscDense. This leads to significant speed-ups.
"""
if hasattr(x, 'getnnz'):
x = as_sparse_variable(x)
if hasattr(y, 'getnnz'):
y = as_sparse_variable(y)
x_is_sparse_variable = _is_sparse_variable(x)
y_is_sparse_variable = _is_sparse_variable(y)
if not x_is_sparse_variable and not y_is_sparse_variable:
raise TypeError()
return _dot(x, y)
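
# A SciPy-only illustration of the semantics described above (a hedged sketch;
# the symbolic version would use theano.sparse variables instead): one sparse
# operand, one dense operand, and a dense result.
import numpy
import scipy.sparse

x = scipy.sparse.csr_matrix(numpy.array([[1.0, 0.0, 2.0],
                                         [0.0, 0.0, 3.0]]))
y = numpy.arange(6.0).reshape(3, 2)

z = x.dot(y)                 # sparse . dense -> dense
print(numpy.asarray(z))      # [[ 8. 11.]
                             #  [12. 15.]]
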
def grad(self, inputs, g_outputs):
r"""The gradient function should return
.. math:: \sum_n\left(W_n\frac{\partial\,w_n}
{\partial a_{ij}} +
\sum_k V_{nk}\frac{\partial\,v_{nk}}
{\partial a_{ij}}\right),
where [:math:`W`, :math:`V`] corresponds to ``g_outputs``,
:math:`a` to ``inputs``, and :math:`(w, v)=\mbox{eig}(a)`.
Analytic formulae for eigensystem gradients are well-known in
perturbation theory:
.. math:: \frac{\partial\,w_n}
{\partial a_{ij}} = v_{in}\,v_{jn}
.. math:: \frac{\partial\,v_{kn}}
{\partial a_{ij}} =
\sum_{m\ne n}\frac{v_{km}v_{jn}}{w_n-w_m}
"""
x, = inputs
w, v = self(x)
# Replace gradients wrt disconnected variables with
# zeros. This is a work-around for issue #1063.
gw, gv = _zero_disconnected([w, v], g_outputs)
return [EighGrad(self.UPLO)(x, w, v, gw, gv)]
def perform(self, node, inputs, outputs):
"""
Implements the "reverse-mode" gradient for the eigensystem of
a square matrix.
"""
x, w, v, W, V = inputs
N = x.shape[0]
outer = numpy.outer
def G(n):
return sum(v[:, m] * V.T[n].dot(v[:, m]) / (w[n] - w[m])
for m in xrange(N) if m != n)
g = sum(outer(v[:, n], v[:, n] * W[n] + G(n))
for n in xrange(N))
# Numpy's eigh(a, 'L') (eigh(a, 'U')) is a function of tril(a)
# (triu(a)) only. This means that partial derivative of
# eigh(a, 'L') (eigh(a, 'U')) with respect to a[i,j] is zero
# for i < j (i > j). At the same time, non-zero components of
# the gradient must account for the fact that variation of the
# opposite triangle contributes to variation of two elements
# of the Hermitian (symmetric) matrix. The following line
# implements the necessary logic.
out = self.tri0(g) + self.tri1(g).T
# The call to self.tri0 in perform upcasts from float32 to
# float64 or from int* to int64 in numpy 1.6.1 but not in
# 1.6.2. We do not want version dependent dtype in Theano.
# We think it should be the same as the output.
outputs[0][0] = numpy.asarray(out, dtype=node.outputs[0].dtype)
def grad(self, inputs, output_gradients):
num_ins = len(inputs)
if num_ins == 3:
x, v, sorter = inputs
else:
x, v = inputs
x_grad = gradient._float_zeros_like(x)
v_grad = gradient._float_zeros_like(v)
if num_ins == 3:
return [x_grad, v_grad, disconnected_type()]
else:
return [x_grad, v_grad]
def grad(self, inp, cost_grad):
"""
Notes
-----
The gradient is currently implemented for matrices only.
"""
a, val, offset = inp
grad = cost_grad[0]
height, width = grad.shape
if (a.dtype.startswith('complex')):
return [None, None, None]
# only valid for matrices
wr_a = fill_diagonal_offset(grad, 0, offset)
offset_abs = basic.abs_(offset)
pos_offset_flag = basic.ge(offset, 0)
neg_offset_flag = basic.lt(offset, 0)
min_wh = basic.minimum(width, height)
start = offset * pos_offset_flag + offset_abs * width * neg_offset_flag
num_of_step = basic.minimum(min_wh, width * pos_offset_flag +
height * neg_offset_flag - offset_abs)
step = a.shape[1] + 1
end = start + step * num_of_step
# input of slice should be integer
start = basic.cast(start, 'int32')
step = basic.cast(step, 'int32')
end = basic.cast(end, 'int32')
wr_val = grad.flatten()[start:end:step].sum()
wr_offset = theano.gradient.grad_undefined(
self, 2, offset,
"offset is not defined for non-integer offset so"
" fill_diagonal_offset(a,val,offset+eps) is undefined")
return [wr_a, wr_val, wr_offset]