def grad(self, inp, grads):
s, = inp
dt, = grads
if s.type.dtype in float_dtypes:
assert dt.type.dtype in float_dtypes
return [scalar_from_tensor(dt)]
# If the input dtype is an integer, then so is the output dtype,
# and the "zero" gradient can be represented in that int dtype.
# Currently, theano.grad insists that the dtype of the returned
# gradient has a float dtype, so we use floatX.
if s.type.dtype in discrete_dtypes:
return [s.zeros_like().astype(theano.config.floatX)]
raise NotImplementedError("grad not implemented for complex dtypes")
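# Illustrative sketch (not part of the original source): the floatX
# "zero gradient" convention used above for discrete dtypes; the variable
# name `s` is arbitrary.
import theano
import theano.tensor as T

s = T.lscalar('s')  # int64 input
zero_grad = s.zeros_like().astype(theano.config.floatX)
# zero_grad now has dtype theano.config.floatX, as theano.grad requires.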
def clip(x, min, max):
"""
Clip x to be between min and max.
Notes
-----
When `x` is equal to the boundaries, the output is considered
to be `x`, so at these points, the gradient of the cost wrt the output
    will be propagated to `x`, not to `min` or `max`. In other words,
    at these points, the gradient wrt `x` will be equal to the gradient wrt
    the output, and the gradient wrt `min` and `max` will be zero.
"""
# see decorator for function body
# for grep: clamp, bound
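# Illustrative sketch (not part of the original source) of the boundary
# behaviour described in the Notes above; variable names are arbitrary and
# the commented outputs are indicative.
import theano
import theano.tensor as T

x = T.dscalar('x')
y = T.clip(x, 0., 1.)
f = theano.function([x], theano.grad(y, x))
# f(0.5)  -> 1.0  (inside the interval)
# f(0.0)  -> 1.0  (on the boundary, the gradient still flows to x)
# f(-1.0) -> 0.0  (outside the interval)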
def grad(self, axis_and_tensors, grads):
""" The gradient wrt a join op is a `Split`, used to partition
the gradient along the `axis` which was used for joining.
"""
gz, = grads
axis, tensors = axis_and_tensors[0], axis_and_tensors[1:]
rval = [grad_undefined(self, 0, axis)]
dtypes = [as_tensor_variable(x).type.dtype for x in tensors]
out_dtype = scal.upcast(*dtypes)
if 'float' in out_dtype or 'complex' in out_dtype:
# assume that this is differentiable
split = Split(len(tensors))
split_gz = split(gz, axis, stack([shape(x)[axis]
for x in tensors]))
# If there is only one split, it might not be in a list.
if not isinstance(split_gz, list):
split_gz = [split_gz]
        # Split.make_node isn't always able to infer the right
        # broadcast pattern. Since the grad needs to keep that
        # information, read it from the inputs when needed.
split_gz = [patternbroadcast(g, t.broadcastable)
for t, g in zip(tensors, split_gz)]
rval = rval + split_gz
else:
# the output has integer type, so the gradient through it
# is 0
rval = rval + [tensor.zeros_like(dtype=config.floatX)
for tensor in tensors]
return rval
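# Illustrative sketch (not part of the original source): the gradient of a
# concatenation is split back along the join axis, one piece per joined
# input; variable names are arbitrary.
import theano
import theano.tensor as T

a = T.dvector('a')
b = T.dvector('b')
c = T.concatenate([a, b])
ga, gb = theano.grad((c ** 2).sum(), [a, b])
# ga == 2*a and gb == 2*b: each input receives its slice of d(cost)/dc.
f = theano.function([a, b], [ga, gb])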
def grad(self, inputs, output_gradients):
# If the output is of an integer dtype, no gradient shall pass
if 'int' in self.dtype:
return [ipt.zeros_like().astype(theano.config.floatX)
for ipt in inputs]
grads = []
for i, inp in enumerate(inputs):
grads.append(output_gradients[0][i])
return grads
def local_grad_clip(node):
if isinstance(node.op, theano.gradient.GradClip):
return node.inputs
def R_op(self, inputs, eval_points):
outs = self(*inputs, **dict(return_list=True))
rval = [None for x in outs]
# For each output
for idx, out in enumerate(outs):
        # arrange for _bgrad to compute only the gradients of the
        # current output w.r.t. the inputs (and not of all outputs)
ograds = [x.zeros_like() for x in outs]
ograds[idx] = theano.tensor.ones_like(out)
bgrads = self._bgrad(inputs, ograds)
rop_out = None
for jdx, (inp, eval_point) in enumerate(izip(inputs,
eval_points)):
            # If None, we can just ignore this branch: we assume that for
            # any non-differentiable branch the gradient is 0, which is
            # arguably not the right thing to do.
if bgrads[jdx] is None or \
isinstance(bgrads[jdx].type, DisconnectedType):
pass
elif eval_point is not None:
if rop_out is None:
rop_out = bgrads[jdx] * eval_point
else:
rop_out = rop_out + bgrads[jdx] * eval_point
rval[idx] = rop_out
return rval
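# Illustrative sketch (not part of the original source): for an elemwise
# graph the R-operator computed above is the Jacobian-vector product, i.e.
# the elementwise derivative times the evaluation point; names are
# arbitrary.
import theano
import theano.tensor as T

x = T.dvector('x')
v = T.dvector('v')
y = T.tanh(x)
jv = theano.gradient.Rop(y, x, v)  # elementwise (1 - tanh(x)**2) * v
f = theano.function([x, v], jv)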
def grad(self, inp, grads):
dy, sm, y_idx = inp
g_dx, = grads
    # The gradient w.r.t. dy is computed below using advanced indexing
    # (AdvancedIncSubtensor), although typically we should not need
    # the gradient w.r.t. dy.
y_idx_range = tensor.arange(y_idx.shape[0])
g_dy = tensor.sum(
g_dx * subtensor.AdvancedIncSubtensor()(
sm, tensor.fill(dy, -1), y_idx_range, y_idx), axis=1)
g_sm = dy.dimshuffle(0, 'x') * g_dx
g_y_idx = grad_not_implemented(self, 2, y_idx)
return [g_dy, g_sm, g_y_idx]
def grad(self, inp, grads):
img, ws, stride, pad = inp
grad, = grads
grad = gpu_contiguous(grad)
out = self(img, ws, stride, pad)
g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)
    return (g_out,
            theano.gradient.DisconnectedType()(),
            theano.gradient.DisconnectedType()(),
            theano.gradient.DisconnectedType()())
def grad(self, inputs, gout):
(x, y) = inputs
(gz,) = gout
if gz.type in complex_types:
# min is currently defined for complex_types,
# but the gradient for complex is not.
raise NotImplementedError()
output = minimum(x, y)
if output.type in discrete_types:
return [x.zeros_like().astype(theano.config.floatX),
y.zeros_like().astype(theano.config.floatX)]
gx = eq(output, x) * gz
gy = eq(output, y) * gz
return (gx, gy)
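# Illustrative sketch (not part of the original source): the eq() masks
# above route the gradient, so when x == y both inputs receive the full
# incoming gradient; names and commented outputs are indicative.
import theano
import theano.tensor as T

x = T.dscalar('x')
y = T.dscalar('y')
z = T.minimum(x, y)
f = theano.function([x, y], theano.grad(z, [x, y]))
# f(1., 2.) -> [1.0, 0.0]
# f(2., 2.) -> [1.0, 1.0]  (tie: both eq() masks are 1)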
def grad(self, inputs, gout):
(x, y) = inputs
(gz,) = gout
z = self(x, y)
if z.type.dtype in discrete_types:
# The gradient does not flow in if the output is discrete
return [x.zeros_like(dtype=theano.config.floatX),
y.zeros_like(dtype=theano.config.floatX)]
return [gz,
-(x // y) * gz]
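# Worked check (not part of the original source), assuming this is the
# gradient of a modulo-style op z = x - floor(x / y) * y with floor(x / y)
# treated as locally constant:
#   dz/dx = 1              ->  gx = gz
#   dz/dy = -floor(x / y)  ->  gy = -(x // y) * gz
# which matches the two expressions returned above.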
def __str__(self):
# args may have been inserted by e.g. makeTester
args_msg = ", ".join(str(a) for a in self.args)
return """\
GradientError: numeric gradient and analytic gradient exceed tolerance:
At position %i of argument %i,
abs. error = %f, abs. tolerance = %f
rel. error = %f, rel. tolerance = %f
Exception args: %s""" % (self.err_pos, self.arg,
self.abs_err, self.abs_tol,
self.rel_err, self.rel_tol,
args_msg)
def grad_clip(x, lower_bound, upper_bound):
"""
    This op is a view in the forward pass, but clips the gradient
    in the backward pass. This is an elemwise operation.
    :param x: the variable whose gradient we want clipped
:param lower_bound: The lower bound of the gradient value
:param upper_bound: The upper bound of the gradient value.
:examples:
x = theano.tensor.scalar()
z = theano.tensor.grad(grad_clip(x, -1, 1)**2, x)
z2 = theano.tensor.grad(x**2, x)
f = theano.function([x], outputs = [z, z2])
print(f(2.0)) # output (1.0, 4.0)
    :note: We register an optimization in tensor/opt.py that removes the GradClip.
        So it has zero cost in the forward pass and only does work in the grad.
"""
return GradClip(lower_bound, upper_bound)(x)
def structured_dot(x, y):
"""
    Structured Dot is like dot, except that only the
    gradients wrt the non-zero elements of the sparse matrix
    `x` are calculated and propagated.
    The output is presumed to be a dense matrix, and is represented by a
    TensorType instance.
    Parameters
    ----------
    x
        A sparse matrix.
    y
        A sparse or dense matrix.
    Returns
    -------
    A dense matrix (TensorVariable)
        The dot product of `x` and `y`.
    Notes
    -----
    The grad implemented is structured.
"""
# @todo: Maybe the triple-transposition formulation (when x is dense)
# is slow. See if there is a direct way to do this.
# (JB 20090528: Transposing tensors and sparse matrices is constant-time,
# inplace, and fast.)
if hasattr(x, 'getnnz'):
x = as_sparse_variable(x)
assert x.format in ["csr", "csc"]
if hasattr(y, 'getnnz'):
y = as_sparse_variable(y)
assert y.format in ["csr", "csc"]
x_is_sparse_variable = _is_sparse_variable(x)
y_is_sparse_variable = _is_sparse_variable(y)
if not x_is_sparse_variable and not y_is_sparse_variable:
raise TypeError('structured_dot requires at least one sparse argument')
if x_is_sparse_variable:
return _structured_dot(x, y)
else:
assert y_is_sparse_variable
return _structured_dot(y.T, x.T).T
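# Illustrative usage sketch (not part of the original source): multiplying
# a symbolic sparse matrix by a dense one; only gradients wrt the non-zero
# entries of the sparse operand are propagated. Names and dtypes here are
# arbitrary choices.
import numpy as np
import scipy.sparse as sp
import theano
import theano.sparse as sparse
import theano.tensor as T

x = sparse.csr_matrix(name='x', dtype='float64')  # symbolic CSR matrix
y = T.dmatrix('y')
z = sparse.structured_dot(x, y)                   # dense TensorVariable
f = theano.function([x, y], z)
out = f(sp.csr_matrix(np.eye(3)), np.ones((3, 2)))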
def grad(self, inp, grads):
# The strict sense mathematical gradient of the maximum function is
# not calculated here for it is not defined at every point where some
# coordinates are identical. However, since the latter set has null
# Lebesgue measure, the result may be interpreted as weak gradient.
    # @note: This function should work correctly for vectors.
# (x, y), (gz, gw)
# gz*dz/dx + gw*dw/dx, gz*dz/dy + gw*dw/dy
# gMax * dMax/dx + gArgMax * dArgMax/dx,
# gMax * dMax/daxis + gArgMax * dArgMax/daxis
    # g_max has one dimension less than x, so it needs to be expanded
    # back to x's shape; when axis=0, the broadcasting mechanism does
    # this automatically.
x, axis = inp
g_max, g_max_idx = grads
g_max_disconnected = isinstance(g_max.type, DisconnectedType)
g_max_idx_disconnected = isinstance(g_max_idx.type, DisconnectedType)
# if the op is totally disconnected, so are its inputs
if g_max_disconnected and g_max_idx_disconnected:
return [DisconnectedType()(), DisconnectedType()()]
axis_grad = grad_undefined(
self, 1, axis,
"argmax is not defined for non-integer axes so"
" argmax(x, axis+eps) is undefined")
# if the max is disconnected but the argmax is not,
# the gradient on its inputs is zero
if g_max_disconnected:
return [x.zeros_like(), axis_grad]
if NoneConst.equals(axis):
axis_ = list(range(x.ndim))
else:
axis_ = axis
xmax = max(x, axis_)
# Raise the g_max and xmax to the same number of dim as the input.
pattern = []
out_dim = 0
if NoneConst.equals(axis):
# We are taking the max/argmax over all dimensions.
axis = None
for i in xrange(x.ndim):
if axis is None or i in axis.data:
pattern.append('x')
else:
pattern.append(out_dim)
out_dim += 1
g_max_pad = DimShuffle(g_max.broadcastable, pattern)(g_max)
xmax_pad = DimShuffle(xmax.broadcastable, pattern)(xmax)
# Set the grad to the correct position.
g_x = eq(xmax_pad, x) * g_max_pad
return g_x, axis_grad
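# Illustrative sketch (not part of the original source): the "weak
# gradient" described above sends the incoming gradient to every position
# attaining the maximum, so ties each receive the full gradient; names and
# commented outputs are indicative.
import theano
import theano.tensor as T

x = T.dvector('x')
f = theano.function([x], theano.grad(T.max(x), x))
# f([1., 3., 2.]) -> [0., 1., 0.]
# f([3., 3., 2.]) -> [1., 1., 0.]  (both maximal entries get gradient 1)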
def flatten(x, outdim=1):
"""
Reshapes the variable x by keeping
the first outdim-1 dimension size(s) of x the same,
and making the last dimension size of x equal to
    the product of its remaining dimension size(s).
Parameters
----------
x : theano.tensor.var.TensorVariable
the variable that should be reshaped.
outdim : int
the number of dimensions of the returned variable
Returns
-------
theano.tensor.var.TensorVariable
        the flattened variable with dimensionality of outdim
"""
    # Any input variable can be flattened to have outdim of 1,
    # even if it's a scalar. Otherwise, outdim must be at least 1
    # and must not exceed x.ndim.
if outdim < 1 or (outdim > 1 and outdim > x.ndim):
raise ValueError('outdim %s out of bound [1, %d)'
% (outdim, x.ndim + 1))
if outdim > 1:
dims = tuple(x.shape[:outdim - 1]) + (-1,)
else:
dims = (-1,)
x_reshaped = x.reshape(dims)
bcast_kept_dims = x.broadcastable[:outdim - 1]
bcast_new_dim = python_all(x.broadcastable[outdim - 1:])
broadcastable = bcast_kept_dims + (bcast_new_dim,)
x_reshaped = theano.tensor.addbroadcast(
x_reshaped, *filter(lambda i: broadcastable[i], range(outdim)))
return x_reshaped
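# Illustrative sketch (not part of the original source): keeping the first
# outdim-1 dimensions and collapsing the rest; names are arbitrary.
import numpy as np
import theano
import theano.tensor as T

x = T.tensor3('x')
f = theano.function([x], T.flatten(x, outdim=2))
f(np.zeros((2, 3, 4), dtype=theano.config.floatX)).shape  # (2, 12)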
# class TileGrad(Op):
# """
# Calculates the gradient of the Tile Op.
# """
# # this is so weird, I can't think of how to make this a general thing.
# def make_node(self, x, reps, g_out):
# return gof.Apply(self, [x, reps, g_out], [x.type()])
#
# def perform(self, node, inp, out):
# x, reps, g_out = inp
# gx, = out
# xsh = x.shape
# if len(reps) == 2 and reps[1] == 1 and len(x.shape) == 1:
# gx[0] = numpy.sum(g_out, axis=0)
# else:
# raise NotImplementedError('x.shape, reps combination not '
# 'supported', (x.shape, reps))
#
# tilegrad = TileGrad()
def grad(self, inp, grads):
x, y, inverse = inp
gz, = grads
# First, compute the gradient wrt the broadcasted x.
# If 'inverse' is False (0), apply the inverse of y on gz.
# Else, apply y on gz.
gx = permute_row_elements(gz, y, eq(inverse, 0))
# If x has been broadcasted along some axes, we need to sum
# the gradient over these axes, but keep the dimension (as
# broadcastable)
broadcasted_dims = [dim for dim in xrange(gz.type.ndim)
if x.type.broadcastable[dim] and
not gz.type.broadcastable[dim]]
gx = Sum(axis=broadcasted_dims)(gx)
# Sum(...) removed the dimensions in broadcasted_dims,
# so we need to put them back.
newdims = []
i = 0
for dim in xrange(gz.type.ndim):
if dim in broadcasted_dims:
newdims.append('x')
else:
newdims.append(i)
i += 1
gx = DimShuffle(gx.type.broadcastable, newdims)(gx)
assert gx.type.broadcastable == x.type.broadcastable
# if x is an integer type, then so is the output.
# this means f(x+eps) = f(x) so the gradient with respect
# to x is zero
if x.type.dtype.find('int') != -1:
gx = x.zeros_like()
# The elements of y and of inverse both affect the output,
# so they are connected to the output,
# and the transformation isn't defined if their values
# are non-integer, so the gradient with respect to them is
# undefined
    return [gx, grad_undefined(self, 1, y),
            grad_undefined(self, 2, inverse)]
def _bgrad(self, inputs, ograds):
# returns grad, with respect to broadcasted versions of inputs
prev_setting = theano.config.compute_test_value
try:
theano.config.compute_test_value = 'off'
def as_scalar(t):
if isinstance(t.type, (NullType, DisconnectedType)):
return t
return get_scalar_type(t.type.dtype)()
scalar_inputs = list(map(as_scalar, inputs))
scalar_ograds = list(map(as_scalar, ograds))
scalar_igrads = self.scalar_op.grad(scalar_inputs, scalar_ograds)
for igrad in scalar_igrads:
assert igrad is not None, self.scalar_op
finally:
theano.config.compute_test_value = prev_setting
if not isinstance(scalar_igrads, (list, tuple)):
raise TypeError('%s.grad returned %s instead of list or tuple' %
(str(self.scalar_op), str(type(scalar_igrads))))
nd = len(inputs[0].type.broadcastable) # this is the same for everyone
def transform(r):
# From a graph of ScalarOps, make a graph of Broadcast ops.
if isinstance(r.type, (NullType, DisconnectedType)):
return r
if r in scalar_inputs:
return inputs[scalar_inputs.index(r)]
if r in scalar_ograds:
return ograds[scalar_ograds.index(r)]
node = r.owner
if node is None:
# the gradient contains a constant, translate it as
# an equivalent TensorType of size 1 and proper number of
# dimensions
res = theano.tensor.constant(numpy.asarray(r.data),
dtype=r.type.dtype)
return DimShuffle((), ['x'] * nd)(res)
new_r = Elemwise(node.op, {})(
*[transform(ipt) for ipt in node.inputs])
return new_r
ret = []
for scalar_igrad, ipt in izip(scalar_igrads, inputs):
if scalar_igrad is None:
# undefined gradient
ret.append(None)
continue
ret.append(transform(scalar_igrad))
return ret
def grad(self, inputs, output_gradients):
V, W, b, d = inputs
dCdH, = output_gradients
    # TODO: make all of these ops support broadcasting of scalar b to
    # vector b, and replace the zeros_like in all their grads.
    # Make sure the broadcasting pattern of the gradient is the same
    # as that of the initial variable.
dCdV = theano.tensor.nnet.convTransp3D(
W, T.zeros_like(V[0, 0, 0, 0, :]), d, dCdH, V.shape[1:4])
dCdV = T.patternbroadcast(dCdV, V.broadcastable)
WShape = W.shape
dCdW = theano.tensor.nnet.convGrad3D(V, d, WShape, dCdH)
dCdW = T.patternbroadcast(dCdW, W.broadcastable)
dCdb = T.sum(dCdH, axis=(0, 1, 2, 3))
dCdb = T.patternbroadcast(dCdb, b.broadcastable)
dCdd = grad_undefined(
self, 3, inputs[3],
"The gradient of Conv3D with respect to the convolution"
" stride is undefined because Conv3D is only defined for"
" integer strides.")
    # Build informative names for the gradients, falling back to
    # anonymous placeholders when the corresponding variable is unnamed.
    dCdH_name = dCdH.name if dCdH.name is not None else 'anon_dCdH'
    V_name = V.name if V.name is not None else 'anon_V'
    W_name = W.name if W.name is not None else 'anon_W'
    b_name = b.name if b.name is not None else 'anon_b'
dCdV.name = 'Conv3D_dCdV(dCdH=' + dCdH_name + ',V=' + V_name + ')'
dCdW.name = ('Conv3D_dCdW(dCdH=' + dCdH_name + ',V=' + V_name +
',W=' + W_name + ')')
dCdb.name = ('Conv3D_dCdb(dCdH=' + dCdH_name + ',V=' + V_name +
',W=' + W_name + ',b=' + b_name + ')')
return [dCdV, dCdW, dCdb, dCdd]
def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
"""
    Replace a `CrossentropySoftmax1HotWithBiasDx` op whose incoming gradient
    is an `alloc` of a scalar variable, or of a variable whose dimensions
    are either broadcastable or match those of the output, with an op that
    skips the intermediate `alloc`.
"""
if isinstance(node.op, CrossentropySoftmax1HotWithBiasDx):
dy, sm, y_idx = node.inputs
# Those cases are directly handled by the internal broadcasting of the
# `CrossentropySoftmax1HotWithBiasDx` op.
if dy.ndim == 0:
return False
if dy.ndim == 1 and dy.broadcastable[0]:
return False
assert dy.ndim == 1
if dy.owner is not None and isinstance(dy.owner.op, tensor.Alloc):
# dz is the input of the Alloc op, i.e. T.alloc(dz, <shape>)
dz = dy.owner.inputs[0]
try:
shape_feature = node.fgraph.shape_feature
except AttributeError:
# The shape feature may not be available in some mode, but we
# need it for this optimization, so don't continue.
return False
shape_of = shape_feature.shape_of
same_shape = shape_feature.same_shape
# Build `dz_broad` explicitly to include extra implicit dimensions.
dz_broad = (True,) * (dy.ndim - dz.ndim) + dz.broadcastable
# If we can infer statically that the shape of `sm` and
# `dy` are the same in dimension `k` or the shape of `dy` is equal
# to 1 (which triggers the internal broadcasting in
# `CrossentropySoftmax1HotWithBiasDx`) we do not need to
# check it at runtime.
if (dz_broad[0] and
not same_shape(sm, dy, dim_x=0, dim_y=0) and
shape_of[dy][0] != 1):
# If `dz` is broadcastable, we need to check whether the shapes
# of `dy` and `sm` are the same or whether the shape of `dy` is
# equal to 1.
cond = tensor.or_(tensor.eq(dy.shape[0], 1),
tensor.eq(dy.shape[0], sm.shape[0]))
msg = '`sm` and `dy` do not have the same shape.'
dz = opt.Assert(msg)(dz, cond)
ret = node.op(dz, sm, y_idx)
copy_stack_trace(node.outputs[0], ret)
return [ret]
def relu(x, alpha=0):
"""
Compute the element-wise rectified linear activation function.
.. versionadded:: 0.7.1
Parameters
----------
x : symbolic tensor
Tensor to compute the activation function for.
alpha : scalar or tensor, optional
Slope for negative input, usually between 0 and 1. The default value
of 0 will lead to the standard rectifier, 1 will lead to
a linear activation function, and any value in between will give a
leaky rectifier. A shared variable (broadcastable against `x`) will
result in a parameterized rectifier with learnable slope(s).
Returns
-------
symbolic tensor
Element-wise rectifier applied to `x`.
Notes
-----
This is numerically equivalent to ``T.switch(x > 0, x, alpha * x)``
(or ``T.maximum(x, alpha * x)`` for ``alpha < 1``), but uses a faster
    formulation or an optimized Op, so we encourage the use of this function.
"""
# This is probably the fastest implementation for GPUs. Both the forward
# pass and the gradient get compiled into a single GpuElemwise call.
# TODO: Check if it's optimal for CPU as well; add an "if" clause if not.
# TODO: Check if there's a faster way for the gradient; create an Op if so.
if alpha == 0:
return 0.5 * (x + abs(x))
else:
        # Convert alpha to a symbolic variable first: if alpha were a
        # numpy scalar, the Python constants 0.5 and 1 would be treated
        # as float64 and cause an upcast to float64.
alpha = tensor.as_tensor_variable(alpha)
f1 = 0.5 * (1 + alpha)
f2 = 0.5 * (1 - alpha)
return f1 * x + f2 * abs(x)
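# Illustrative usage sketch (not part of the original source): standard and
# leaky rectifiers built from the formulation above; names are arbitrary.
import theano
import theano.tensor as T
from theano.tensor.nnet import relu

x = T.matrix('x')
y = relu(x)               # max(x, 0)
y_leaky = relu(x, 0.01)   # slope 0.01 for negative inputs
f = theano.function([x], [y, y_leaky])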