def grad(self, inp, grads):
s, = inp
dt, = grads
if s.type.dtype in float_dtypes:
assert dt.type.dtype in float_dtypes
return [scalar_from_tensor(dt)]
# If the input dtype is an integer, then so is the output dtype,
# and the "zero" gradient can be represented in that int dtype.
# Currently, theano.grad insists that the dtype of the returned
# gradient has a float dtype, so we use floatX.
if s.type.dtype in discrete_dtypes:
return [s.zeros_like().astype(theano.config.floatX)]
raise NotImplementedError("grad not implemented for complex dtypes")
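# Illustrative sketch (not part of the original source): the floatX
# "zero gradient" convention used above for discrete dtypes; the variable
# name `s` is arbitrary.
import theano
import theano.tensor as T

s = T.lscalar('s')  # int64 input
zero_grad = s.zeros_like().astype(theano.config.floatX)
# zero_grad now has dtype theano.config.floatX, as theano.grad requires.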
def clip(x, min, max):
"""
Clip x to be between min and max.
Notes
-----
When `x` is equal to the boundaries, the output is considered
to be `x`, so at these points, the gradient of the cost wrt the output
    will be propagated to `x`, not to `min` or `max`. In other words,
    at these points, the gradient wrt `x` will be equal to the gradient wrt
    the output, and the gradient wrt `min` and `max` will be zero.
"""
# see decorator for function body
# for grep: clamp, bound
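# Illustrative sketch (not part of the original source) of the boundary
# behaviour described in the Notes above; variable names are arbitrary and
# the commented outputs are indicative.
import theano
import theano.tensor as T

x = T.dscalar('x')
y = T.clip(x, 0., 1.)
f = theano.function([x], theano.grad(y, x))
# f(0.5)  -> 1.0  (inside the interval)
# f(0.0)  -> 1.0  (on the boundary, the gradient still flows to x)
# f(-1.0) -> 0.0  (outside the interval)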
def grad(self, axis_and_tensors, grads):
""" The gradient wrt a join op is a `Split`, used to partition
the gradient along the `axis` which was used for joining.
"""
gz, = grads
axis, tensors = axis_and_tensors[0], axis_and_tensors[1:]
rval = [grad_undefined(self, 0, axis)]
dtypes = [as_tensor_variable(x).type.dtype for x in tensors]
out_dtype = scal.upcast(*dtypes)
if 'float' in out_dtype or 'complex' in out_dtype:
# assume that this is differentiable
split = Split(len(tensors))
split_gz = split(gz, axis, stack([shape(x)[axis]
for x in tensors]))
# If there is only one split, it might not be in a list.
if not isinstance(split_gz, list):
split_gz = [split_gz]
        # Split.make_node isn't always able to infer the right
        # broadcast pattern. Since the grad needs to keep that
        # information, read it from the inputs when needed.
split_gz = [patternbroadcast(g, t.broadcastable)
for t, g in zip(tensors, split_gz)]
rval = rval + split_gz
else:
# the output has integer type, so the gradient through it
# is 0
rval = rval + [tensor.zeros_like(dtype=config.floatX)
for tensor in tensors]
return rval
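# Illustrative sketch (not part of the original source): the gradient of a
# concatenation is split back along the join axis, one piece per joined
# input; variable names are arbitrary.
import theano
import theano.tensor as T

a = T.dvector('a')
b = T.dvector('b')
c = T.concatenate([a, b])
ga, gb = theano.grad((c ** 2).sum(), [a, b])
# ga == 2*a and gb == 2*b: each input receives its slice of d(cost)/dc.
f = theano.function([a, b], [ga, gb])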
def grad(self, inputs, output_gradients):
# If the output is of an integer dtype, no gradient shall pass
if 'int' in self.dtype:
return [ipt.zeros_like().astype(theano.config.floatX)
for ipt in inputs]
grads = []
for i, inp in enumerate(inputs):
grads.append(output_gradients[0][i])
return grads
def local_grad_clip(node):
if isinstance(node.op, theano.gradient.GradClip):
return node.inputs
def R_op(self, inputs, eval_points):
outs = self(*inputs, **dict(return_list=True))
rval = [None for x in outs]
# For each output
for idx, out in enumerate(outs):
        # arrange for _bgrad to compute only the gradients of the
        # current output w.r.t. the inputs (and not of all outputs)
ograds = [x.zeros_like() for x in outs]
ograds[idx] = theano.tensor.ones_like(out)
bgrads = self._bgrad(inputs, ograds)
rop_out = None
for jdx, (inp, eval_point) in enumerate(izip(inputs,
eval_points)):
            # If None, we can just ignore this branch: we assume that for
            # any non-differentiable branch the gradient is 0, which is
            # arguably not the right thing to do.
if bgrads[jdx] is None or \
isinstance(bgrads[jdx].type, DisconnectedType):
pass
elif eval_point is not None:
if rop_out is None:
rop_out = bgrads[jdx] * eval_point
else:
rop_out = rop_out + bgrads[jdx] * eval_point
rval[idx] = rop_out
return rval
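# Illustrative sketch (not part of the original source): for an elemwise
# graph the R-operator computed above is the Jacobian-vector product, i.e.
# the elementwise derivative times the evaluation point; names are
# arbitrary.
import theano
import theano.tensor as T

x = T.dvector('x')
v = T.dvector('v')
y = T.tanh(x)
jv = theano.gradient.Rop(y, x, v)  # elementwise (1 - tanh(x)**2) * v
f = theano.function([x, v], jv)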
def grad(self, inp, grads):
dy, sm, y_idx = inp
g_dx, = grads
    # The gradient w.r.t. dy is computed below using advanced indexing
    # (AdvancedIncSubtensor), although typically we should not need
    # the gradient w.r.t. dy.
y_idx_range = tensor.arange(y_idx.shape[0])
g_dy = tensor.sum(
g_dx * subtensor.AdvancedIncSubtensor()(
sm, tensor.fill(dy, -1), y_idx_range, y_idx), axis=1)
g_sm = dy.dimshuffle(0, 'x') * g_dx
g_y_idx = grad_not_implemented(self, 2, y_idx)
return [g_dy, g_sm, g_y_idx]
def grad(self, inp, grads):
img, ws, stride, pad = inp
grad, = grads
grad = gpu_contiguous(grad)
out = self(img, ws, stride, pad)
g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)
    return (g_out,
            theano.gradient.DisconnectedType()(),
            theano.gradient.DisconnectedType()(),
            theano.gradient.DisconnectedType()())
def grad(self, inputs, gout):
(x, y) = inputs
(gz,) = gout
if gz.type in complex_types:
# min is currently defined for complex_types,
# but the gradient for complex is not.
raise NotImplementedError()
output = minimum(x, y)
if output.type in discrete_types:
return [x.zeros_like().astype(theano.config.floatX),
y.zeros_like().astype(theano.config.floatX)]
gx = eq(output, x) * gz
gy = eq(output, y) * gz
return (gx, gy)
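# Illustrative sketch (not part of the original source): the eq() masks
# above route the gradient, so when x == y both inputs receive the full
# incoming gradient; names and commented outputs are indicative.
import theano
import theano.tensor as T

x = T.dscalar('x')
y = T.dscalar('y')
z = T.minimum(x, y)
f = theano.function([x, y], theano.grad(z, [x, y]))
# f(1., 2.) -> [1.0, 0.0]
# f(2., 2.) -> [1.0, 1.0]  (tie: both eq() masks are 1)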
def grad(self, inputs, gout):
(x, y) = inputs
(gz,) = gout
z = self(x, y)
if z.type.dtype in discrete_types:
# The gradient does not flow in if the output is discrete
return [x.zeros_like(dtype=theano.config.floatX),
y.zeros_like(dtype=theano.config.floatX)]
return [gz,
-(x // y) * gz]
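# Worked check (not part of the original source), assuming this is the
# gradient of a modulo-style op z = x - floor(x / y) * y with floor(x / y)
# treated as locally constant:
#   dz/dx = 1              ->  gx = gz
#   dz/dy = -floor(x / y)  ->  gy = -(x // y) * gz
# which matches the two expressions returned above.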
def __str__(self):
# args may have been inserted by e.g. makeTester
args_msg = ", ".join(str(a) for a in self.args)
return """\
GradientError: numeric gradient and analytic gradient exceed tolerance:
At position %i of argument %i,
abs. error = %f, abs. tolerance = %f
rel. error = %f, rel. tolerance = %f
Exception args: %s""" % (self.err_pos, self.arg,
self.abs_err, self.abs_tol,
self.rel_err, self.rel_tol,
args_msg)
def grad_clip(x, lower_bound, upper_bound):
"""
    This op is a view in the forward pass, but clips the gradient
    in the backward pass. This is an elemwise operation.
    :param x: the variable whose gradient we want clipped
:param lower_bound: The lower bound of the gradient value
:param upper_bound: The upper bound of the gradient value.
:examples:
x = theano.tensor.scalar()
z = theano.tensor.grad(grad_clip(x, -1, 1)**2, x)
z2 = theano.tensor.grad(x**2, x)
f = theano.function([x], outputs = [z, z2])
print(f(2.0)) # output (1.0, 4.0)
    :note: We register an optimization in tensor/opt.py that removes the GradClip.
        So it has zero cost in the forward pass and only does work in the grad.
"""
return GradClip(lower_bound, upper_bound)(x)
def structured_dot(x, y):
"""
    Structured Dot is like dot, except that only the
    gradients wrt the non-zero elements of the sparse matrix
    `x` are calculated and propagated.
    The output is presumed to be a dense matrix, and is represented by a
    TensorType instance.
    Parameters
    ----------
    x
        A sparse matrix.
    y
        A sparse or dense matrix.
    Returns
    -------
    A dense matrix (TensorVariable)
        The dot product of `x` and `y`.
    Notes
    -----
    The grad implemented is structured.
"""
# @todo: Maybe the triple-transposition formulation (when x is dense)
# is slow. See if there is a direct way to do this.
# (JB 20090528: Transposing tensors and sparse matrices is constant-time,
# inplace, and fast.)
if hasattr(x, 'getnnz'):
x = as_sparse_variable(x)
assert x.format in ["csr", "csc"]
if hasattr(y, 'getnnz'):
y = as_sparse_variable(y)
assert y.format in ["csr", "csc"]
x_is_sparse_variable = _is_sparse_variable(x)
y_is_sparse_variable = _is_sparse_variable(y)
if not x_is_sparse_variable and not y_is_sparse_variable:
raise TypeError('structured_dot requires at least one sparse argument')
if x_is_sparse_variable:
return _structured_dot(x, y)
else:
assert y_is_sparse_variable
return _structured_dot(y.T, x.T).T
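# Illustrative usage sketch (not part of the original source): multiplying
# a symbolic sparse matrix by a dense one; only gradients wrt the non-zero
# entries of the sparse operand are propagated. Names and dtypes here are
# arbitrary choices.
import numpy as np
import scipy.sparse as sp
import theano
import theano.sparse as sparse
import theano.tensor as T

x = sparse.csr_matrix(name='x', dtype='float64')  # symbolic CSR matrix
y = T.dmatrix('y')
z = sparse.structured_dot(x, y)                   # dense TensorVariable
f = theano.function([x, y], z)
out = f(sp.csr_matrix(np.eye(3)), np.ones((3, 2)))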
def grad(self, inp, grads):
# The strict sense mathematical gradient of the maximum function is
# not calculated here for it is not defined at every point where some
# coordinates are identical. However, since the latter set has null
# Lebesgue measure, the result may be interpreted as weak gradient.
    # @note: This function should work correctly for vectors.
# (x, y), (gz, gw)
# gz*dz/dx + gw*dw/dx, gz*dz/dy + gw*dw/dy
# gMax * dMax/dx + gArgMax * dArgMax/dx,
# gMax * dMax/daxis + gArgMax * dArgMax/daxis
    # g_max has one dimension less than x, so it needs to be expanded
    # back to x's shape; when axis=0, the broadcasting mechanism does
    # this automatically.
x, axis = inp
g_max, g_max_idx = grads
g_max_disconnected = isinstance(g_max.type, DisconnectedType)
g_max_idx_disconnected = isinstance(g_max_idx.type, DisconnectedType)
# if the op is totally disconnected, so are its inputs
if g_max_disconnected and g_max_idx_disconnected:
return [DisconnectedType()(), DisconnectedType()()]
axis_grad = grad_undefined(
self, 1, axis,
"argmax is not defined for non-integer axes so"
" argmax(x, axis+eps) is undefined")
# if the max is disconnected but the argmax is not,
# the gradient on its inputs is zero
if g_max_disconnected:
return [x.zeros_like(), axis_grad]
if NoneConst.equals(axis):
axis_ = list(range(x.ndim))
else:
axis_ = axis
xmax = max(x, axis_)
# Raise the g_max and xmax to the same number of dim as the input.
pattern = []
out_dim = 0
if NoneConst.equals(axis):
# We are taking the max/argmax over all dimensions.
axis = None
for i in xrange(x.ndim):
if axis is None or i in axis.data:
pattern.append('x')
else:
pattern.append(out_dim)
out_dim += 1
g_max_pad = DimShuffle(g_max.broadcastable, pattern)(g_max)
xmax_pad = DimShuffle(xmax.broadcastable, pattern)(xmax)
# Set the grad to the correct position.
g_x = eq(xmax_pad, x) * g_max_pad
return g_x, axis_grad
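# Illustrative sketch (not part of the original source): the "weak
# gradient" described above sends the incoming gradient to every position
# attaining the maximum, so ties each receive the full gradient; names and
# commented outputs are indicative.
import theano
import theano.tensor as T

x = T.dvector('x')
f = theano.function([x], theano.grad(T.max(x), x))
# f([1., 3., 2.]) -> [0., 1., 0.]
# f([3., 3., 2.]) -> [1., 1., 0.]  (both maximal entries get gradient 1)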
def flatten(x, outdim=1):
"""
Reshapes the variable x by keeping
the first outdim-1 dimension size(s) of x the same,
and making the last dimension size of x equal to
    the product of its remaining dimension size(s).
Parameters
----------
x : theano.tensor.var.TensorVariable
the variable that should be reshaped.
outdim : int
the number of dimensions of the returned variable
Returns
-------
theano.tensor.var.TensorVariable
        the flattened variable with dimensionality of outdim
"""
    # Any input variable can be flattened to have outdim of 1,
    # even if it's a scalar. Otherwise, outdim must be at least 1
    # and must not exceed x.ndim.
if outdim < 1 or (outdim > 1 and outdim > x.ndim):
raise ValueError('outdim %s out of bound [1, %d)'
% (outdim, x.ndim + 1))
if outdim > 1:
dims = tuple(x.shape[:outdim - 1]) + (-1,)
else:
dims = (-1,)
x_reshaped = x.reshape(dims)
bcast_kept_dims = x.broadcastable[:outdim - 1]
bcast_new_dim = python_all(x.broadcastable[outdim - 1:])
broadcastable = bcast_kept_dims + (bcast_new_dim,)
x_reshaped = theano.tensor.addbroadcast(
x_reshaped, *filter(lambda i: broadcastable[i], range(outdim)))
return x_reshaped
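# Illustrative sketch (not part of the original source): keeping the first
# outdim-1 dimensions and collapsing the rest; names are arbitrary.
import numpy as np
import theano
import theano.tensor as T

x = T.tensor3('x')
f = theano.function([x], T.flatten(x, outdim=2))
f(np.zeros((2, 3, 4), dtype=theano.config.floatX)).shape  # (2, 12)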
# class TileGrad(Op):
# """
# Calculates the gradient of the Tile Op.
# """
# # this is so weird, I can't think of how to make this a general thing.
# def make_node(self, x, reps, g_out):
# return gof.Apply(self, [x, reps, g_out], [x.type()])
#
# def perform(self, node, inp, out):
# x, reps, g_out = inp
# gx, = out
# xsh = x.shape
# if len(reps) == 2 and reps[1] == 1 and len(x.shape) == 1:
# gx[0] = numpy.sum(g_out, axis=0)
# else:
# raise NotImplementedError('x.shape, reps combination not '
# 'supported', (x.shape, reps))
#
# tilegrad = TileGrad()
def grad(self, inp, grads):
x, y, inverse = inp
gz, = grads
# First, compute the gradient wrt the broadcasted x.
# If 'inverse' is False (0), apply the inverse of y on gz.
# Else, apply y on gz.
gx = permute_row_elements(gz, y, eq(inverse, 0))
# If x has been broadcasted along some axes, we need to sum
# the gradient over these axes, but keep the dimension (as
# broadcastable)
broadcasted_dims = [dim for dim in xrange(gz.type.ndim)
if x.type.broadcastable[dim] and
not gz.type.broadcastable[dim]]
gx = Sum(axis=broadcasted_dims)(gx)
# Sum(...) removed the dimensions in broadcasted_dims,
# so we need to put them back.
newdims = []
i = 0
for dim in xrange(gz.type.ndim):
if dim in broadcasted_dims:
newdims.append('x')
else:
newdims.append(i)
i += 1
gx = DimShuffle(gx.type.broadcastable, newdims)(gx)
assert gx.type.broadcastable == x.type.broadcastable
# if x is an integer type, then so is the output.
# this means f(x+eps) = f(x) so the gradient with respect
# to x is zero
if x.type.dtype.find('int') != -1:
gx = x.zeros_like()
# The elements of y and of inverse both affect the output,
# so they are connected to the output,
# and the transformation isn't defined if their values
# are non-integer, so the gradient with respect to them is
# undefined
    return [gx, grad_undefined(self, 1, y),
            grad_undefined(self, 2, inverse)]
def _bgrad(self, inputs, ograds):
# returns grad, with respect to broadcasted versions of inputs
prev_setting = theano.config.compute_test_value
try:
theano.config.compute_test_value = 'off'
def as_scalar(t):
if isinstance(t.type, (NullType, DisconnectedType)):
return t
return get_scalar_type(t.type.dtype)()
scalar_inputs = list(map(as_scalar, inputs))
scalar_ograds = list(map(as_scalar, ograds))
scalar_igrads = self.scalar_op.grad(scalar_inputs, scalar_ograds)
for igrad in scalar_igrads:
assert igrad is not None, self.scalar_op
finally:
theano.config.compute_test_value = prev_setting
if not isinstance(scalar_igrads, (list, tuple)):
raise TypeError('%s.grad returned %s instead of list or tuple' %
(str(self.scalar_op), str(type(scalar_igrads))))
nd = len(inputs[0].type.broadcastable) # this is the same for everyone
def transform(r):
# From a graph of ScalarOps, make a graph of Broadcast ops.
if isinstance(r.type, (NullType, DisconnectedType)):
return r
if r in scalar_inputs:
return inputs[scalar_inputs.index(r)]
if r in scalar_ograds:
return ograds[scalar_ograds.index(r)]
node = r.owner
if node is None:
# the gradient contains a constant, translate it as
# an equivalent TensorType of size 1 and proper number of
# dimensions
res = theano.tensor.constant(numpy.asarray(r.data),
dtype=r.type.dtype)
return DimShuffle((), ['x'] * nd)(res)
new_r = Elemwise(node.op, {})(
*[transform(ipt) for ipt in node.inputs])
return new_r
ret = []
for scalar_igrad, ipt in izip(scalar_igrads, inputs):
if scalar_igrad is None:
# undefined gradient
ret.append(None)
continue
ret.append(transform(scalar_igrad))
return ret
def grad(self, inputs, output_gradients):
V, W, b, d = inputs
dCdH, = output_gradients
    # TODO: make all of these ops support broadcasting of scalar b to
    # vector b, and replace the zeros_like in all their grads.
    # Make sure the broadcasting pattern of the gradient is the same
    # as that of the initial variable.
dCdV = theano.tensor.nnet.convTransp3D(
W, T.zeros_like(V[0, 0, 0, 0, :]), d, dCdH, V.shape[1:4])
dCdV = T.patternbroadcast(dCdV, V.broadcastable)
WShape = W.shape
dCdW = theano.tensor.nnet.convGrad3D(V, d, WShape, dCdH)
dCdW = T.patternbroadcast(dCdW, W.broadcastable)
dCdb = T.sum(dCdH, axis=(0, 1, 2, 3))
dCdb = T.patternbroadcast(dCdb, b.broadcastable)
dCdd = grad_undefined(
self, 3, inputs[3],
"The gradient of Conv3D with respect to the convolution"
" stride is undefined because Conv3D is only defined for"
" integer strides.")
    # Build informative names for the gradients, falling back to
    # anonymous placeholders when the corresponding variable is unnamed.
    dCdH_name = dCdH.name if dCdH.name is not None else 'anon_dCdH'
    V_name = V.name if V.name is not None else 'anon_V'
    W_name = W.name if W.name is not None else 'anon_W'
    b_name = b.name if b.name is not None else 'anon_b'
dCdV.name = 'Conv3D_dCdV(dCdH=' + dCdH_name + ',V=' + V_name + ')'
dCdW.name = ('Conv3D_dCdW(dCdH=' + dCdH_name + ',V=' + V_name +
',W=' + W_name + ')')
dCdb.name = ('Conv3D_dCdb(dCdH=' + dCdH_name + ',V=' + V_name +
',W=' + W_name + ',b=' + b_name + ')')
return [dCdV, dCdW, dCdb, dCdd]
def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
"""
    Replace a `CrossentropySoftmax1HotWithBiasDx` op whose incoming gradient
    is an `alloc` of a scalar variable, or of a variable whose dimensions
    are either broadcastable or match those of the output, with an op that
    skips the intermediate `alloc`.
"""
if isinstance(node.op, CrossentropySoftmax1HotWithBiasDx):
dy, sm, y_idx = node.inputs
# Those cases are directly handled by the internal broadcasting of the
# `CrossentropySoftmax1HotWithBiasDx` op.
if dy.ndim == 0:
return False
if dy.ndim == 1 and dy.broadcastable[0]:
return False
assert dy.ndim == 1
if dy.owner is not None and isinstance(dy.owner.op, tensor.Alloc):
# dz is the input of the Alloc op, i.e. T.alloc(dz, <shape>)
dz = dy.owner.inputs[0]
try:
shape_feature = node.fgraph.shape_feature
except AttributeError:
# The shape feature may not be available in some mode, but we
# need it for this optimization, so don't continue.
return False
shape_of = shape_feature.shape_of
same_shape = shape_feature.same_shape
# Build `dz_broad` explicitly to include extra implicit dimensions.
dz_broad = (True,) * (dy.ndim - dz.ndim) + dz.broadcastable
# If we can infer statically that the shape of `sm` and
# `dy` are the same in dimension `k` or the shape of `dy` is equal
# to 1 (which triggers the internal broadcasting in
# `CrossentropySoftmax1HotWithBiasDx`) we do not need to
# check it at runtime.
if (dz_broad[0] and
not same_shape(sm, dy, dim_x=0, dim_y=0) and
shape_of[dy][0] != 1):
# If `dz` is broadcastable, we need to check whether the shapes
# of `dy` and `sm` are the same or whether the shape of `dy` is
# equal to 1.
cond = tensor.or_(tensor.eq(dy.shape[0], 1),
tensor.eq(dy.shape[0], sm.shape[0]))
msg = '`sm` and `dy` do not have the same shape.'
dz = opt.Assert(msg)(dz, cond)
ret = node.op(dz, sm, y_idx)
copy_stack_trace(node.outputs[0], ret)
return [ret]
def relu(x, alpha=0):
"""
Compute the element-wise rectified linear activation function.
.. versionadded:: 0.7.1
Parameters
----------
x : symbolic tensor
Tensor to compute the activation function for.
alpha : scalar or tensor, optional
Slope for negative input, usually between 0 and 1. The default value
of 0 will lead to the standard rectifier, 1 will lead to
a linear activation function, and any value in between will give a
leaky rectifier. A shared variable (broadcastable against `x`) will
result in a parameterized rectifier with learnable slope(s).
Returns
-------
symbolic tensor
Element-wise rectifier applied to `x`.
Notes
-----
This is numerically equivalent to ``T.switch(x > 0, x, alpha * x)``
(or ``T.maximum(x, alpha * x)`` for ``alpha < 1``), but uses a faster
    formulation or an optimized Op, so we encourage the use of this function.
"""
# This is probably the fastest implementation for GPUs. Both the forward
# pass and the gradient get compiled into a single GpuElemwise call.
# TODO: Check if it's optimal for CPU as well; add an "if" clause if not.
# TODO: Check if there's a faster way for the gradient; create an Op if so.
if alpha == 0:
return 0.5 * (x + abs(x))
else:
        # Convert alpha to a symbolic variable first: if alpha were a
        # numpy scalar, the Python constants 0.5 and 1 would be treated
        # as float64 and cause an upcast to float64.
alpha = tensor.as_tensor_variable(alpha)
f1 = 0.5 * (1 + alpha)
f2 = 0.5 * (1 - alpha)
return f1 * x + f2 * abs(x)
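# Illustrative usage sketch (not part of the original source): standard and
# leaky rectifiers built from the formulation above; names are arbitrary.
import theano
import theano.tensor as T
from theano.tensor.nnet import relu

x = T.matrix('x')
y = relu(x)               # max(x, 0)
y_leaky = relu(x, 0.01)   # slope 0.01 for negative inputs
f = theano.function([x], [y, y_leaky])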