def print_graph_linker(print_prog=True):
    """Build a linker that optionally prints every node as it is executed.

    Parameters
    ----------
    print_prog : bool
        When true, each executed node is printed as
        ``node  <index> <node> <i0>:<i1>:...`` where ``<iN>`` is the index
        previously recorded for the node that owns the N-th input (``-``
        when the input has no owner).

    Returns
    -------
    theano.sandbox.wraplinker.WrapLinkerMany
        A linker wrapping a single ``OpWiseCLinker`` with two wrappers:
        ``run_all`` and the tracing callback defined below.
    """
    # Maps an apply node to the (stringified) index at which it was run.
    # ``None`` stands for "no owner", i.e. a graph input or constant.
    node_index = {None: '-'}

    def trace_node(i, node, thunk):
        # The index is always recorded, even when printing is disabled, so
        # that later nodes can refer to this one.
        node_index[node] = str(i)
        if print_prog:
            print('node ', i, node, end=' ')
            print(':'.join([node_index[inp.owner] for inp in node.inputs]))

    return theano.sandbox.wraplinker.WrapLinkerMany(
        [theano.gof.OpWiseCLinker()],
        [theano.sandbox.wraplinker.run_all, trace_node])
# Example source code for the Python class sandbox()
def test_output_broadcast_cuda(self):
    """Compile and run ``VecAsRowAndCol`` on a CUDA float vector.

    The test only checks that compiling and calling the function does not
    raise. It is skipped when CUDA is not available.
    """
    from theano.sandbox import cuda as gpu

    if not gpu.cuda_available:
        raise SkipTest("Optional package Cuda disabled")

    gpu_not_initialised = gpu.use.device_number is None
    if gpu_not_initialised:
        # We should normally set VecAsRowAndCol as a GPUOp, but we don't
        # want to do this here as it would disable other tests in this
        # file. So we manually init the GPU if needed, to remove a warning.
        gpu.use("gpu",
                force=True,
                default_to_move_computation_to_gpu=False,
                move_shared_float32_to_gpu=False,
                enable_cuda=False)

    vec = gpu.fvector('v')
    out_c, out_r = VecAsRowAndCol()(vec)
    compiled = theano.function([vec], [out_c, out_r])

    vec_value = gpu.CudaNdarray(self.rng.randn(5).astype('float32'))
    compiled(vec_value)
def test_simple_shared_mrg_random(self):
    """Run a 5-step scan that draws uniform samples from an MRG stream.

    The compiled function is called twice; only run-time errors are
    checked, the sampled values are not inspected.
    """
    mrg_streams = theano.sandbox.rng_mrg.MRG_RandomStreams(utt.fetch_seed())

    def draw_step():
        return mrg_streams.uniform((2,), -1, 1)

    scan_values, scan_updates = theano.scan(draw_step,
                                            [],
                                            [],
                                            [],
                                            n_steps=5,
                                            truncate_gradient=-1,
                                            go_backwards=False)
    sampler = theano.function([],
                              scan_values,
                              updates=scan_updates,
                              allow_input_downcast=True)

    # Just check for run-time errors
    for _ in range(2):
        sampler()
def compile_sampling(self, data_train, data_valid, data_test, training_n_samples):
    """Compile the Theano functions that sample every layer for one batch.

    Sets four attributes on ``self``:

    * ``sample_convergence(batch, n_samples)`` -- reads from ``data_valid``
      and takes the number of samples as an argument.
    * ``sample_train(batch)``, ``sample_valid(batch)``,
      ``sample_test(batch)`` -- read from the corresponding data set with
      the number of samples fixed to ``training_n_samples``.

    Each function returns the list of per-layer samples, where the first
    entry is ``replicate_batch(X, n_samples)`` and every following entry
    comes from ``self.compute_samples`` applied to the previous one.
    """
    X = tt.matrix('X')
    batch = tt.iscalar('batch')
    n_samples = tt.iscalar('n_samples')

    n_layers = len(self.layers)
    layer_samples = [None] * n_layers
    layer_samples[0] = replicate_batch(X, n_samples)

    # Pick the random stream implementation from the configured device.
    if "gpu" in theano.config.device:
        from theano.sandbox import rng_mrg
        srng = rng_mrg.MRG_RandomStreams(seed=42)
    else:
        srng = tt.shared_randomstreams.RandomStreams(seed=42)

    for below in range(n_layers - 1):
        layer_samples[below + 1] = self.compute_samples(
            srng, layer_samples[below], below)

    def minibatch(dataset):
        # Symbolic slice selecting mini-batch number ``batch``.
        return dataset[batch * self.batch_size:(batch + 1) * self.batch_size]

    self.sample_convergence = theano.function(
        [batch, n_samples], layer_samples,
        givens={X: minibatch(data_valid)})

    fixed_n_samples = np.int32(training_n_samples)

    def compile_fixed(dataset):
        # Same graph, but with the number of samples substituted.
        return theano.function(
            [batch], layer_samples,
            givens={X: minibatch(dataset), n_samples: fixed_n_samples})

    self.sample_train = compile_fixed(data_train)
    self.sample_valid = compile_fixed(data_valid)
    self.sample_test = compile_fixed(data_test)
def time_theano_fn(fn, index, GPU_bool):
    """Time a single call ``fn(index)`` and return the duration.

    Parameters
    ----------
    fn : callable
        Function to time; called exactly once as ``fn(index)``.
    index
        Argument forwarded to ``fn``.
    GPU_bool : bool
        When true, ``theano.sandbox.cuda.synchronize()`` is called before
        starting and before stopping the clock.

    Returns
    -------
    float
        Elapsed time in milliseconds.
    """
    # ``time.time()`` follows the system clock and can jump (even backwards)
    # while the call is running; ``default_timer`` is the clock the standard
    # library recommends for measuring intervals.
    from timeit import default_timer

    if GPU_bool:
        theano.sandbox.cuda.synchronize()
    start = default_timer()
    fn(index)
    if GPU_bool:
        theano.sandbox.cuda.synchronize()
    return (default_timer() - start) * 1000
def print_mem(context=None):
    """Print used, free and total GPU memory (in Mb) when CUDA is enabled.

    Nothing is printed when ``theano.sandbox.cuda.cuda_enabled`` is false.

    Parameters
    ----------
    context : object, optional
        When given, its ``str()`` is appended to the printed line as a
        ``[context ...]`` tag.
    """
    if not theano.sandbox.cuda.cuda_enabled:
        return

    rvals = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
    # Available memory in Mb
    available = float(rvals[0]) / 1024. / 1024.
    # Total memory in Mb
    total = float(rvals[1]) / 1024. / 1024.
    used = total - available

    # Identity test: ``== None`` would call a user-defined ``__eq__``.
    if context is None:
        print ('Used %.3f Mb Free %.3f Mb, total %.3f Mb' %
               (used, available, total))
    else:
        info = str(context)
        print (('GPU status : Used %.3f Mb Free %.3f Mb,'
                'total %.3f Mb [context %s]') %
               (used, available, total, info))
def gpu_mem_free():
    """
    Memory free on the GPU

    The ``cuda`` module is imported on first use and stored in the
    module-level ``cuda`` name.

    Returns
    -------
    megs_free : float
        Number of megabytes of memory free on the GPU used by Theano
    """
    global cuda
    if cuda is None:
        from theano.sandbox import cuda

    free_bytes = cuda.mem_info()[0]
    megs_free = free_bytes / 1024. / 1024
    return megs_free
def print_mem(context=None):
    """Print used, free and total GPU memory (in Mb) when CUDA is enabled.

    Nothing is printed when ``theano.sandbox.cuda.cuda_enabled`` is false.

    Parameters
    ----------
    context : object, optional
        When given, its ``str()`` is appended to the printed line as a
        ``[context ...]`` tag.
    """
    if not theano.sandbox.cuda.cuda_enabled:
        return

    rvals = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
    # Available memory in Mb
    available = float(rvals[0]) / 1024. / 1024.
    # Total memory in Mb
    total = float(rvals[1]) / 1024. / 1024.
    used = total - available

    # Identity test: ``== None`` would call a user-defined ``__eq__``.
    if context is None:
        print ('Used %.3f Mb Free %.3f Mb, total %.3f Mb' %
               (used, available, total))
    else:
        info = str(context)
        print (('GPU status : Used %.3f Mb Free %.3f Mb,'
                'total %.3f Mb [context %s]') %
               (used, available, total, info))
def gpu_mem_free():
    """
    Memory free on the GPU

    The ``cuda`` module is imported on first use and stored in the
    module-level ``cuda`` name.

    Returns
    -------
    megs_free : float
        Number of megabytes of memory free on the GPU used by Theano
    """
    global cuda
    if cuda is None:
        from theano.sandbox import cuda

    free_bytes = cuda.mem_info()[0]
    megs_free = free_bytes / 1024. / 1024
    return megs_free
def test_multinomial_0():
    """Exercise ``MultinomialFromUniform`` directly for several dtypes.

    This does not go through the ``multinomial()`` call of the GPU random
    generator. For each dtype the compiled graph must contain a
    ``GPUAMultinomialFromUniform`` node and produce the expected samples.
    """
    pvals_var = tensor.fmatrix()
    unis_var = tensor.fvector()

    for out_dtype in ['int64', 'float32', 'auto']:
        sample = theano.sandbox.multinomial.MultinomialFromUniform(
            out_dtype)(pvals_var, unis_var)

        # the m*2 allows the multinomial to reuse output
        fn = function([pvals_var, unis_var], sample * 2,
                      allow_input_downcast=True, mode=mode_with_gpu)

        topo = fn.maker.fgraph.toposort()
        assert any(type(apply_node.op) is GPUAMultinomialFromUniform
                   for apply_node in topo)

        # test that both first and second samples can be drawn
        drawn = fn([[1, 0], [0, 1]], [.1, .1])
        utt.assert_allclose(drawn, [[2, 0], [0, 2]])

        # test that both second labels can be drawn
        drawn = fn([[.2, .8], [.3, .7]], [.31, .31])
        utt.assert_allclose(drawn, [[0, 2], [0, 2]])

        # with a lower uniform value the second row switches to its
        # first label while the first row still draws its second label
        drawn = fn([[.2, .8], [.3, .7]], [.21, .21])
        utt.assert_allclose(drawn, [[0, 2], [2, 0]])

        # change the size to make sure output gets reallocated ok
        # and also make sure that the GPU version doesn't screw up the
        # transposed-ness
        drawn = fn([[.2, .8]], [.25])
        utt.assert_allclose(drawn, [[0, 2]])
# TODO: check a bigger example (make sure blocking on GPU is handled correctly)
def test_multinomial_large():
    """Sample 10000 rows at once and check shape, dtype and values.

    DEBUG_MODE will test this on GPU.
    """
    pvals_var = tensor.fmatrix()
    unis_var = tensor.fvector()
    sample = theano.sandbox.multinomial.MultinomialFromUniform('auto')(
        pvals_var, unis_var)
    fn = function([pvals_var, unis_var], sample * 2,
                  allow_input_downcast=True, mode=mode_with_gpu)
    assert any(type(apply_node.op) is GPUAMultinomialFromUniform
               for apply_node in fn.maker.fgraph.toposort())

    n_rows, n_cols = 10000, 4
    pval = numpy.arange(n_rows * n_cols,
                        dtype='float32').reshape((n_rows, n_cols)) + 0.1
    # Normalise every row so that it sums to one.
    pval = pval / pval.sum(axis=1)[:, None]
    uval = numpy.ones_like(pval[:, 0]) * 0.5

    mval = fn(pval, uval)
    assert mval.shape == pval.shape

    # The expected output dtype depends on the configured cast policy.
    policy = config.cast_policy
    expected_dtype = {
        'custom': pval.dtype,
        'numpy+floatX': config.floatX,
        'numpy': 'float64',
    }
    if policy not in expected_dtype:
        raise NotImplementedError(config.cast_policy)
    assert mval.dtype == expected_dtype[policy]

    utt.assert_allclose(mval.sum(axis=1), 2)
    expected = numpy.asarray([0, 0, 2, 0]) + 0 * pval
    utt.assert_allclose(mval, expected)  # broadcast over all rows
def test_gpu_opt_dtypes():
    """Check that the returned samples have the requested dtype."""
    n_rows, n_cols = 10000, 4

    for wanted in ['uint32', 'float32', 'int64', 'float64']:
        pvals_var = tensor.fmatrix()
        unis_var = tensor.fvector()
        sample = theano.sandbox.multinomial.MultinomialFromUniform(wanted)(
            pvals_var, unis_var)

        fn = function([pvals_var, unis_var], sample,
                      allow_input_downcast=True, mode=mode_with_gpu)
        assert any(type(apply_node.op) is GPUAMultinomialFromUniform
                   for apply_node in fn.maker.fgraph.toposort())

        pval = numpy.arange(n_rows * n_cols,
                            dtype='float32').reshape((n_rows, n_cols)) + 0.1
        # Normalise every row so that it sums to one.
        pval = pval / pval.sum(axis=1)[:, None]
        uval = numpy.ones_like(pval[:, 0]) * 0.5

        drawn = fn(pval, uval)
        assert drawn.dtype == wanted, "%s != %s" % (drawn.dtype, wanted)
def test_gpu_opt():
    """Check the op is put on the GPU when its output is moved to the GPU.

    Does have some overlap with test_multinomial_0. The same check is run
    once with a matrix of probabilities and once with a row.
    """
    pvals_matrix = tensor.fmatrix()
    unis_var = tensor.fvector()

    def check_on_gpu(pvals_var, n_rows):
        # Build, compile and run the op for ``n_rows`` rows of 4 values.
        sample = theano.sandbox.multinomial.MultinomialFromUniform('auto')(
            pvals_var, unis_var)
        assert sample.dtype == 'float32', sample.dtype

        fn = function([pvals_var, unis_var], sample,
                      allow_input_downcast=True, mode=mode_with_gpu)
        assert any(type(apply_node.op) is GPUAMultinomialFromUniform
                   for apply_node in fn.maker.fgraph.toposort())

        pval = numpy.arange(n_rows * 4,
                            dtype='float32').reshape((n_rows, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = numpy.ones_like(pval[:, 0]) * 0.5
        fn(pval, uval)

    check_on_gpu(pvals_matrix, 10000)

    # Test with a row, it was failing in the past.
    check_on_gpu(tensor.frow(), 1)
def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs):
    """Initialise empty profiling tables and optionally register at exit.

    Parameters
    ----------
    atexit_print : bool
        When true, this object is appended to ``_atexit_print_list`` and
        ``_atexit_print_fn`` is registered with ``atexit`` (once only).
    flag_time_thunks : bool or None
        Stored as ``self.flag_time_thunks``; ``None`` means "use
        ``config.profiling.time_thunks``".
    **kwargs
        Copied directly into the instance ``__dict__``.

    Raises
    ------
    Exception
        If CUDA is enabled in Theano and the ``CUDA_LAUNCH_BLOCKING``
        environment variable is not set to ``'1'``.
    """
    global _atexit_print_list, _atexit_registered

    cuda_is_enabled = (hasattr(theano, 'sandbox') and
                       hasattr(theano.sandbox, 'cuda') and
                       theano.sandbox.cuda.cuda_enabled)
    if cuda_is_enabled:
        launch_blocking = os.environ.get('CUDA_LAUNCH_BLOCKING', '0')
        if launch_blocking != '1':
            raise Exception(
                "You are running the Theano profiler with CUDA enabled."
                " Theano GPU ops execution is asynchronous by default."
                " So by default, the profile is useless."
                " You must set the environment variable"
                " CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA driver to"
                " synchronize the execution to get a meaningful profile.")

    # Every table starts out as its own empty dict.
    for table_name in ('apply_callcount', 'output_size', 'apply_time',
                       'apply_cimpl', 'variable_shape', 'variable_strides'):
        setattr(self, table_name, {})

    self.flag_time_thunks = (config.profiling.time_thunks
                             if flag_time_thunks is None
                             else flag_time_thunks)
    self.__dict__.update(kwargs)

    if atexit_print:
        _atexit_print_list.append(self)
        if not _atexit_registered:
            atexit.register(_atexit_print_fn)
            _atexit_registered = True

    self.ignore_first_call = theano.config.profiling.ignore_first_call
def __init__(self, *args, **kwargs):
    """Store the CUDA backend and the GPU modes, then defer to the parent.

    All positional and keyword arguments are forwarded unchanged to the
    parent class initialiser.
    """
    from theano.sandbox import cuda as cuda_backend

    self.gpu_backend = cuda_backend
    self.mode_with_gpu = mode_with_gpu
    self.mode_with_gpu_nodebug = mode_with_gpu_nodebug

    parent = super(T_Scan_Cuda, self)
    parent.__init__(*args, **kwargs)
def test_consistent_inner_fct(self):
    """Check that scan does not falsely detect inconsistencies in a valid
    inner graph.

    The scan output is round-tripped through pickle, then compiled with
    the GPU mode; the compiled function must contain exactly one scan
    node and that node must be on the GPU.
    """
    streams = theano.sandbox.rng_mrg.MRG_RandomStreams(use_cuda=True)

    def inner_step():
        return streams.uniform((3,), dtype="float32")

    scan_out, _ = theano.scan(inner_step, n_steps=3)

    pickled = pickle.dumps(scan_out)
    pickle.loads(pickled)

    # Also ensure that, after compilation, the Scan has been moved
    # on the gpu
    compiled = theano.function([], scan_out, mode=self.mode_with_gpu)
    found_scans = scan_nodes_from_fct(compiled)
    assert len(found_scans) == 1
    assert self.is_scan_on_gpu(found_scans[0])
def test_n_samples_compatibility():
    """
    Check that MultinomialFromUniform is still compatible with graphs
    pickled through its old interface.

    The graph loaded here was created (using the old interface) as follows:

    RandomStreams = theano.sandbox.rng_mrg.MRG_RandomStreams
    th_rng = RandomStreams(12345)
    X = T.matrix('X')
    pvals = T.exp(X)
    pvals = pvals / pvals.sum(axis=1, keepdims=True)
    samples = th_rng.multinomial(pvals=pvals)
    pickle.dump([X, samples], open("multinomial_test_graph.pkl", "w"))
    """
    here = os.path.dirname(os.path.abspath(__file__))
    graph_path = os.path.join(here, "multinomial_test_graph.pkl")

    # The ``encoding`` argument is only passed on Python 3.
    unpickler_kwargs = {"encoding": "latin1"} if PY3 else {}

    with open(graph_path, "rb") as pkl_file:
        unpickler = CompatUnpickler(pkl_file, **unpickler_kwargs)
        try:
            X, samples = unpickler.load()
        except ImportError:
            # Windows sometimes fails with nonsensical errors like:
            #   ImportError: No module named type
            #   ImportError: No module named copy_reg
            # when "type" and "copy_reg" are builtin modules.
            # On that platform the test is skipped instead of failing.
            if sys.platform == 'win32':
                _, exc_value, exc_trace = sys.exc_info()
                reraise(SkipTest, exc_value, exc_trace)
            raise

    sampler = theano.function([X], samples)
    drawn = sampler(numpy.random.randn(20, 10))
    row_totals = drawn.sum(axis=1)
    assert numpy.all(row_totals == 1)
def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
    """Run ``basictest`` on binomial samples from several generators.

    The MRG generator is tested on the CPU, then (unless the mode is
    ``'FAST_COMPILE'`` or CUDA is unavailable) on the GPU, whose first draw
    must match the CPU one to 6 decimals. Finally the
    ``shared_randomstreams`` generator is tested with the same settings.
    """
    # Increase the number of steps if sizes implies only a few samples
    steps_ = steps * 100 if numpy.prod(const_size) < 10 else steps

    cpu_streams = MRG_RandomStreams(234, use_cuda=False)
    cpu_draw = cpu_streams.binomial(size=size, p=mean)
    cpu_fn = theano.function(var_input, cpu_draw, mode=mode)
    out = cpu_fn(*input)
    basictest(cpu_fn, steps_, const_size, prefix='mrg cpu',
              inputs=input, allow_01=True,
              target_avg=mean, mean_rtol=rtol)

    if mode != 'FAST_COMPILE' and cuda_available:
        gpu_streams = MRG_RandomStreams(234, use_cuda=True)
        gpu_draw = gpu_streams.binomial(size=size, p=mean, dtype='float32')
        # well, it's really that this test w GPU doesn't make sense otw
        assert gpu_draw.dtype == 'float32'

        on_device = theano.sandbox.cuda.basic_ops.gpu_from_host(gpu_draw)
        gpu_fn = theano.function(var_input,
                                 theano.Out(on_device, borrow=True),
                                 mode=mode_with_gpu)
        gpu_out = numpy.asarray(gpu_fn(*input))

        basictest(gpu_fn, steps_, const_size, prefix='mrg gpu',
                  inputs=input, allow_01=True,
                  target_avg=mean, mean_rtol=rtol)
        numpy.testing.assert_array_almost_equal(out, gpu_out,
                                                decimal=6)

    ref_streams = theano.tensor.shared_randomstreams.RandomStreams(234)
    ref_draw = ref_streams.binomial(size=size, p=mean)
    ref_fn = theano.function(var_input, ref_draw, mode=mode)
    # It's not our problem if numpy generates 0 or 1
    basictest(ref_fn, steps_, const_size, prefix='numpy', allow_01=True,
              inputs=input, target_avg=mean, mean_rtol=rtol)
def gemm_conv_op(img, kern, border_mode):
    """Apply ``GpuCorrMM`` to ``img`` with a flipped copy of ``kern``.

    The kernel is reversed along its last two axes and made contiguous
    on the GPU before being passed to the op.
    """
    flipped = kern[:, :, ::-1, ::-1]
    contiguous_kern = theano.sandbox.cuda.basic_ops.gpu_contiguous(flipped)
    corr = theano.sandbox.cuda.blas.GpuCorrMM(border_mode=border_mode)
    return corr(img, contiguous_kern)
def gemm_op(mode, subsample):
    """Return a ``GpuCorrMM`` op built from ``mode`` and ``subsample``.

    Both arguments are passed positionally, in this order.
    """
    corr_op = theano.sandbox.cuda.blas.GpuCorrMM(mode, subsample)
    return corr_op
def test_viewop_gpu():
    """Check that ``ViewOp`` applied to a GPU variable returns its input.

    Skipped when ``cuda.cuda_available`` is ``False``.
    """
    from theano.sandbox import cuda as gpu

    if gpu.cuda_available is False:
        raise SkipTest('Optional package cuda disabled')

    host_x = theano.tensor.fvector('x')
    x = gpu.gpu_from_host(host_x)
    viewed = theano.compile.ViewOp()(x)
    out = gpu.host_from_gpu(viewed)

    fn = theano.function([x], out, mode=mode_with_gpu)

    data = numpy.array([1, 2, 3], dtype='float32')
    result = fn(data)
    assert numpy.allclose(result, data)
def contains_inf(arr, node=None, var=None):
    """
    Test whether a numpy.ndarray contains any `np.inf` values.

    Parameters
    ----------
    arr : np.ndarray or output of any Theano op
    node : None or an Apply instance.
        If the output of a Theano op, the node associated to it.
    var : The Theano symbolic variable.

    Returns
    -------
    contains_inf : bool
        `True` if the array contains any `np.inf` values, `False` otherwise.

    Notes
    -----
    For plain arrays, the presence of `np.inf` is tested by checking
    whether `np.nanmax(arr)` or `np.nanmin(arr)` is infinite.
    This approach is more memory efficient than the obvious alternative,
    calling `np.any(np.isinf(ndarray))`, which requires the construction of a
    boolean array with the same shape as the input array.

    C data objects, random states, variables tagged ``is_rng``, slices and
    empty arrays are always reported as not containing `np.inf`.
    """
    # Values for which the check is skipped altogether.
    if isinstance(arr, theano.gof.type._cdata_type):
        return False
    if isinstance(arr, np.random.mtrand.RandomState):
        return False
    if var and getattr(var.tag, 'is_rng', False):
        return False
    if isinstance(arr, slice):
        return False
    if arr.size == 0:
        return False

    if cuda.cuda_available and isinstance(arr, cuda.CudaNdarray):
        # It store ints in float container
        from_mrg_uniform = (
            node and hasattr(theano.sandbox, 'rng_mrg') and
            isinstance(node.op, theano.sandbox.rng_mrg.GPU_mrg_uniform))
        if from_mrg_uniform:
            return False
        compile_gpu_func(False, True, False)
        return (np.isinf(f_gpumin(arr.reshape(arr.size))) or
                np.isinf(f_gpumax(arr.reshape(arr.size))))

    if pygpu_available and isinstance(arr, GpuArray):
        return (np.isinf(f_gpua_min(arr.reshape(arr.size))) or
                np.isinf(f_gpua_max(arr.reshape(arr.size))))

    return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr))
def traverse(out, x, x_copy, d, visited=None):
    """
    Function used by scan to parse the tree and figure out which nodes
    it needs to replace.

    There are two options :
        1) x and x_copy are on host, then you would replace x with x_copy
        2) x is on gpu, x_copy on host, then you need to replace
        host_from_gpu(x) with x_copy
    This happens because initially shared variables are on GPU... which is
    fine for the main computational graph but confuses things a bit for the
    inner graph of scan.

    The replacements are recorded in the dictionary ``d``, which is also
    the return value.
    """
    # ``visited`` is a set of nodes that are already known and don't need to
    # be checked again, speeding up the traversal of multiply-connected
    # graphs. If a ``visited`` set is given, it will be updated in-place so
    # the caller knows which nodes we have seen.
    if visited is None:
        visited = set()
    if out in visited:
        return d
    visited.add(out)

    from theano.sandbox import cuda
    from theano.gpuarray.basic_ops import gpu_from_host, host_from_gpu
    from theano.gpuarray import pygpu_activated
    from theano.gpuarray.type import GpuArrayType

    if out == x:
        # ``x`` itself was reached: map it to ``x_copy`` moved to the GPU.
        if isinstance(x.type, cuda.CudaNdarrayType):
            d[out] = cuda.gpu_from_host(x_copy)
        else:
            assert isinstance(x.type, GpuArrayType)
            d[out] = gpu_from_host(x.type.context_name)(x_copy)
        return d

    apply_node = out.owner
    if apply_node is None:
        return d

    # ``out`` is host_from_gpu(x) for either of the two GPU back-ends.
    transfers_x_to_host = (
        (cuda.cuda_available and
         apply_node.op == cuda.host_from_gpu and
         apply_node.inputs == [x]) or
        (pygpu_activated and
         apply_node.op == host_from_gpu and
         apply_node.inputs == [x]))
    if transfers_x_to_host:
        d[out] = tensor.as_tensor_variable(x_copy)
        return d

    for parent in apply_node.inputs:
        d = traverse(parent, x, x_copy, d, visited)
    return d
# Hashing a dictionary/list/tuple by xoring the hash of each element
def test_multinomial():
    """Test MRG multinomial sampling on the CPU and, if possible, the GPU.

    A smaller sample size is used for the slow modes. When the GPU part
    runs, its first draw must match the CPU one to 6 decimals.
    """
    steps = 100

    # FAST_COMPILE is replaced by FAST_RUN for compiling the functions.
    mode_name = 'FAST_RUN' if mode == 'FAST_COMPILE' else mode

    slow_mode = (mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
                 mode == 'Mode' and config.linker in ['py'])
    sample_size = (49, 5) if slow_mode else (450, 6)

    mode_ = theano.compile.mode.get_mode(mode_name)

    # Random probabilities, each row normalised to sum to one.
    pvals = numpy.asarray(numpy.random.uniform(size=sample_size))
    pvals = numpy.apply_along_axis(lambda row: row / numpy.sum(row), 1, pvals)

    cpu_streams = MRG_RandomStreams(234, use_cuda=False)
    # Note: we specify `nstreams` to avoid a warning.
    cpu_draw = cpu_streams.multinomial(pvals=pvals, dtype=config.floatX,
                                       nstreams=30 * 256)
    cpu_fn = theano.function([], cpu_draw, mode=mode_)
    out = cpu_fn()
    basic_multinomialtest(cpu_fn, steps, sample_size, pvals, n_samples=1,
                          prefix='mrg ')
    sys.stdout.flush()

    if mode != 'FAST_COMPILE' and cuda_available:
        gpu_streams = MRG_RandomStreams(234, use_cuda=True)
        pvals = numpy.asarray(pvals, dtype='float32')
        # We give the number of streams to avoid a warning.
        gpu_draw = gpu_streams.multinomial(pvals=pvals, dtype='float32',
                                           nstreams=30 * 256)
        # well, it's really that this test w GPU doesn't make sense otw
        assert gpu_draw.dtype == 'float32'

        on_device = theano.sandbox.cuda.basic_ops.gpu_from_host(gpu_draw)
        gpu_fn = theano.function([], on_device, mode=mode_.including('gpu'))
        gpu_out = gpu_fn()
        sys.stdout.flush()
        basic_multinomialtest(gpu_fn, steps, sample_size, pvals, n_samples=1,
                              prefix='gpu mrg ')
        numpy.testing.assert_array_almost_equal(out, gpu_out, decimal=6)
def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
                  direction):
    """Compare a ``GpuCorrMM`` op against the ``py_conv`` reference.

    Random float32 image and kernel arrays of shapes
    ``(bs, ch, rImg1, rImg2)`` and ``(nf, ch, rFlt1, rFlt2)`` are generated,
    the op selected by ``direction`` is compiled and run, and its result is
    compared with ``py_conv`` using ``rtol=1e-4``.

    Parameters
    ----------
    bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2 : int
        Sizes used to build the image and kernel shapes described above.
    subsx, subsy : int
        Combined into the ``subsample`` pair given to the op and to
        ``py_conv``.
    direction : str
        ``'fprop'`` (``GpuCorrMM``), ``'bprop img'``
        (``GpuCorrMM_gradInputs``) or ``'bprop kern'``
        (``GpuCorrMM_gradWeights``).

    Raises
    ------
    ValueError
        If ``direction`` is not one of the three supported values.
    """
    # Reject unknown directions up front: previously they skipped every
    # branch and failed on the final comparison with an unbound-variable
    # error that did not mention ``direction``.
    if direction not in ('fprop', 'bprop img', 'bprop kern'):
        raise ValueError(
            "direction must be 'fprop', 'bprop img' or 'bprop kern', "
            "got %r" % (direction,))

    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)
    subsample = (subsx, subsy)

    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    def gpu_var_like(arr):
        # Symbolic CUDA variable, broadcastable on every axis of length 1.
        return cuda.CudaNdarrayType(
            broadcastable=[sh == 1 for sh in arr.shape])()

    if direction == 'fprop':
        i = gpu_var_like(npy_img)
        k = gpu_var_like(npy_kern)
        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
                                                subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(npy_img, npy_kern[:, :, ::-1, ::-1])
    elif direction == 'bprop img':
        i = gpu_var_like(npy_kern.transpose(1, 0, 2, 3))
        k = gpu_var_like(npy_img)
        cpuval = py_conv(npy_img, npy_kern, 'full', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(
            border_mode='valid', subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img)
    else:  # direction == 'bprop kern'
        i = gpu_var_like(npy_img.transpose(1, 0, 2, 3))
        k = gpu_var_like(npy_kern.transpose(1, 0, 2, 3))
        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(
            border_mode='valid', subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = numpy.array(f(
            npy_img.transpose(1, 0, 2, 3),
            npy_kern.transpose(1, 0, 2, 3)[:, :, ::-1, ::-1])
        ).transpose(1, 0, 2, 3)

    assert_allclose(cpuval, gpuval, rtol=1e-4)