def test_multi_process_simultaneous(n_gpu=2, worker_func_maker=unpickle_func, bar_loop=False):
    barrier = mp.Barrier(n_gpu)
    if PROFILE:
        target = sim_profiling_worker
    else:
        target = simultaneous_worker
    procs = [mp.Process(target=target,
                        args=(rank, worker_func_maker, barrier, bar_loop))
             for rank in range(1, n_gpu)]
    for p in procs:
        p.start()
    theano.gpuarray.use("cuda0")
    f_train, name = build_train_func()
    barrier.wait()
    # workers build or unpickle
    time.sleep(1)
    barrier.wait()
    # workers are ready.
    test_the_function(f_train, name=name, barrier=barrier, bar_loop=bar_loop)
    for p in procs:
        p.join()
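# A minimal, hedged sketch of driving the simultaneous test directly. It
# assumes at least two CUDA devices are visible and that the default worker
# function maker (unpickle_func) can find a previously pickled function, e.g.
# one written by test_one_process() or test_multi_process_sequence().
def run_simultaneous_example(n_gpu=2):
    test_multi_process_simultaneous(n_gpu=n_gpu,
                                    worker_func_maker=unpickle_func,
                                    bar_loop=True)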
def init_gpus(rank, n_parallel=None):
    import theano
    import theano.gpuarray
    try:
        theano.gpuarray.use("cuda" + str(rank))
    except Exception as exc:
        if n_parallel is not None:
            # master process (called with n_parallel): fail loudly
            raise RuntimeError("Master unable to use GPU.") from exc
        else:
            # worker process: flag the failure for the master, then re-raise
            sync.workers_OK.value = False
            raise RuntimeError(
                "Worker rank {} unable to use GPU.".format(rank)) from exc
    finally:
        sync.barrier_out.wait()
    if n_parallel is not None:
        if sync.workers_OK.value:
            print("Synkhronos: {} GPUs initialized, master rank: {}".format(
                n_parallel, rank))
        else:
            raise RuntimeError("Workers did not initialize GPUs.")
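# Hedged sketch of how init_gpus() is typically called. The master passes
# n_parallel (so it prints the summary and verifies the workers), while each
# worker passes only its rank. The module-level ``sync`` object (workers_OK,
# barrier_out) is assumed to be set up elsewhere before either call.
def example_master_init(n_parallel, master_rank=0):
    init_gpus(master_rank, n_parallel=n_parallel)

def example_worker_init(rank):
    init_gpus(rank)  # on failure, flags sync.workers_OK for the master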
def test_one_process(gpu=0):
    theano.gpuarray.use("cuda" + str(gpu))
    f_train, train_name = build_train_func()
    pickle_func(f_train)
    f_unpkl, unpkl_name = unpickle_func()
    test_the_function(f_train, train_name)
    test_the_function(f_unpkl, unpkl_name)
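# The tests above rely on build_train_func(), which is defined elsewhere in
# the test module. The sketch below is only a hypothetical stand-in so the
# snippets can be experimented with: a single SGD step on a tiny logistic
# regression. The shapes, learning rate and returned name are assumptions.
def build_train_func_sketch(n_in=100, lr=0.01):
    import numpy as np
    import theano
    import theano.tensor as T
    x = T.matrix("x")
    y = T.ivector("y")
    w = theano.shared(np.zeros(n_in, dtype=theano.config.floatX), name="w")
    b = theano.shared(np.asarray(0., dtype=theano.config.floatX), name="b")
    p = T.nnet.sigmoid(T.dot(x, w) + b)
    cost = T.nnet.binary_crossentropy(p, y).mean()
    gw, gb = T.grad(cost, [w, b])
    f_train = theano.function([x, y], cost,
                              updates=[(w, w - lr * gw), (b, b - lr * gb)])
    return f_train, "logreg_sgd_sketch"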
def test_multi_process_sequence(n_gpu=2, worker_func_maker=unpickle_func):
    barrier = mp.Barrier(n_gpu)
    if PROFILE:
        target = seq_profiling_worker
    else:
        target = sequence_worker
    procs = [mp.Process(target=target,
                        args=(rank, n_gpu, barrier, worker_func_maker))
             for rank in range(1, n_gpu)]
    for p in procs:
        p.start()
    theano.gpuarray.use("cuda0")
    f_train, name = build_train_func()
    pickle_func(f_train)
    barrier.wait()
    # workers make function (maybe unpickle).
    barrier.wait()
    for i in range(n_gpu):
        time.sleep(1)
        barrier.wait()
        if i == 0:
            test_the_function(f_train, name)
    for p in procs:
        p.join()
def sequence_worker(rank, n_gpu, barrier, function_maker):
    theano.gpuarray.use("cuda" + str(rank))
    # maybe master makes the function
    barrier.wait()
    f_train, name = function_maker(rank=rank)  # maybe unpickle
    barrier.wait()
    for i in range(n_gpu):
        time.sleep(1)
        barrier.wait()
        if i == rank:
            test_the_function(f_train, name=name, rank=rank)
def simultaneous_worker(rank, function_maker, barrier, bar_loop):
    theano.gpuarray.use("cuda" + str(rank))
    # maybe master makes the function
    barrier.wait()
    f_train, name = function_maker(rank)
    barrier.wait()
    test_the_function(f_train, name=name, rank=rank, barrier=barrier, bar_loop=bar_loop)
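# test_the_function() is also defined elsewhere in the test module; the
# stand-in below only sketches a plausible implementation matching the call
# signatures used above (positional name, optional rank/barrier/bar_loop).
# The data shapes, dtypes and iteration count are assumptions.
def test_the_function_sketch(f, name="f", rank=0, barrier=None, bar_loop=False,
                             iters=10):
    import time
    import numpy as np
    x = np.random.randn(128, 100).astype("float32")
    y = np.random.randint(0, 2, size=128).astype("int32")
    t_start = time.time()
    for _ in range(iters):
        if bar_loop and barrier is not None:
            barrier.wait()  # keep all ranks in lock-step each iteration
        f(x, y)
    print("rank {}: '{}' ran {} iterations in {:.3f} s".format(
        rank, name, iters, time.time() - t_start))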
def __init__(self, n_gpu, rank, master_rank):
    gpu_ctx = theano.gpuarray.get_context(None)
    clique_id = gpu_coll.GpuCommCliqueId(gpu_ctx)
    if rank == master_rank:
        # master generates the clique id and publishes it for the workers
        sync.dict["gpu_comm_id"] = clique_id.comm_id
        sync.barrier.wait()
    else:
        # workers wait for the master, then adopt its clique id
        sync.barrier.wait()
        clique_id.comm_id = sync.dict["gpu_comm_id"]
    self.comm = gpu_coll.GpuComm(clique_id, n_gpu, rank)
    self.n_gpu = n_gpu
    self.avg_fac = 1. / n_gpu
    self.master_rank = master_rank
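# Hedged sketch of using the communicator built above to combine gradients
# across GPUs. It assumes the pygpu collectives API exposes
# GpuComm.all_reduce(src, op, dest) with 'sum' as a valid reduction, and that
# src/dest are GpuArrays living on this rank's device.
def all_reduce_sum_sketch(comm_obj, src_gpuarray, dest_gpuarray):
    # sum src over all ranks into dest; scale by comm_obj.avg_fac afterwards
    # (e.g. inside a Theano update) to turn the sum into an average
    comm_obj.comm.all_reduce(src_gpuarray, op='sum', dest=dest_gpuarray)
    return dest_gpuarray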
def init_device(device='gpu0'):
    if device.startswith('cuda'):
        import os
        if 'THEANO_FLAGS' in os.environ:
            raise ValueError('Use theanorc to set the theano config')
        os.environ['THEANO_FLAGS'] = 'device={0}'.format(device)
        import theano.gpuarray
        # This is a bit of black magic that may stop working in future
        # theano releases
        ctx = theano.gpuarray.type.get_context(None)
        drv = None
    elif device.startswith('gpu'):
        gpuid = int(device[-1])
        import pycuda.driver as drv
        drv.init()
        dev = drv.Device(gpuid)
        ctx = dev.make_context()
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(device)
        import theano
    else:
        drv = None
        ctx = None
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(device)
        import theano
    from theano import function, config, shared, sandbox, tensor
    vlen = 10 * 30 * 768  # 10 x #cores x # threads per core
    iters = 1000
    rng = np.random.RandomState(22)
    arr = rng.rand(vlen)
    shared_x = theano.shared(np.asarray(arr, config.floatX))
    shared_xx = theano.shared(np.asarray(arr, config.floatX))
    x = tensor.fvector("x")
    # compile a function so that shared_x becomes part of a computation graph
    # on the GPU (CudaNdarray / GpuArray)
    f = function([], tensor.exp(x), givens=[(x, shared_x)])
    if np.any([isinstance(node.op, tensor.Elemwise) and
               ('Gpu' not in type(node.op).__name__)
               for node in f.maker.fgraph.toposort()]):
        print('Used the cpu')
    else:
        print('Used the gpu')
    # if np.any([isinstance(node.op, tensor.Elemwise)
    #            for node in f.maker.fgraph.toposort()]) and device != 'cpu':
    #     raise TypeError('graph not compiled on GPU')
    return drv, ctx, arr, shared_x, shared_xx
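# Hedged usage sketch for init_device(). Which device string works depends on
# the installed Theano version: 'cuda0' targets the newer gpuarray back end,
# 'gpu0' the old sandbox.cuda back end, and anything else (e.g. 'cpu') falls
# through to the final branch. The prints are only illustrative.
def init_device_example(device='cuda0'):
    drv, ctx, arr, shared_x, shared_xx = init_device(device)
    print("test array length:", arr.shape[0])
    print("shared variable class:", type(shared_x).__name__)
    return drv, ctx, shared_x, shared_xx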
def traverse(out, x, x_copy, d, visited=None):
    """
    Function used by scan to parse the tree and figure out which nodes
    it needs to replace.

    There are two options:
        1) ``x`` and ``x_copy`` are on the host; then replace ``x`` with
           ``x_copy``.
        2) ``x`` is on the GPU and ``x_copy`` is on the host; then replace
           ``host_from_gpu(x)`` with ``x_copy``.

    This happens because initially shared variables are on the GPU, which is
    fine for the main computational graph but confuses things a bit for the
    inner graph of scan.
    """
    # ``visited`` is a set of nodes that are already known and don't need to be
    # checked again, speeding up the traversal of multiply-connected graphs.
    # If a ``visited`` set is given, it is updated in place so the caller
    # knows which nodes have been seen.
    if visited is None:
        visited = set()
    if out in visited:
        return d
    visited.add(out)
    from theano.sandbox import cuda
    from theano.gpuarray.basic_ops import gpu_from_host, host_from_gpu
    from theano.gpuarray import pygpu_activated
    from theano.gpuarray.type import GpuArrayType
    if out == x:
        if isinstance(x.type, cuda.CudaNdarrayType):
            d[out] = cuda.gpu_from_host(x_copy)
        else:
            assert isinstance(x.type, GpuArrayType)
            d[out] = gpu_from_host(x.type.context_name)(x_copy)
        return d
    elif out.owner is None:
        return d
    elif (cuda.cuda_available and
          out.owner.op == cuda.host_from_gpu and
          out.owner.inputs == [x]):
        d[out] = tensor.as_tensor_variable(x_copy)
        return d
    elif (pygpu_activated and
          out.owner.op == host_from_gpu and
          out.owner.inputs == [x]):
        d[out] = tensor.as_tensor_variable(x_copy)
        return d
    else:
        for inp in out.owner.inputs:
            d = traverse(inp, x, x_copy, d, visited)
        return d
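# Hedged sketch of how traverse() is meant to be used: build a replacement
# dictionary mapping occurrences of a (possibly GPU-resident) variable ``s``
# inside an output graph to a host-side copy ``s_copy``, as scan does for its
# inner graph. The variable names here are illustrative only.
def build_scan_replacements_sketch(output_var, s, s_copy):
    replacements = {}
    return traverse(output_var, s, s_copy, replacements)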
# Hashing a dictionary/list/tuple by xoring the hash of each element