def test_copy_device(self):
    x = torch.randn(5, 5).cuda()
    with torch.cuda.device(1):
        y = x.cuda()
        self.assertEqual(y.get_device(), 1)
        self.assertIs(y.cuda(), y)
        z = y.cuda(0)
        self.assertEqual(z.get_device(), 0)
        self.assertIs(z.cuda(0), z)

    x = torch.randn(5, 5)
    with torch.cuda.device(1):
        y = x.cuda()
        self.assertEqual(y.get_device(), 1)
        self.assertIs(y.cuda(), y)
        z = y.cuda(0)
        self.assertEqual(z.get_device(), 0)
        self.assertIs(z.cuda(0), z)
def test_broadcast_coalesced(self):
    numel = 5
    num_bytes = numel * 8
    tensors = [
        torch.randn(numel).long().cuda(),
        torch.randn(numel).cuda(),
        torch.randn(numel).long().cuda(),
        torch.randn(numel).long().cuda(),
        torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
        torch.randn(numel).cuda(),
    ]

    b_tensors = [comm.broadcast(t, (0, 1)) for t in tensors]
    for (_, bt), t in zip(b_tensors, tensors):
        self.assertEqual(bt.get_device(), 1)
        self.assertEqual(bt, t)
        self.assertIsInstance(bt, type(t))

    bc_tensors = comm.broadcast_coalesced(tensors, (0, 1), buffer_size=num_bytes * 5 // 2)
    bc_tensors_t = list(zip(*bc_tensors))
    self.assertEqual(b_tensors, bc_tensors_t)
    for (_, bt), (_, bct) in zip(b_tensors, bc_tensors_t):
        self.assertEqual(bt.get_device(), bct.get_device())
        self.assertIsInstance(bct, type(bt))
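broadcast_coalesced batches many small broadcasts into a few large ones. A rough sketch of the idea (illustrative only: it leans on internal torch._utils helpers, ignores the buffer_size bucketing, and unlike the real implementation does not restore the original tensor order across type groups):

import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from torch.cuda import comm

def broadcast_coalesced_sketch(tensors, devices):
    # Group by tensor type: flattening requires a uniform element type.
    groups = {}
    for t in tensors:
        groups.setdefault(t.type(), []).append(t)
    results = {d: [] for d in devices}
    for group in groups.values():
        flat = _flatten_dense_tensors(group)  # one contiguous buffer
        for device, flat_copy in zip(devices, comm.broadcast(flat, devices)):
            # Carve the broadcast buffer back into per-tensor views.
            results[device].extend(_unflatten_dense_tensors(flat_copy, group))
    return [results[d] for d in devices]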
def test_streams(self):
    default_stream = torch.cuda.current_stream()
    user_stream = torch.cuda.Stream()
    self.assertEqual(torch.cuda.current_stream(), default_stream)
    self.assertNotEqual(default_stream, user_stream)
    self.assertEqual(default_stream.cuda_stream, 0)
    self.assertNotEqual(user_stream.cuda_stream, 0)
    with torch.cuda.stream(user_stream):
        self.assertEqual(torch.cuda.current_stream(), user_stream)
    self.assertTrue(user_stream.query())
    # Copy a 10 MB tensor from CPU to GPU, which should take long enough
    # that the default stream is still busy when queried.
    tensor1 = torch.ByteTensor(10000000).pin_memory()
    tensor2 = tensor1.cuda(non_blocking=True)  # `async=` was renamed; async is a reserved word since Python 3.7
    self.assertFalse(default_stream.query())
    default_stream.synchronize()
    self.assertTrue(default_stream.query())
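The pin_memory() call above matters: asynchronous host-to-device copies only overlap with other work when the source lives in page-locked host memory. The basic pattern, using the standard PyTorch API:

host = torch.ByteTensor(10000000).pin_memory()  # page-locked host buffer
dev = host.cuda(non_blocking=True)              # copy enqueued on the current stream
torch.cuda.current_stream().synchronize()       # block until the copy has finished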
def test_caching_pinned_memory(self):
    cycles_per_ms = get_cycles_per_ms()

    # check that allocations are re-used after deletion
    t = torch.FloatTensor([1]).pin_memory()
    ptr = t.data_ptr()
    del t
    t = torch.FloatTensor([1]).pin_memory()
    self.assertEqual(t.data_ptr(), ptr, 'allocation not reused')

    # check that the allocation is not re-used if it's in use by a copy
    gpu_tensor = torch.cuda.FloatTensor([0])
    torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
    gpu_tensor.copy_(t, non_blocking=True)
    del t
    t = torch.FloatTensor([1]).pin_memory()
    self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')
    self.assertEqual(list(gpu_tensor), [1])
def test_caching_pinned_memory_multi_gpu(self):
    # checks that the events preventing pinned memory from being re-used
    # too early are recorded on the correct GPU
    cycles_per_ms = get_cycles_per_ms()

    t = torch.FloatTensor([1]).pin_memory()
    ptr = t.data_ptr()
    gpu_tensor0 = torch.cuda.FloatTensor([0], device=0)
    gpu_tensor1 = torch.cuda.FloatTensor([0], device=1)

    with torch.cuda.device(1):
        torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
        gpu_tensor1.copy_(t, non_blocking=True)

    del t
    t = torch.FloatTensor([2]).pin_memory()
    self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')

    with torch.cuda.device(0):
        gpu_tensor0.copy_(t, non_blocking=True)

    self.assertEqual(gpu_tensor1[0], 1)
    self.assertEqual(gpu_tensor0[0], 2)
def test_serialization_map_location(self):
    DATA_URL = 'https://download.pytorch.org/test_data/gpu_tensors.pt'
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    test_file_path = os.path.join(data_dir, 'gpu_tensors.pt')
    succ = download_file(DATA_URL, test_file_path)
    if not succ:
        warnings.warn(
            "Couldn't download the test file for map_location! "
            "Tests will be incomplete!", RuntimeWarning)
        return

    def map_location(storage, loc):
        return storage

    tensor = torch.load(test_file_path, map_location=map_location)
    self.assertEqual(type(tensor), torch.FloatTensor)
    self.assertEqual(tensor, torch.FloatTensor([[1.0, 2.0], [3.0, 4.0]]))

    tensor = torch.load(test_file_path, map_location={'cuda:0': 'cpu'})
    self.assertEqual(type(tensor), torch.FloatTensor)
    self.assertEqual(tensor, torch.FloatTensor([[1.0, 2.0], [3.0, 4.0]]))
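Both forms keep GPU-saved tensors on the CPU: the callable receives each storage plus a location tag such as 'cuda:0' and returns the storage unchanged, while the dict remaps device strings. Newer PyTorch releases also accept a plain device string as an equivalent shorthand (assuming a recent version):

tensor = torch.load(test_file_path, map_location='cpu')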
def test_print(self):
    for t in torch._tensor_classes:
        if t in torch.sparse._sparse_tensor_classes:
            continue
        if t.is_cuda and not torch.cuda.is_available():
            continue
        obj = t(100, 100).fill_(1)
        obj.__repr__()
        str(obj)
    for t in torch._storage_classes:
        if t.is_cuda and not torch.cuda.is_available():
            continue
        obj = t(100).fill_(1)
        obj.__repr__()
        str(obj)
    x = torch.Tensor([4, float('inf'), 1.5, float('-inf'), 0, float('nan'), 1])
    x.__repr__()
    str(x)
def test_reduce_scatter(self):
    in_size = 32 * nGPUs
    out_size = 32

    inputs = [torch.FloatTensor(in_size).uniform_() for i in range(nGPUs)]
    expected = torch.FloatTensor(in_size).zero_()
    for t in inputs:
        expected.add_(t)
    expected = expected.view(nGPUs, 32)

    inputs = [inputs[i].cuda(i) for i in range(nGPUs)]
    outputs = [torch.cuda.FloatTensor(out_size, device=i)
               for i in range(nGPUs)]
    nccl.reduce_scatter(inputs, outputs)

    for i in range(nGPUs):
        self.assertEqual(outputs[i], expected[i])
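In other words, reduce_scatter sums the inputs elementwise and leaves the i-th 32-element slice of that sum on GPU i. For example, with nGPUs == 2:

# GPU 0 holds input a (64 elements), GPU 1 holds input b (64 elements).
# After nccl.reduce_scatter(inputs, outputs):
#   outputs[0] == (a + b)[0:32]    # on GPU 0
#   outputs[1] == (a + b)[32:64]   # on GPU 1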
def __init__(self, size, vocab, cuda=False):
    """Initialize params."""
    self.size = size
    self.done = False
    self.pad = vocab['<pad>']
    self.bos = vocab['<s>']
    self.eos = vocab['</s>']
    self.tt = torch.cuda if cuda else torch

    # The score for each translation on the beam.
    self.scores = self.tt.FloatTensor(size).zero_()

    # The backpointers at each time-step.
    self.prevKs = []

    # The outputs at each time-step.
    self.nextYs = [self.tt.LongTensor(size).fill_(self.pad)]
    self.nextYs[0][0] = self.bos

    # The attentions (matrix) for each time.
    self.attn = []

# Get the outputs for the current timestep.
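The snippet breaks off at that last comment; in OpenNMT-style beam search it introduces a small accessor, which typically reads as follows (a sketch; the method name is assumed):

def get_current_state(self):
    # Return the most recent set of beam outputs (token ids).
    return self.nextYs[-1]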
def _generate_typedefs():
    typedefs = []
    for t in ['Double', 'Float', 'Long', 'Int', 'Short', 'Char', 'Byte']:
        for lib in ['TH', 'THCuda']:
            for kind in ['Tensor', 'Storage']:
                python_name = t + kind
                if t == 'Float' and lib == 'THCuda':
                    th_name = 'THCuda' + kind
                else:
                    th_name = lib + t + kind
                th_struct = 'struct ' + th_name
                typedefs += ['typedef {} {};'.format(th_struct, th_name)]
                module = torch if lib == 'TH' else torch.cuda
                python_class = getattr(module, python_name)
                _cffi_to_torch[th_struct] = python_class
                _torch_to_cffi[python_class] = th_struct
    return '\n'.join(typedefs) + '\n'
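For example, the generated preamble contains lines like these (note the special case: the CUDA float types drop the "Float" infix, so THCudaTensor is the float tensor):

typedef struct THFloatTensor THFloatTensor;
typedef struct THCudaTensor THCudaTensor;
typedef struct THCudaLongTensor THCudaLongTensor;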
def _setup_wrapper(with_cuda):
    here = os.path.abspath(os.path.dirname(__file__))
    lib_dir = os.path.join(here, '..', '..', 'lib')
    include_dirs = [
        os.path.join(lib_dir, 'include'),
        os.path.join(lib_dir, 'include', 'TH'),
    ]

    wrapper_source = '#include <TH/TH.h>\n'
    if with_cuda:
        import torch.cuda
        wrapper_source += '#include <THC/THC.h>\n'
        cuda_include_dirs = glob.glob('/usr/local/cuda/include')
        cuda_include_dirs += glob.glob('/Developer/NVIDIA/CUDA-*/include')
        include_dirs.append(os.path.join(lib_dir, 'include', 'THC'))
        include_dirs.extend(cuda_include_dirs)
    return wrapper_source, include_dirs
def test_cuda_small_tensors(self):
    # Check multiple small tensors which will likely use the same
    # underlying cached allocation
    ctx = mp.get_context('spawn')
    tensors = []
    for i in range(5):
        tensors += [torch.arange(i * 5, (i + 1) * 5).cuda()]

    inq = ctx.Queue()
    outq = ctx.Queue()
    inq.put(tensors)
    p = ctx.Process(target=sum_tensors, args=(inq, outq))
    p.start()

    results = []
    for i in range(5):
        results.append(outq.get())
    p.join()

    for i, tensor in enumerate(tensors):
        v, device, tensor_size, storage_size = results[i]
        self.assertEqual(v, torch.arange(i * 5, (i + 1) * 5).sum())
        self.assertEqual(device, 0)
        self.assertEqual(tensor_size, 5)
        self.assertEqual(storage_size, 5)
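The sum_tensors worker is not part of this snippet; a minimal sketch consistent with the assertions above (the tuple layout is inferred from the test) would be:

def sum_tensors(inq, outq):
    # Receive the shared CUDA tensors and report their stats back.
    tensors = inq.get()
    for tensor in tensors:
        outq.put((tensor.sum(), tensor.get_device(),
                  tensor.numel(), tensor.storage().size()))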
def test_event(self):
    ctx = mp.get_context('spawn')
    queue = ctx.Queue()
    ready = ctx.Event()
    done = ctx.Event()
    p = ctx.Process(target=cuda_multiply_two, args=(queue, ready, done))
    p.start()

    ready.wait()
    with torch.cuda.stream(torch.cuda.Stream()):
        tensor = torch.cuda.FloatTensor([1, 1, 1, 1])
        # Use a sleep kernel to test events. Without the event, the
        # multiply happens before the add.
        event = torch.cuda.Event(interprocess=True)
        torch.cuda._sleep(20000000)  # about 30 ms
        tensor.add_(1)
        event.record()
        queue.put((event, tensor))
        done.wait()  # must wait until subprocess records event
        event.synchronize()
        self.assertEqual(list(tensor), [4, 4, 4, 4])
    p.join()
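Likewise, the cuda_multiply_two worker is not shown; a sketch matching the protocol the test expects (receive the event and tensor, wait on the event so the add_(1) lands first, multiply, then re-record for the parent to synchronize on) might look like:

def cuda_multiply_two(queue, ready, done):
    ready.set()
    with torch.cuda.stream(torch.cuda.Stream()):
        event, tensor = queue.get()
        event.wait()    # wait for the parent's add_(1)
        tensor.mul_(2)
        event.record()  # let the parent synchronize on our multiply
        done.set()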
def test_gpu(self):
    compile_extension(
        name='gpulib',
        header=test_dir + '/ffi/src/cuda/cudalib.h',
        sources=[
            test_dir + '/ffi/src/cuda/cudalib.c',
        ],
        with_cuda=True,
        verbose=False,
    )
    import gpulib
    tensor = torch.ones(2, 2).float()

    gpulib.good_func(tensor, 2, 1.5)
    self.assertEqual(tensor, torch.ones(2, 2) * 2 + 1.5)

    ctensor = tensor.cuda().fill_(1)
    gpulib.cuda_func(ctensor, 2, 1.5)
    self.assertEqual(ctensor, torch.ones(2, 2) * 2 + 1.5)

    self.assertRaises(TypeError,
                      lambda: gpulib.cuda_func(tensor, 2, 1.5))
    self.assertRaises(TypeError,
                      lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5))
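cudalib.h itself is not included here; given the calls above and the THFloatTensor/THCudaTensor typedefs generated earlier, its declarations plausibly read as follows (an assumption for illustration, not the actual file):

void good_func(THFloatTensor *tensor, int a, float b);
void cuda_func(THCudaTensor *tensor, int a, float b);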