def __init__(self, model_source="model", cuda=False):
    self.torch = torch.cuda if cuda else torch
    self.cuda = cuda
    if self.cuda:
        model_source = torch.load(model_source)
    else:
        model_source = torch.load(model_source, map_location=lambda storage, loc: storage)
    self.src_dict = model_source["src_dict"]
    self.trains_score = model_source["trains_score"]
    self.args = args = model_source["settings"]
    model = BiLSTM_Cut(args)
    model.load_state_dict(model_source['model'])
    if self.cuda:
        model = model.cuda()
        model.prob_projection = nn.Softmax().cuda()
    else:
        model = model.cpu()
        model.prob_projection = nn.Softmax().cpu()
    self.model = model.eval()
def evaluate_performance(ladder, valid_loader, e, agg_cost_scaled, agg_supervised_cost_scaled,
                         agg_unsupervised_cost_scaled, args):
    correct = 0.
    total = 0.
    for batch_idx, (data, target) in enumerate(valid_loader):
        if args.cuda:
            data = data.cuda()
        data, target = Variable(data), Variable(target)
        output = ladder.forward_encoders_clean(data)
        # TODO: Do away with the below hack for GPU tensors.
        if args.cuda:
            output = output.cpu()
            target = target.cpu()
        output = output.data.numpy()
        preds = np.argmax(output, axis=1)
        target = target.data.numpy()
        correct += np.sum(target == preds)
        total += target.shape[0]

    print("Epoch:", e + 1, "\t",
          "Total Cost:", "{:.4f}".format(agg_cost_scaled), "\t",
          "Supervised Cost:", "{:.4f}".format(agg_supervised_cost_scaled), "\t",
          "Unsupervised Cost:", "{:.4f}".format(agg_unsupervised_cost_scaled), "\t",
          "Validation Accuracy:", correct / total)
def _generate_typedefs():
    typedefs = []
    for t in ['Double', 'Float', 'Long', 'Int', 'Short', 'Char', 'Byte']:
        for lib in ['TH', 'THCuda']:
            for kind in ['Tensor', 'Storage']:
                python_name = t + kind
                if t == 'Float' and lib == 'THCuda':
                    th_name = 'THCuda' + kind
                else:
                    th_name = lib + t + kind
                th_struct = 'struct ' + th_name
                typedefs += ['typedef {} {};'.format(th_struct, th_name)]
                module = torch if lib == 'TH' else torch.cuda
                python_class = getattr(module, python_name)
                _cffi_to_torch[th_struct] = python_class
                _torch_to_cffi[python_class] = th_struct
    return '\n'.join(typedefs) + '\n'
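
A standalone sketch of the naming rule the if/else above encodes (only the naming convention is exercised here, not the real torch bindings): the CUDA float type drops its Float infix, so the struct behind torch.cuda.FloatTensor is THCudaTensor, while every other CUDA type keeps it (THCudaDoubleTensor, THCudaLongStorage, and so on).

def th_struct_name(t, lib, kind):
    # mirrors the branch inside _generate_typedefs above
    if t == 'Float' and lib == 'THCuda':
        return 'THCuda' + kind
    return lib + t + kind

assert th_struct_name('Float', 'THCuda', 'Tensor') == 'THCudaTensor'
assert th_struct_name('Double', 'THCuda', 'Tensor') == 'THCudaDoubleTensor'
assert th_struct_name('Float', 'TH', 'Storage') == 'THFloatStorage'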
def _setup_wrapper(with_cuda):
    here = os.path.abspath(os.path.dirname(__file__))
    lib_dir = os.path.join(here, '..', '..', 'lib')
    include_dirs = [
        os.path.join(lib_dir, 'include'),
        os.path.join(lib_dir, 'include', 'TH'),
    ]
    wrapper_source = '#include <TH/TH.h>\n'
    if with_cuda:
        import torch.cuda
        wrapper_source += '#include <THC/THC.h>\n'
        cuda_include_dirs = glob.glob('/usr/local/cuda/include')
        cuda_include_dirs += glob.glob('/Developer/NVIDIA/CUDA-*/include')
        include_dirs.append(os.path.join(lib_dir, 'include', 'THC'))
        include_dirs.extend(cuda_include_dirs)
    return wrapper_source, include_dirs
def test_gpu(self):
    compile_extension(
        name='gpulib',
        header=test_dir + '/ffi/src/cuda/cudalib.h',
        sources=[
            test_dir + '/ffi/src/cuda/cudalib.c',
        ],
        with_cuda=True,
        verbose=False,
    )
    import gpulib
    tensor = torch.ones(2, 2).float()

    gpulib.good_func(tensor, 2, 1.5)
    self.assertEqual(tensor, torch.ones(2, 2) * 2 + 1.5)

    ctensor = tensor.cuda().fill_(1)
    gpulib.cuda_func(ctensor, 2, 1.5)
    self.assertEqual(ctensor, torch.ones(2, 2) * 2 + 1.5)
    self.assertRaises(TypeError,
                      lambda: gpulib.cuda_func(tensor, 2, 1.5))
    self.assertRaises(TypeError,
                      lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5))
def test_serialization(self):
    x = torch.randn(5, 5).cuda()
    y = torch.IntTensor(2, 5).fill_(0).cuda()
    q = [x, y, x, y.storage()]
    with tempfile.NamedTemporaryFile() as f:
        torch.save(q, f)
        f.seek(0)
        q_copy = torch.load(f)
    self.assertEqual(q_copy, q, 0)
    q_copy[0].fill_(5)
    self.assertEqual(q_copy[0], q_copy[2], 0)
    self.assertTrue(isinstance(q_copy[0], torch.cuda.DoubleTensor))
    self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor))
    self.assertTrue(isinstance(q_copy[2], torch.cuda.DoubleTensor))
    self.assertTrue(isinstance(q_copy[3], torch.cuda.IntStorage))
    q_copy[1].fill_(10)
    self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10))
def _test_gather(self, dim):
    if torch.cuda.device_count() < 2:
        raise unittest.SkipTest("only one GPU detected")
    x = torch.randn(2, 5).cuda(0)
    y = torch.randn(2, 5).cuda(1)
    result = comm.gather((x, y), dim)

    expected_size = list(x.size())
    expected_size[dim] += y.size(dim)
    expected_size = torch.Size(expected_size)
    self.assertEqual(result.get_device(), 0)
    self.assertEqual(result.size(), expected_size)

    index = [slice(None, None), slice(None, None)]
    index[dim] = slice(0, x.size(dim))
    self.assertEqual(result[tuple(index)], x)
    index[dim] = slice(x.size(dim), x.size(dim) + y.size(dim))
    self.assertEqual(result[tuple(index)], y)
def test_cuda(self, test_case):
    if not TEST_CUDA or not self.should_test_cuda:
        raise unittest.SkipTest('Excluded from CUDA tests')
    try:
        cpu_input = self._get_input()
        type_map = {
            torch.DoubleTensor: torch.cuda.FloatTensor,
        }
        gpu_input = to_gpu(cpu_input, type_map=type_map)

        cpu_target = self.target
        gpu_target = to_gpu(self.target, type_map=type_map)

        cpu_module = self.constructor(*self.constructor_args)
        gpu_module = self.constructor(*self.constructor_args).float().cuda()

        cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target)
        gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target)
        test_case.assertEqual(cpu_output, gpu_output, 2e-4)

        cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target)
        gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target)
        test_case.assertEqual(cpu_gradInput, gpu_gradInput, 2e-4)
    except NotImplementedError:
        pass
def test_print(self):
    for t in torch._tensor_classes:
        if t.is_cuda and not torch.cuda.is_available():
            continue
        obj = t(100, 100).fill_(1)
        obj.__repr__()
        str(obj)
    for t in torch._storage_classes:
        if t.is_cuda and not torch.cuda.is_available():
            continue
        obj = t(100).fill_(1)
        obj.__repr__()
        str(obj)
    x = torch.Tensor([4, float('inf'), 1.5, float('-inf'), 0, float('nan'), 1])
    x.__repr__()
    str(x)
def test_reduce_scatter(self):
    in_size = 32 * nGPUs
    out_size = 32
    inputs = [torch.FloatTensor(in_size).uniform_() for i in range(nGPUs)]
    expected = torch.FloatTensor(in_size).zero_()
    for t in inputs:
        expected.add_(t)
    expected = expected.view(nGPUs, 32)
    inputs = [inputs[i].cuda(i) for i in range(nGPUs)]
    outputs = [torch.cuda.FloatTensor(out_size, device=i)
               for i in range(nGPUs)]
    nccl.reduce_scatter(inputs, outputs)
    for i in range(nGPUs):
        self.assertEqual(outputs[i], expected[i])
def trainepoch(self, X, y, nepoches=1):
    self.model.train()
    for _ in range(self.nepoch, self.nepoch + nepoches):
        permutation = np.random.permutation(len(X))
        all_costs = []
        for i in range(0, len(X), self.batch_size):
            # forward
            idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().cuda()
            Xbatch = Variable(X.index_select(0, idx))
            ybatch = Variable(y.index_select(0, idx))
            output = self.model(Xbatch)
            # loss
            loss = self.loss_fn(output, ybatch)
            all_costs.append(loss.data[0])
            # backward
            self.optimizer.zero_grad()
            loss.backward()
            # Update parameters
            self.optimizer.step()
    self.nepoch += nepoches
def __init__(self, model_source, cuda=False, beam_size=3):
    self.torch = torch.cuda if cuda else torch
    self.cuda = cuda
    self.beam_size = beam_size
    if self.cuda:
        model_source = torch.load(model_source)
    else:
        model_source = torch.load(model_source, map_location=lambda storage, loc: storage)
    self.src_dict = model_source["src_dict"]
    self.tgt_dict = model_source["tgt_dict"]
    self.src_idx2word = {v: k for k, v in model_source["tgt_dict"].items()}
    self.args = args = model_source["settings"]
    model = Transformer(args)
    model.load_state_dict(model_source['model'])
    if self.cuda:
        model = model.cuda()
    else:
        model = model.cpu()
    self.model = model.eval()
def sent2tenosr(self, sentence):
    max_len = self.args.max_word_len - 2
    sentence = normalizeString(sentence)
    words = [w for w in sentence.strip().split()]
    if len(words) > max_len:
        words = words[:max_len]
    words = [WORD[BOS]] + words + [WORD[EOS]]
    idx = [self.src_dict[w] if w in self.src_dict else UNK for w in words]

    idx_data = torch.LongTensor(idx)
    idx_position = torch.LongTensor([pos_i + 1 if w_i != PAD else 0 for pos_i, w_i in enumerate(idx)])
    idx_data_tensor = Variable(idx_data.unsqueeze(0), volatile=True)
    idx_position_tensor = Variable(idx_position.unsqueeze(0), volatile=True)

    if self.cuda:
        idx_data_tensor = idx_data_tensor.cuda()
        idx_position_tensor = idx_position_tensor.cuda()
    return idx_data_tensor, idx_position_tensor
def __init__(self, model_source, cuda=False, beam_size=3):
    self.torch = torch.cuda if cuda else torch
    self.cuda = cuda
    self.jb = Jieba("./segmenter_dicts", useSynonym=True, HMM=False)
    self.swf = StopwordFilter("./segmenter_dicts/stopwords.txt")
    model_source = torch.load(model_source)
    self.src_dict = model_source["src_dict"]
    self.tgt_dict = model_source["tgt_dict"]
    self.src_idx2ind = {v: k for k, v in model_source["tgt_dict"].items()}
    self.args = args = model_source["settings"]
    model = CNN_Ranking(args)
    model.load_state_dict(model_source['model'])
    if self.cuda:
        model = model.cuda()
    else:
        model = model.cpu()
    self.model = model.eval()
def __init__(self, model_source, cuda=False):
    self.torch = torch.cuda if cuda else torch
    self.cuda = cuda
    if self.cuda:
        model_source = torch.load(model_source)
    else:
        model_source = torch.load(model_source, map_location=lambda storage, loc: storage)
    self.src_dict = model_source["src_dict"]
    self.trains_score = model_source["trains_score"]
    self.args = args = model_source["settings"]
    model = BiLSTM_CRF_Size(args)
    model.load_state_dict(model_source['model'])
    if self.cuda:
        model = model.cuda()
        model.prob_projection = nn.Softmax().cuda()
    else:
        model = model.cpu()
        model.prob_projection = nn.Softmax().cpu()
    self.model = model.eval()
def __init__(self, size, cuda=False):
    self.size = size
    self.done = False
    self.tt = torch.cuda if cuda else torch

    # The score for each translation on the beam.
    self.scores = self.tt.FloatTensor(size).zero_()
    self.all_scores = []

    # The backpointers at each time-step.
    self.prev_ks = []

    # The outputs at each time-step.
    self.next_ys = [self.tt.LongTensor(size).fill_(Constants.PAD)]
    self.next_ys[0][0] = Constants.BOS
def __init__(self, args, model, criterion, device_ids=None,
             multiprocessing_method='spawn'):
    if device_ids is None:
        device_ids = tuple(range(torch.cuda.device_count()))
    super().__init__(device_ids, multiprocessing_method)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    model = model.share_memory()
    nccl_uid = nccl.get_unique_id()
    self.criterion = criterion

    Future.gen_list([
        self.call_async(rank, '_async_init', args=args, model=model,
                        criterion=criterion, nccl_uid=nccl_uid)
        for rank in range(self.num_replicas)
    ])

    self._grads_initialized = False
def _async_init(self, rank, device_id, args, model, criterion, nccl_uid):
    """Initialize child processes."""
    self.args = args

    # set CUDA device
    torch.cuda.set_device(device_id)

    # initialize NCCL
    nccl.initialize(self.num_replicas, nccl_uid, device_id)

    # copy model and criterion to current device
    self.model = model.cuda()
    self.criterion = criterion.cuda()

    # initialize optimizer and LR scheduler
    self.args.lr = list(map(float, self.args.lr.split(',')))
    self.optimizer = self._build_optimizer()
    self.lr_scheduler = self._build_lr_scheduler()

    self.loss = None
    self._max_bsz_seen = 0
def _async_forward(self, rank, device_id, eval=False):
    if eval:
        self.model.eval()
    else:
        self.model.train()
        self.optimizer.zero_grad()

    sample_size, logging_output, oom = 0, {}, False
    if self._sample is not None:
        try:
            # calculate loss and sample size
            self.loss, sample_size, logging_output = self.criterion(self.model, self._sample)
        except RuntimeError as e:
            if not eval and 'out of memory' in str(e):
                print('| WARNING: ran out of memory on GPU #{}, skipping batch'.format(device_id))
                oom = True
                self.loss = None
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                raise e

    return sample_size, logging_output, oom
def __init__(self, size, cuda=False):
    self.size = size
    self.done = False
    self.tt = torch.cuda if cuda else torch

    # The score for each translation on the beam.
    self.scores = self.tt.FloatTensor(size).zero_()
    self.allScores = []

    # The backpointers at each time-step.
    self.prevKs = []

    # The outputs at each time-step.
    self.nextYs = [self.tt.LongTensor(size).fill_(onmt.Constants.PAD)]
    self.nextYs[0][0] = onmt.Constants.BOS

    # The attentions (matrix) for each time.
    self.attn = []
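
The prevKs/nextYs lists above are back-pointer and token histories. A standalone sketch (not part of the snippet above; plain lists stand in for the LongTensors) of how an OpenNMT-style beam is typically walked backwards to recover hypothesis k:

def get_hyp(prevKs, nextYs, k):
    # follow back-pointers from the last step down to step 1; step 0 holds BOS
    hyp = []
    for j in range(len(prevKs) - 1, -1, -1):
        hyp.append(nextYs[j + 1][k])
        k = prevKs[j][k]
    return hyp[::-1]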
def buildData(self, srcBatch, goldBatch):
    # This needs to be the same as preprocess.py.
    if self._type == "text":
        srcData = [self.src_dict.convertToIdx(b,
                                              onmt.Constants.UNK_WORD)
                   for b in srcBatch]
    elif self._type == "img":
        srcData = [transforms.ToTensor()(
            Image.open(self.opt.src_img_dir + "/" + b[0]))
            for b in srcBatch]

    tgtData = None
    if goldBatch:
        tgtData = [self.tgt_dict.convertToIdx(b,
                                              onmt.Constants.UNK_WORD,
                                              onmt.Constants.BOS_WORD,
                                              onmt.Constants.EOS_WORD) for b in goldBatch]

    return onmt.Dataset(srcData, tgtData, self.opt.batch_size,
                        self.opt.cuda, volatile=True,
                        data_type=self._type)
def __init__(self, X, Y, hidden_layer_sizes):
    super(Net, self).__init__()

    # Initialize linear layer with least squares solution
    X_ = np.hstack([X, np.ones((X.shape[0], 1))])
    Theta = np.linalg.solve(X_.T.dot(X_), X_.T.dot(Y))

    self.lin = nn.Linear(X.shape[1], Y.shape[1])
    W, b = self.lin.parameters()
    W.data = torch.Tensor(Theta[:-1, :].T)
    b.data = torch.Tensor(Theta[-1, :])

    # Set up non-linear network of
    # Linear -> BatchNorm -> ReLU -> Dropout layers
    layer_sizes = [X.shape[1]] + hidden_layer_sizes
    layers = reduce(operator.add,
                    [[nn.Linear(a, b), nn.BatchNorm1d(b), nn.ReLU(), nn.Dropout(p=0.2)]
                     for a, b in zip(layer_sizes[0:-1], layer_sizes[1:])])
    layers += [nn.Linear(layer_sizes[-1], Y.shape[1])]
    self.net = nn.Sequential(*layers)
    self.sig = Parameter(torch.ones(1, Y.shape[1]).cuda())
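
A quick standalone check of the least-squares initialization used above, with made-up shapes: Theta solves the normal equations for the bias-augmented design matrix, and its slices give the weight and bias the linear layer is seeded with.

import numpy as np

X = np.random.randn(100, 3)
Y = np.random.randn(100, 2)
X_ = np.hstack([X, np.ones((X.shape[0], 1))])
Theta = np.linalg.solve(X_.T.dot(X_), X_.T.dot(Y))   # shape (4, 2)
W, b = Theta[:-1, :].T, Theta[-1, :]                 # shapes (2, 3) and (2,)
print(np.linalg.norm(X_.dot(Theta) - Y))             # least-squares residual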
def __init__(self, params, eps=1e-2):
    super(SolveNewsvendor, self).__init__()
    k = len(params['d'])

    self.Q = Variable(torch.diag(torch.Tensor(
        [params['c_quad']] + [params['b_quad']] * k + [params['h_quad']] * k))
        .cuda())
    self.p = Variable(torch.Tensor(
        [params['c_lin']] + [params['b_lin']] * k + [params['h_lin']] * k)
        .cuda())
    self.G = Variable(torch.cat([
        torch.cat([-torch.ones(k, 1), -torch.eye(k), torch.zeros(k, k)], 1),
        torch.cat([torch.ones(k, 1), torch.zeros(k, k), -torch.eye(k)], 1),
        -torch.eye(1 + 2 * k)], 0).cuda())
    self.h = Variable(torch.Tensor(
        np.concatenate([-params['d'], params['d'], np.zeros(1 + 2 * k)])).cuda())
    self.one = Variable(torch.Tensor([1])).cuda()
    self.eps_eye = eps * Variable(torch.eye(1 + 2 * k).cuda()).unsqueeze(0)
def forward(self, y):
    nBatch, k = y.size()

    Q_scale = torch.cat([torch.diag(torch.cat(
        [self.one, y[i], y[i]])).unsqueeze(0) for i in range(nBatch)], 0)
    Q = self.Q.unsqueeze(0).expand_as(Q_scale).mul(Q_scale)
    p_scale = torch.cat([Variable(torch.ones(nBatch, 1).cuda()), y, y], 1)
    p = self.p.unsqueeze(0).expand_as(p_scale).mul(p_scale)
    G = self.G.unsqueeze(0).expand(nBatch, self.G.size(0), self.G.size(1))
    h = self.h.unsqueeze(0).expand(nBatch, self.h.size(0))
    e = Variable(torch.Tensor().cuda()).double()

    out = QPFunction(verbose=False)(
        Q.double(), p.double(), G.double(), h.double(), e, e).float()
    return out[:, :1]
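
For reference, a standalone CPU-only sketch of the batching pattern used in forward above (shapes invented, no QPFunction call): per-example diagonal scale matrices are stacked into a batch and multiplied element-wise into the shared Q. QPFunction, from the qpth library, then solves one inequality-constrained QP per batch row, and out[:, :1] keeps only the first decision variable, the order quantity.

import torch
from torch.autograd import Variable

nBatch, k = 4, 3
one = Variable(torch.Tensor([1]))
y = Variable(torch.rand(nBatch, k))
Q_base = Variable(torch.eye(1 + 2 * k))
# one (1+2k)-dim diagonal scale per example, stacked along a new batch axis
Q_scale = torch.cat([torch.diag(torch.cat([one, y[i], y[i]])).unsqueeze(0)
                     for i in range(nBatch)], 0)
Q = Q_base.unsqueeze(0).expand_as(Q_scale).mul(Q_scale)   # (nBatch, 1 + 2k, 1 + 2k)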
def __init__(self, size, cuda=False):
    self.size = size
    self.done = False
    self.tt = torch.cuda if cuda else torch

    # The score for each translation on the beam.
    self.scores = self.tt.FloatTensor(size).zero_()

    # The backpointers at each time-step.
    self.prevKs = []

    # The outputs at each time-step.
    self.nextYs = [self.tt.LongTensor(size).fill_(onmt.Constants.PAD)]
    self.nextYs[0][0] = onmt.Constants.BOS

    # The attentions (matrix) for each time.
    self.attn = []

# Get the outputs for the current timestep.
def test_cuda_small_tensors(self):
    # Check multiple small tensors which will likely use the same
    # underlying cached allocation
    ctx = mp.get_context('spawn')
    tensors = []
    for i in range(5):
        tensors += [torch.arange(i * 5, (i + 1) * 5).cuda()]

    inq = ctx.Queue()
    outq = ctx.Queue()
    inq.put(tensors)
    p = ctx.Process(target=sum_tensors, args=(inq, outq))
    p.start()

    results = []
    for i in range(5):
        results.append(outq.get())
    p.join()

    for i, tensor in enumerate(tensors):
        v, device, tensor_size, storage_size = results[i]
        self.assertEqual(v, torch.arange(i * 5, (i + 1) * 5).sum())
        self.assertEqual(device, 0)
        self.assertEqual(tensor_size, 5)
        self.assertEqual(storage_size, 5)
def test_event(self):
    ctx = mp.get_context('spawn')
    queue = ctx.Queue()
    ready = ctx.Event()
    done = ctx.Event()
    p = ctx.Process(target=cuda_multiply_two, args=(queue, ready, done))
    p.start()

    ready.wait()
    with torch.cuda.stream(torch.cuda.Stream()):
        tensor = torch.cuda.FloatTensor([1, 1, 1, 1])
        # Use a sleep kernel to test events. Without the event, the
        # multiply happens before the add.
        event = torch.cuda.Event(interprocess=True)
        torch.cuda._sleep(20000000)  # about 30 ms
        tensor.add_(1)
        event.record()
        queue.put((event, tensor))

    done.wait()  # must wait until subprocess records event
    event.synchronize()
    self.assertEqual(list(tensor), [4, 4, 4, 4])
    p.join()