def setup_workers(self):
    # work only once
    if self._initialized:
        return
    self._initialized = True
    self.model.cleargrads()
    for i in six.moves.range(1, len(self.gpus)):
        pipe, worker_end = multiprocessing.Pipe()
        worker = _Worker(i, worker_end, self.model, self.gpus, self.da,
                         int(float(self.batch) / len(self.gpus) / self.train_batch_divide), self)
        worker.start()
        self._workers.append(worker)
        self._pipes.append(pipe)
    with cuda.Device(self.gpus[0]):
        self.model.to_gpu(self.gpus[0])
        if len(self.gpus) > 1:
            communication_id = nccl.get_unique_id()
            self._send_message(("set comm_id", communication_id))
            self.communication = nccl.NcclCommunicator(len(self.gpus),
                                                       communication_id,
                                                       0)
Python class Device(): example source code
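The snippets collected below all revolve around cuda.Device. As a minimal sketch of the shared pattern (my own example, assuming Chainer with CUDA/CuPy installed; the array and device id are illustrative): cuda.Device selects the current GPU, either as a context manager or via use().

import numpy as np
from chainer import cuda

def copy_to_device(device_id, array):
    # Select the GPU for the duration of the block; allocations inside it
    # (including the copy done by to_gpu) land on that device.
    with cuda.Device(device_id):
        return cuda.to_gpu(array)

x_gpu = copy_to_device(0, np.arange(6, dtype=np.float32))
print(x_gpu.device)  # e.g. <CUDA Device 0>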
Source file: nutszebra_ilsvrc_object_localization_with_multi_gpus.py (project: trainer, author: nutszebra)
def setup_workers(self):
    # work only once
    if self._initialized:
        return
    self._initialized = True
    self.model.zerograds()
    for i in six.moves.range(1, len(self.gpus)):
        pipe, worker_end = multiprocessing.Pipe()
        worker = _Worker(i, worker_end, self.model, self.gpus, self.da,
                         int(self.batch / len(self.gpus) / self.train_batch_divide), self)
        worker.start()
        self._workers.append(worker)
        self._pipes.append(pipe)
    with cuda.Device(self.gpus[0]):
        self.model.to_gpu(self.gpus[0])
        if len(self.gpus) > 1:
            communication_id = nccl.get_unique_id()
            self._send_message(("set comm_id", communication_id))
            self.communication = nccl.NcclCommunicator(len(self.gpus),
                                                       communication_id,
                                                       0)
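The two setup_workers variants above differ only in how gradients are reset. As a side note on the Chainer API (my gloss, not part of the listed projects): zerograds() allocates and zero-fills gradient arrays and was deprecated in favor of cleargrads(), which simply drops them.

import chainer.links as L

model = L.Linear(3, 2)
model.cleargrads()   # newer API: gradients become None and are reallocated lazily
model.zerograds()    # older API: gradients are allocated and zero-filled (deprecated)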
def _inv_gpu(b):
    # We do a batched LU decomposition on the GPU to compute the inverse.
    # Change the shape of the array to a size-1 minibatch if necessary.
    # Also copy the matrix, as the elements will be modified in-place.
    a = matmul._as_batch_mat(b).copy()
    n = a.shape[1]
    n_matrices = len(a)
    # Pivot array
    p = cuda.cupy.empty((n, n_matrices), dtype=numpy.int32)
    # Output array
    c = cuda.cupy.empty_like(a)
    # These arrays hold information on the execution success
    # or on whether the matrix was singular
    info = cuda.cupy.empty(n_matrices, dtype=numpy.int32)
    ap = matmul._mat_ptrs(a)
    cp = matmul._mat_ptrs(c)
    _, lda = matmul._get_ld(a)
    _, ldc = matmul._get_ld(c)
    handle = cuda.Device().cublas_handle
    cuda.cublas.sgetrfBatched(
        handle, n, ap.data.ptr, lda, p.data.ptr, info.data.ptr, n_matrices)
    cuda.cublas.sgetriBatched(
        handle, n, ap.data.ptr, lda, p.data.ptr, cp.data.ptr, ldc,
        info.data.ptr, n_matrices)
    return c, info
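A usage sketch for the helper above (my assumption about the calling context, not from the listed project): because cuda.Device().cublas_handle refers to whichever device is current, callers select the device explicitly before invoking it. Here xs stands for a float32 CuPy array of shape (batch, n, n) that already lives on gpu_id.

from chainer import cuda

def batched_inverse_on(gpu_id, xs):
    # Run the batched inversion with gpu_id as the current device, so the
    # cuBLAS handle grabbed inside _inv_gpu belongs to that device.
    with cuda.Device(gpu_id):
        inv, info = _inv_gpu(xs)
    if int(info.sum()) != 0:
        raise ValueError('at least one matrix in the batch was singular')
    return inv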
def forward_gpu(self, inputs):
    x = inputs[0]
    W = inputs[1]
    # Prepare BLAS call
    handle = cuda.Device().cublas_handle
    k, m = W.shape
    n, l = x.shape[0] * x.shape[1], x.shape[2]
    lda = max(1, x.shape[-1])
    ldb = max(1, W.strides[0] // W.dtype.itemsize)
    ldc = max(1, m)
    Wx = cupy.empty((x.shape[0], x.shape[1], W.shape[1]),
                    dtype=numpy.float32)
    sgemm(handle, False, False, m, n, k, 1, W.data.ptr, ldb,
          x.data.ptr, lda, 0, Wx.data.ptr, ldc)
    if len(inputs) > 2:
        b = inputs[2]
        Wx += b
    return Wx,
def forward(self, inputs):
    xp = cuda.get_array_module(*inputs)
    x0, x1 = inputs
    self.diff = self.inside_weights * (x0 - x1)
    abs_diff = xp.abs(self.diff)
    flag = abs_diff < 1.0 / self.sigma2
    y = (flag * 0.5 * xp.square(self.diff) * self.sigma2 +
         (~flag) * (abs_diff - 0.5 / self.sigma2))
    if xp == cuda.cupy:
        with cuda.Device(cuda.get_device(y)):
            num = xp.prod(xp.asarray(y.shape))
    else:
        num = xp.prod(y.shape)
    return xp.array(y.sum() / num).astype(numpy.float32),
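A small, hedged illustration of the dispatch pattern used in that forward pass (the helper name is mine): cuda.get_array_module picks numpy or cupy from its arguments, and cuda.get_device(arr) used as a context manager scopes the work to the device holding the array.

from chainer import cuda

def mean_on_own_device(y):
    # Works for both numpy and cupy arrays; for cupy, the reduction runs on
    # the device that owns y.
    xp = cuda.get_array_module(y)
    if xp is cuda.cupy:
        with cuda.get_device(y):
            return float(y.mean())
    return float(y.mean())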
def bbox_transform_inv(boxes, deltas, gpu=-1):
    if gpu >= 0:
        with cuda.Device(gpu):
            return _bbox_transform_inv(boxes, deltas)
    else:
        return _bbox_transform_inv(boxes, deltas)

def clip_boxes(boxes, im_shape, gpu=-1):
    if gpu >= 0:
        with cuda.Device(gpu):
            return _clip_boxes(boxes, im_shape)
    else:
        return _clip_boxes(boxes, im_shape)
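An illustrative call for the two wrappers above, under the usual Faster R-CNN conventions (my assumption: boxes are (N, 4) in (x1, y1, x2, y2) order, deltas are (N, 4), and im_shape is (height, width)); gpu=-1 keeps everything on the CPU path, while gpu >= 0 only selects which device context wraps the computation.

import numpy as np

rois = np.array([[10., 10., 50., 60.]], dtype=np.float32)
deltas = np.array([[0.1, -0.2, 0.05, 0.0]], dtype=np.float32)
pred = bbox_transform_inv(rois, deltas, gpu=-1)   # decode deltas into boxes
pred = clip_boxes(pred, (480, 640), gpu=-1)       # clamp boxes to the image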
def _batch_matmul_gpu(a, b, out, transa=False, transb=False, transout=False):
    a = _as_batch_mat(cuda.cupy.ascontiguousarray(a))
    b = _as_batch_mat(cuda.cupy.ascontiguousarray(b))
    trans_axis = (0, 2, 1)
    if transout:
        out = out.transpose(trans_axis)
    needtrans, _ = _get_ld(out)
    if needtrans == 1:
        # (A B)^T = B^T A^T
        a, b = b, a
        transa, transb = not transb, not transa
        out = out.transpose(trans_axis)
    if transa:
        a = a.transpose(trans_axis)
    if transb:
        b = b.transpose(trans_axis)
    transa, lda = _get_ld(a)
    transb, ldb = _get_ld(b)
    transout, ldout = _get_ld(out)
    la, n, ka = a.shape
    lb, kb, m = b.shape
    assert ka == kb
    assert transout == 0 or ldout == 1
    assert out.shape == (la, n, m)
    ap = _mat_ptrs(a)
    bp = _mat_ptrs(b)
    outp = _mat_ptrs(out)
    cuda.cublas.sgemmBatched(
        cuda.Device().cublas_handle,
        transa,
        transb,
        n, m, ka, 1.0,
        ap.data.ptr, lda,
        bp.data.ptr, ldb,
        0.0, outp.data.ptr, ldout, la)
def _det_gpu(b):
    # We do a batched LU decomposition on the GPU and compute the
    # determinant by multiplying the diagonal.
    # Change the shape of the array to a size-1 minibatch if necessary.
    # Also copy the matrix, as the elements will be modified in-place.
    a = matmul._as_batch_mat(b).copy()
    n = a.shape[1]
    n_matrices = len(a)
    # Pivot array
    p = cuda.cupy.zeros((n_matrices, n), dtype='int32')
    # These arrays hold information on the execution success
    # or on whether the matrix was singular.
    info = cuda.cupy.zeros(n_matrices, dtype=numpy.intp)
    ap = matmul._mat_ptrs(a)
    _, lda = matmul._get_ld(a)
    cuda.cublas.sgetrfBatched(cuda.Device().cublas_handle, n, ap.data.ptr, lda,
                              p.data.ptr, info.data.ptr, n_matrices)
    det = cuda.cupy.prod(a.diagonal(axis1=1, axis2=2), axis=1)
    # The determinant equals the product of the diagonal entries of the LU
    # factors, with the sign flipped according to how often the pivot array
    # differs from its own index (the parity of the permutation).
    rng = cuda.cupy.arange(1, n + 1, dtype='int32')
    parity = cuda.cupy.sum(p != rng, axis=1) % 2
    sign = 1. - 2. * parity.astype('float32')
    return det * sign, info
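A quick, hedged sanity check for the routine above (my own snippet; shapes and tolerance are arbitrary): the batched GPU determinant should agree with numpy.linalg.det.

import numpy
from chainer import cuda

batch = numpy.random.randn(4, 3, 3).astype(numpy.float32)
det, info = _det_gpu(cuda.to_gpu(batch))
assert numpy.allclose(cuda.to_cpu(det), numpy.linalg.det(batch), atol=1e-3)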
Source file: test_optimizers_by_linear_model.py (project: chainer-deconv, author: germanRos)
def test_linear_model_multi_gpu(self):
    with cuda.Device(0):
        self.assertGreater(
            cuda.to_cpu(self.model.accuracy_gpu(1).data), 0.9)
Source file: test_optimizers_by_linear_model.py (project: chainer-deconv, author: germanRos)
def test_model_setup_multi_gpu(self):
    with cuda.Device(0):
        model = self.model.model
        optimizer = self.model.optimizer
        model.to_gpu(1)
        optimizer.setup(model)
    for name, param in optimizer.target.namedparams():
        for v in six.itervalues(optimizer._states[name]):
            self.assertEqual(int(param.data.device), int(v.device))

def check_accumulate_grads_from_gpu(self, src_id):
    with cuda.Device(src_id):
        self.optimizer.accumulate_grads([cuda.cupy.arange(3)])
    grad = self.target.param.grad
    self.assertTrue((cuda.to_cpu(grad) == np.arange(3) * 2).all())

def test_accumulate_grads_gpu_to_cpu(self):
    self.setup_cpu()
    self.check_accumulate_grads_from_gpu(cuda.Device().id)

def test_accumulate_grads_gpu_to_gpu(self):
    device_id = cuda.Device().id
    self.setup_gpu(device_id)
    self.check_accumulate_grads_from_gpu(device_id)

def test_copy_parameters_from_cpu_to_gpu(self):
    self.check_copy_parameters_from(-1, cuda.Device().id)

def test_copy_parameters_from_gpu_to_cpu(self):
    self.check_copy_parameters_from(cuda.Device().id, -1)

def test_forward_gpu(self):
    device_id = cuda.Device().id
    self.check_forward(device_id, device_id)

def test_check_backward_gpu(self):
    device_id = cuda.Device().id
    self.check_forward(device_id, device_id)

def test_forward_cpu_to_gpu(self):
    device_id = cuda.Device().id
    self.check_forward(-1, device_id)

def test_backward_cpu_to_gpu(self):
    device_id = cuda.Device().id
    self.check_backward(-1, device_id)

def test_forward_gpu_to_cpu(self):
    device_id = cuda.Device().id
    self.check_forward(device_id, -1)
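A note on the cuda.Device().id idiom that recurs in these tests (general CuPy behaviour, stated here as my gloss): calling Device with no argument wraps whichever device is current, so .id reflects the surrounding context.

from chainer import cuda

with cuda.Device(0):
    assert cuda.Device().id == 0   # the "current" device inside the block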
def backward_gpu(self, inputs, gy):
    x = inputs[0]
    W = inputs[1]
    # Backprop weight
    gW = cuda.cupy.empty_like(W)
    handle = cuda.Device().cublas_handle
    k, n = gy[0].shape[0] * gy[0].shape[1], W.shape[0]
    m = W.shape[1]
    lda = max(1, x.shape[-1])
    ldb = max(1, gy[0].shape[-1])
    ldc = max(1, m)
    sgemm(handle, False, True, m, n, k, 1, gy[0].data.ptr, ldb,
          x.data.ptr, lda, 1, gW.data.ptr, ldc)
    # Backprop input
    m, k = W.shape
    n, l = x.shape[0] * x.shape[1], gy[0].shape[2]
    lda = max(1, gy[0].shape[-1])
    ldb = max(1, W.shape[1])
    ldc = max(1, m)
    gx = cuda.cupy.empty_like(x)
    sgemm(handle, True, False, m, n, k, 1, W.data.ptr, ldb,
          gy[0].data.ptr, lda, 0, gx.data.ptr, ldc)
    # Backprop bias
    if len(inputs) > 2:
        gy_2d = _as_mat(gy[0])
        gb = gy_2d.sum(0)
        return gx, gW, gb
    else:
        return gx, gW
Source file: nutszebra_ilsvrc_object_localization_with_multi_gpus.py (project: trainer, author: nutszebra)
def run(self):
    dev = cuda.Device(self.device)
    dev.use()
    # build communication via nccl
    self.setup()
    gp = None
    p = multiprocessing.Pool(self.parallel_train)
    args_da = [self.da() for _ in six.moves.range(self.batch)]
    while True:
        job, data = self.pipe.recv()
        if job == 'finalize':
            dev.synchronize()
            break
        if job == 'update':
            # for reducing memory
            self.model.cleargrads()
            indices = list(self.sampling.yield_random_batch_from_category(1, self.picture_number_at_each_categories, self.batch, shuffle=True))[0]
            x = self.train_x[indices]
            t = self.train_y[indices]
            args = list(zip(x, t, args_da))
            processed = p.starmap(process_train, args)
            tmp_x, tmp_t = list(zip(*processed))
            train = True
            x = self.model.prepare_input(tmp_x, dtype=np.float32, volatile=not train, gpu=self.device)
            t = self.model.prepare_input(tmp_t, dtype=np.int32, volatile=not train, gpu=self.device)
            y = self.model(x, train=train)
            loss = self.model.calc_loss(y, t) / self.number_of_devices / self.train_batch_divide
            loss.backward()
            del x
            del t
            del y
            del loss
            # send gradients of self.model
            gg = gather_grads(self.model)
            null_stream = cuda.Stream.null
            self.communication.reduce(gg.data.ptr,
                                      gg.data.ptr,
                                      gg.size,
                                      nccl.NCCL_FLOAT,
                                      nccl.NCCL_SUM,
                                      0,
                                      null_stream.ptr)
            del gg
            self.model.cleargrads()
            # send parameters of self.model
            gp = gather_params(self.model)
            self.communication.bcast(gp.data.ptr,
                                     gp.size,
                                     nccl.NCCL_FLOAT,
                                     0,
                                     null_stream.ptr)
            scatter_params(self.model, gp)
            gp = None
Source file: nutszebra_ilsvrc_object_localization_with_multi_gpus.py (project: trainer, author: nutszebra)
def update_core(self, x, t, p, args_da):
    self._send_message(('update', None))
    with cuda.Device(self.gpus[0]):
        self.model.cleargrads()
        args = list(zip(x, t, args_da))
        processed = p.starmap(process_train, args)
        tmp_x, tmp_t = list(zip(*processed))
        data_length = len(tmp_x)
        train = True
        x = self.model.prepare_input(tmp_x, dtype=np.float32, volatile=not train, gpu=self.gpus[0])
        t = self.model.prepare_input(tmp_t, dtype=np.int32, volatile=not train, gpu=self.gpus[0])
        y = self.model(x, train=train)
        loss = self.model.calc_loss(y, t) / len(self.gpus)
        loss.backward()
        loss.to_cpu()
        loss = float(loss.data) * data_length
        del x
        del t
        del y
        # NCCL: reduce grads
        null_stream = cuda.Stream.null
        if self.communication is not None:
            # send grads
            gg = gather_grads(self.model)
            self.communication.reduce(gg.data.ptr,
                                      gg.data.ptr,
                                      gg.size,
                                      nccl.NCCL_FLOAT,
                                      nccl.NCCL_SUM,
                                      0,
                                      null_stream.ptr)
            # copy grads, gg, to self.model
            scatter_grads(self.model, gg)
            del gg
        self.optimizer.update()
        if self.communication is not None:
            gp = gather_params(self.model)
            self.communication.bcast(gp.data.ptr,
                                     gp.size,
                                     nccl.NCCL_FLOAT,
                                     0,
                                     null_stream.ptr)
    return loss
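For readers puzzling over gather_grads, scatter_grads, gather_params and scatter_params in the snippets above and below: they are project helpers whose definitions are not shown here. As a hedged sketch of the contract they appear to follow (modelled on Chainer's multiprocess_parallel_updater helpers; the real implementations may differ), gradients are packed into one flat GPU array so a single NCCL call can reduce them, then unpacked afterwards.

from chainer import cuda

def gather_grads_sketch(link):
    # Flatten every parameter gradient into one contiguous CuPy array.
    grads = [p.grad.ravel() for _, p in sorted(link.namedparams())]
    return cuda.cupy.concatenate(grads)

def scatter_grads_sketch(link, flat):
    # Inverse of the above: copy slices of the flat array back into each grad.
    offset = 0
    for _, p in sorted(link.namedparams()):
        size = p.grad.size
        p.grad[...] = flat[offset:offset + size].reshape(p.grad.shape)
        offset += size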
def run(self):
    dev = cuda.Device(self.device)
    dev.use()
    # build communication via nccl
    self.setup()
    gp = None
    da_args = [self.da() for _ in six.moves.range(self.batch)]
    p = multiprocessing.Pool(self.parallel)
    batch_of_batch = int(float(self.batch) / self.train_batch_divide)
    while True:
        job, data = self.pipe.recv()
        if job == 'finalize':
            dev.synchronize()
            break
        if job == 'update':
            # for reducing memory
            self.model.zerograds()
            indices = list(self.sampling.yield_random_batch_samples(1, self.batch, len(self.train_x), sort=False))[0]
            for ii in six.moves.range(0, len(indices), batch_of_batch):
                x = self.train_x[indices[ii:ii + batch_of_batch]]
                t = self.train_y[indices[ii:ii + batch_of_batch]]
                args = list(six.moves.zip(x, t, da_args))
                processed = p.starmap(process_train, args)
                tmp_x, tmp_t = list(zip(*processed))
                train = True
                x = self.model.prepare_input(tmp_x, dtype=np.float32, volatile=not train, gpu=self.device)
                t = self.model.prepare_input(tmp_t, dtype=np.int32, volatile=not train, gpu=self.device)
                y = self.model(x, train=train)
                loss = self.model.calc_loss(y, t) / self.number_of_devices / self.train_batch_divide
                loss.backward()
                del x
                del t
                del y
                del loss
            # send gradients of self.model
            gg = gather_grads(self.model)
            null_stream = cuda.Stream.null
            self.communication.reduce(gg.data.ptr,
                                      gg.data.ptr,
                                      gg.size,
                                      nccl.NCCL_FLOAT,
                                      nccl.NCCL_SUM,
                                      0,
                                      null_stream.ptr)
            del gg
            self.model.zerograds()
            # send parameters of self.model
            gp = gather_params(self.model)
            self.communication.bcast(gp.data.ptr,
                                     gp.size,
                                     nccl.NCCL_FLOAT,
                                     0,
                                     null_stream.ptr)
            scatter_params(self.model, gp)
            gp = None