def check_elementwise_sum_with_shape(shape, n):
    # forward
    inputs = [mx.symbol.Variable('arg%d' % i) for i in range(n)]
    out = mx.symbol.ElementWiseSum(*inputs, name='esum')
    arr = [mx.nd.empty(shape) for i in range(n)]
    arr_grad = [mx.nd.empty(shape) for i in range(n)]
    for i in range(n):
        arr[i][:] = np.random.uniform(-10, 10, shape)
    exec1 = out.bind(mx.Context('cpu'),
                     args=arr,
                     args_grad=arr_grad)
    exec1.forward(is_train=True)
    out1 = exec1.outputs[0].asnumpy()
    expected = sum(a.asnumpy() for a in arr)
    assert reldiff(expected, out1) < 1e-6

    # backward: the gradient of a sum w.r.t. each input is the output gradient itself
    out_grad = mx.nd.empty(shape)
    out_grad[:] = np.random.uniform(-10, 10, shape)
    exec1.backward([out_grad])
    for a in arr_grad:
        assert same(a.asnumpy(), out_grad.asnumpy())

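A minimal way to exercise the check above, assuming the reldiff and same helpers from the surrounding test module are available, is to call it for a fixed shape and then for a few random shapes and input counts (the driver below is illustrative, not part of the original module):

def test_elementwise_sum():
    # Hypothetical driver for check_elementwise_sum_with_shape.
    np.random.seed(0)
    check_elementwise_sum_with_shape((5, 5, 3), 4)
    for _ in range(3):
        shape = tuple(np.random.randint(1, 8, size=3))
        check_elementwise_sum_with_shape(shape, np.random.randint(1, 8))
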
def __init__(self,
             config: model.ModelConfig,
             context: List[mx.context.Context],
             train_iter: data_io.BaseParallelSampleIter,
             bucketing: bool,
             lr_scheduler,
             gradient_compression_params: Optional[Dict[str, Any]] = None) -> None:
    super().__init__(config)
    self.context = context
    self.lr_scheduler = lr_scheduler
    self.bucketing = bucketing
    self.gradient_compression_params = gradient_compression_params
    self._build_model_components()
    self.module = self._build_module(train_iter)
    self.training_monitor = None  # type: Optional[callback.TrainingMonitor]

def determine_context(args: argparse.Namespace, exit_stack: ExitStack) -> List[mx.Context]:
    """
    Determine the context we should run on (CPU or GPU).

    :param args: Arguments as returned by argparse.
    :param exit_stack: An ExitStack from contextlib.
    :return: A list with the context(s) to run on.
    """
    if args.use_cpu:
        logger.info("Training Device: CPU")
        context = [mx.cpu()]
    else:
        num_gpus = utils.get_num_gpus()
        check_condition(num_gpus >= 1,
                        "No GPUs found, consider running on the CPU with --use-cpu "
                        "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
                        "binary isn't on the path).")
        if args.disable_device_locking:
            context = utils.expand_requested_device_ids(args.device_ids)
        else:
            context = exit_stack.enter_context(utils.acquire_gpus(args.device_ids, lock_dir=args.lock_dir))
        if args.batch_type == C.BATCH_TYPE_SENTENCE:
            check_condition(args.batch_size % len(context) == 0, "When using multiple devices the batch size must be "
                                                                 "divisible by the number of devices. Choose a batch "
                                                                 "size that is a multiple of %d." % len(context))
        logger.info("Training Device(s): GPU %s", context)
        context = [mx.gpu(gpu_id) for gpu_id in context]
    return context

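As a sketch of how this helper is typically driven, the caller parses the command line and keeps the returned contexts (and any GPU locks) alive via the same ExitStack. The create_parser and run_training names below are assumptions introduced for illustration, not part of the snippet above:

from contextlib import ExitStack

def train_entry_point(argv=None):
    # Hypothetical caller; assumes an argparse parser defining --use-cpu,
    # --device-ids, --disable-device-locking, --lock-dir, --batch-size, --batch-type.
    args = create_parser().parse_args(argv)   # create_parser() is assumed
    with ExitStack() as exit_stack:
        context = determine_context(args, exit_stack)
        # GPU locks (if any) are held until the ExitStack closes.
        run_training(args, context)           # run_training() is assumed
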
def create_training_model(model_config: model.ModelConfig,
                          args: argparse.Namespace,
                          context: List[mx.Context],
                          train_iter: data_io.BaseParallelSampleIter,
                          lr_scheduler_instance: lr_scheduler.LearningRateScheduler,
                          resume_training: bool,
                          training_state_dir: str) -> training.TrainingModel:
    """
    Create a training model and load the parameters from disk if needed.

    :param model_config: The configuration for the model.
    :param args: Arguments as returned by argparse.
    :param context: The context(s) to run on.
    :param train_iter: The training data iterator.
    :param lr_scheduler_instance: The learning rate scheduler.
    :param resume_training: When True, the model will be loaded from disk.
    :param training_state_dir: Directory where the training state is stored.
    :return: The training model.
    """
    training_model = training.TrainingModel(config=model_config,
                                            context=context,
                                            train_iter=train_iter,
                                            bucketing=not args.no_bucketing,
                                            lr_scheduler=lr_scheduler_instance,
                                            gradient_compression_params=gradient_compression_params(args))

    # We may consider loading the params in TrainingModule, for consistency
    # with the training state saving.
    if resume_training:
        logger.info("Found partial training in directory %s. Resuming from saved state.", training_state_dir)
        training_model.load_params_from_file(os.path.join(training_state_dir, C.TRAINING_STATE_PARAMS_NAME))
    elif args.params:
        logger.info("Training will initialize from parameters loaded from '%s'", args.params)
        training_model.load_params_from_file(args.params)
    return training_model

def __init__(self, num_classes, data_shape, max_iter, dtype):
    self.batch_size = data_shape[0]
    self.cur_iter = 0
    self.max_iter = max_iter
    self.dtype = dtype
    label = np.random.randint(0, num_classes, [self.batch_size,])
    data = np.random.uniform(-1, 1, data_shape)
    # Keep the synthetic batch in page-locked (pinned) host memory so that
    # copies to GPU contexts are fast.
    self.data = mx.nd.array(data, dtype=self.dtype, ctx=mx.Context('cpu_pinned', 0))
    self.label = mx.nd.array(label, dtype=self.dtype, ctx=mx.Context('cpu_pinned', 0))

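For reference, a minimal sketch of the pinned-memory pattern used above; the shape and GPU id are arbitrary choices for illustration, and it assumes a CUDA-enabled MXNet build:

# Allocate a host-pinned NDArray and copy it to a GPU context.
pinned = mx.Context('cpu_pinned', 0)
batch = mx.nd.array(np.random.uniform(-1, 1, (32, 3, 224, 224)), ctx=pinned)
if mx.context.num_gpus() > 0:          # num_gpus() is available in MXNet >= 1.2
    on_gpu = batch.copyto(mx.gpu(0))   # asynchronous copy; pinned memory speeds this up
    on_gpu.wait_to_read()
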
def test_aggregator():
    """Aggregate values pushed from multiple devices."""
    kv = init_kv()
    # devices
    num_devs = 4
    devs = [mx.Context('cpu', i) for i in range(num_devs)]

    # single key: each device pushes ones, so the aggregated value is num_devs
    vals = [mx.nd.ones(shape, d) for d in devs]
    kv.push(3, vals)
    kv.pull(3, out=vals)
    for v in vals:
        check_diff_to_scalar(v, num_devs)

    # list of keys
    vals = [[mx.nd.ones(shape, d) * 2.0 for d in devs]] * len(keys)
    kv.push(keys, vals)
    kv.pull(keys, out=vals)
    for vv in vals:
        for v in vv:
            check_diff_to_scalar(v, num_devs * 2.0)

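The fixtures referenced by these kvstore tests (shape, keys, init_kv, check_diff_to_scalar, and the updater used below) live in the surrounding test module. The following is a plausible sketch of them, not the originals; the key ids and shape are assumptions:

# Assumed module-level fixtures for the kvstore tests (illustrative sketch).
shape = (4, 4)
keys = [5, 7, 11]

def init_kv():
    """Create a local kvstore and initialize the keys used above with zeros."""
    kv = mx.kv.create('local')
    kv.init(3, mx.nd.zeros(shape))
    for k in keys:
        kv.init(k, mx.nd.zeros(shape))
    return kv

def check_diff_to_scalar(arr, x):
    """Assert that every element of arr equals the scalar x."""
    assert np.sum(np.abs(arr.asnumpy() - x)) == 0

def updater(key, recv, local):
    """Accumulate received values into the stored value (used by test_updater)."""
    local += recv
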
def test_updater(dev='cpu'):
    """Aggregate values through a custom updater (+=) over repeated pushes."""
    kv = init_kv()
    kv._set_updater(updater)
    # devices
    num_devs = 4
    devs = [mx.Context(dev, i) for i in range(num_devs)]

    # single key
    vals = [mx.nd.ones(shape, d) for d in devs]
    kv.push(3, vals)
    kv.pull(3, out=vals)
    for v in vals:
        check_diff_to_scalar(v, num_devs)

    # list of keys, pushed multiple times before pulling
    vals = [[mx.nd.ones(shape, d) for d in devs]] * len(keys)
    num_push = 4
    for i in range(num_push):
        kv.push(keys, vals)
    kv.pull(keys, out=vals)
    for vv in vals:
        for v in vv:
            check_diff_to_scalar(v, num_devs * num_push)

def test_ndarray_copy():
    c = mx.nd.array(np.random.uniform(-10, 10, (10, 10)))
    d = c.copyto(mx.Context('cpu', 0))
    assert np.sum(np.abs(c.asnumpy() != d.asnumpy())) == 0.0

def test_chain():
    n = 2
    data1 = mx.sym.Variable('data1')
    data2 = mx.sym.Variable('data2')
    with mx.AttrScope(ctx_group='dev1'):
        net = data1 + data2
        net = net * 3
    with mx.AttrScope(ctx_group='dev2'):
        net = net + data1

    with mx.Context(mx.cpu(0)):
        shape = (4, 5)
        arr = [mx.nd.empty(shape) for i in range(n)]
        arr_grad = [mx.nd.empty(shape) for i in range(n)]

        exec1 = net.bind(mx.cpu(),
                         args=arr,
                         args_grad=arr_grad,
                         group2ctx={'dev1': mx.cpu(0), 'dev2': mx.cpu(1)})
        arr[0][:] = 1.0
        arr[1][:] = 2.0
        arr2 = [a.copyto(mx.cpu()) for a in arr]
        arr_grad2 = [a.copyto(mx.cpu()) for a in arr_grad]
        exec2 = net.bind(mx.cpu(),
                         args=arr2,
                         args_grad=arr_grad2)

        # Show the execution plan that involves the cross-device copy node
        print(exec1.debug_str())
        exec1.forward(is_train=True)
        exec2.forward(is_train=True)
        assert reldiff(exec1.outputs[0].asnumpy(), exec2.outputs[0].asnumpy()) < 1e-6

        out_grad = mx.nd.empty(shape, mx.cpu(1))
        out_grad[:] = 1.0
        exec1.backward([out_grad])
        exec2.backward([out_grad.copyto(mx.cpu())])
        for a, b in zip(arr_grad, arr_grad2):
            assert reldiff(a.asnumpy(), b.asnumpy()) < 1e-6

def determine_decode_and_evaluate_context(args: argparse.Namespace,
                                          exit_stack: ExitStack,
                                          train_context: List[mx.Context]) -> Tuple[int, Optional[mx.Context]]:
    """
    Determine the number of sentences to decode and the context we should run on (CPU or GPU).

    :param args: Arguments as returned by argparse.
    :param exit_stack: An ExitStack from contextlib.
    :param train_context: Context(s) used for training.
    :return: The number of sentences to decode and the context to run on (None if decoding is disabled).
    """
    num_to_decode = args.decode_and_evaluate
    if args.optimized_metric == C.BLEU and num_to_decode == 0:
        logger.info("You chose BLEU as the optimized metric, will turn on BLEU monitoring during training. "
                    "To control how many validation sentences are used for calculating BLEU use "
                    "the --decode-and-evaluate argument.")
        num_to_decode = -1

    if num_to_decode == 0:
        return 0, None

    if args.use_cpu or args.decode_and_evaluate_use_cpu:
        context = mx.cpu()
    elif args.decode_and_evaluate_device_id is not None:
        # decode device is defined from the command line
        num_gpus = utils.get_num_gpus()
        check_condition(num_gpus >= 1,
                        "No GPUs found, consider running on the CPU with --use-cpu "
                        "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
                        "binary isn't on the path).")
        if args.disable_device_locking:
            context = utils.expand_requested_device_ids([args.decode_and_evaluate_device_id])
        else:
            context = exit_stack.enter_context(utils.acquire_gpus([args.decode_and_evaluate_device_id],
                                                                  lock_dir=args.lock_dir))
        context = mx.gpu(context[0])
    else:
        # default decode context is the last training device
        context = train_context[-1]

    logger.info("Decode and Evaluate Device(s): %s", context)
    return num_to_decode, context

def check_concat_with_shape(shapes, dimension, skip_second):
    # If skip_second is True, the second argument will not receive a gradient.
    # This tests #1130.
    n = len(shapes)
    # forward
    target_dim = 0
    for shape in shapes:
        target_dim += shape[dimension]

    inputs = [mx.symbol.Variable('arg%d' % i) for i in range(n)]
    out = mx.symbol.Concat(*inputs, name='conc', dim=dimension)
    arr = [mx.nd.empty(shape) for shape in shapes]
    for i in range(n):
        arr[i][:] = shapes[i][dimension]
    arr_np = [np.copy(narray.asnumpy()) for narray in arr]
    arr_grad = [mx.nd.empty(shape) for shape in shapes]
    dict_grad = {}
    arg_names = out.list_arguments()
    for name, g in zip(arg_names, arr_grad):
        if not skip_second or name != 'arg1':
            dict_grad[name] = g

    arg_shapes, out_shapes, aux_shapes = out.infer_shape(**dict(zip(arg_names, shapes)))
    assert out_shapes[0][dimension] == target_dim
    out_grad = mx.nd.empty(out_shapes[0])
    exec1 = out.bind(mx.Context('cpu'),
                     args=arr,
                     args_grad=dict_grad)
    exec1.forward(is_train=True)
    out1 = exec1.outputs[0]
    ret = np.concatenate([narray.asnumpy() for narray in arr], axis=dimension)
    assert same(out1.asnumpy(), ret)

    # backward: Concat's gradient slices the output gradient back onto the inputs
    out1.copyto(out_grad)
    out_grad[:] += 1
    exec1.backward([out_grad])
    for i, name in enumerate(arg_names):
        if not skip_second or name != 'arg1':
            grad = dict_grad[name]
            np_grad = arr_np[i]
            assert same(grad.asnumpy(), np_grad + 1)

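One plausible way to drive this check is to concatenate a few arrays along the middle axis, once with and once without a gradient for the second input; the shapes below are arbitrary choices for illustration:

def test_concat():
    # Hypothetical driver: all non-concat dimensions must match.
    shapes = [(2, 3, 4), (2, 2, 4), (2, 5, 4)]
    check_concat_with_shape(shapes, dimension=1, skip_second=True)
    check_concat_with_shape(shapes, dimension=1, skip_second=False)
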
def check_bind_with_uniform(uf, gf, dim, sf=None, lshape=None, rshape=None):
    """Check function consistency with uniform random numbers."""
    shape = tuple(np.random.randint(1, int(1000**(1.0/dim)), size=dim))
    lhs = mx.symbol.Variable('lhs')
    rhs = mx.symbol.Variable('rhs')
    if sf is not None:
        ret = sf(lhs, rhs)
    else:
        ret = uf(lhs, rhs)
    assert ret.list_arguments() == ['lhs', 'rhs']
    lshape = shape if lshape is None else lshape
    rshape = shape if rshape is None else rshape

    lhs_arr = mx.nd.array(np.random.uniform(-1, 1, lshape))
    rhs_arr = mx.nd.array(np.random.uniform(-1, 1, rshape))
    lhs_grad = mx.nd.empty(lshape)
    rhs_grad = mx.nd.empty(rshape)
    # Bind the same symbol three ways: with gradient buffers as a list,
    # without gradient buffers, and with both args and grads given as dicts.
    executor = ret.bind(mx.Context('cpu'),
                        args=[lhs_arr, rhs_arr],
                        args_grad=[lhs_grad, rhs_grad])
    exec3 = ret.bind(mx.Context('cpu'),
                     args=[lhs_arr, rhs_arr])
    exec4 = ret.bind(mx.Context('cpu'),
                     args={'rhs': rhs_arr, 'lhs': lhs_arr},
                     args_grad={'lhs': lhs_grad, 'rhs': rhs_grad})
    executor.forward(is_train=True)
    exec3.forward()
    exec4.forward()
    out2 = executor.outputs[0].asnumpy()
    out1 = uf(lhs_arr.asnumpy(), rhs_arr.asnumpy())
    out3 = exec3.outputs[0].asnumpy()
    out4 = exec4.outputs[0].asnumpy()
    assert reldiff(out1, out2) < 1e-6
    assert reldiff(out1, out3) < 1e-6
    assert reldiff(out1, out4) < 1e-6

    # test gradient
    out_grad = mx.nd.array(np.ones(out2.shape))
    lhs_grad2, rhs_grad2 = gf(out_grad.asnumpy(),
                              lhs_arr.asnumpy(),
                              rhs_arr.asnumpy())
    executor.backward([out_grad])
    assert reldiff(lhs_grad.asnumpy(), lhs_grad2) < 1e-6
    assert reldiff(rhs_grad.asnumpy(), rhs_grad2) < 1e-6

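A plausible invocation: elementwise add and multiply work on both mx.symbol.Variable objects and numpy arrays, so uf can double as the symbolic function and no sf is needed. The driver and its gradient lambdas below are illustrative, not part of the original module:

def test_bind_add_mul():
    np.random.seed(0)
    check_bind_with_uniform(lambda l, r: l + r,          # forward
                            lambda g, l, r: (g, g),      # gradient of l + r
                            dim=2)
    check_bind_with_uniform(lambda l, r: l * r,
                            lambda g, l, r: (g * r, g * l),
                            dim=3)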