# All snippets below assume `import mxnet as mx`; other names such as `cfg`,
# `C`, `pkl`, `reldiff`, `MAX_LEN` or the LSTM helpers are module-level
# definitions in the projects these examples were taken from.
import mxnet as mx


def bn(data, name, eps=1.001e-5, fix_gamma=False, use_global_stats=None):
    if use_global_stats is None:
        use_global_stats = cfg.get('bn_use_global_stats', False)
    if fix_gamma:
        with mx.AttrScope(lr_mult='0.', wd_mult='0.'):
            gamma = mx.sym.Variable('{}_gamma'.format(name))
            beta = mx.sym.Variable('{}_beta'.format(name))
        return mx.sym.BatchNorm(data=data, gamma=gamma, beta=beta, name=name,
                                eps=eps,
                                fix_gamma=True,
                                use_global_stats=use_global_stats)
    else:
        lr_type = cfg.get('lr_type', 'torch')
        with _attr_scope_lr(lr_type, 'weight'):
            gamma = mx.sym.Variable('{}_gamma'.format(name))
        with _attr_scope_lr(lr_type, 'bias'):
            beta = mx.sym.Variable('{}_beta'.format(name))
        return mx.sym.BatchNorm(data=data, gamma=gamma, beta=beta, name=name,
                                eps=eps,
                                fix_gamma=False,
                                use_global_stats=use_global_stats)
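
# A hypothetical usage sketch for bn() above. It assumes `cfg` is a plain dict
# of configuration flags, as suggested by the cfg.get() calls; the variable and
# layer names here are illustrative, not from the source project.
import mxnet as mx

cfg = {'bn_use_global_stats': True, 'lr_type': 'torch'}
data = mx.sym.Variable('data')
net = bn(data, name='bn_data', fix_gamma=True)  # gamma/beta frozen via lr_mult/wd_mult = '0.'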

def encode(self,
           data: mx.sym.Symbol,
           data_length: Optional[mx.sym.Symbol],
           seq_len: int) -> Tuple[mx.sym.Symbol, mx.sym.Symbol, int]:
    """
    Encodes data given sequence lengths of individual examples and maximum sequence length.

    :param data: Input data.
    :param data_length: Vector with sequence lengths.
    :param seq_len: Maximum sequence length.
    :return: Encoded versions of input data (data, data_length, seq_len).
    """
    with mx.AttrScope(__layout__=C.TIME_MAJOR):
        return mx.sym.swapaxes(data=data, dim1=0, dim2=1), data_length, seq_len
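
# The AttrScope above just attaches a `__layout__` attribute to every symbol
# created inside it. A minimal, self-contained sketch of the same mechanism,
# assuming C.TIME_MAJOR is the layout string 'TNC' as in the surrounding project:
import mxnet as mx

x = mx.sym.Variable('x')
with mx.AttrScope(__layout__='TNC'):
    y = mx.sym.swapaxes(data=x, dim1=0, dim2=1)
print(y.attr('__layout__'))  # -> 'TNC'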

def test_ctx_group():
    with mx.AttrScope(ctx_group='stage1'):
        data = mx.symbol.Variable('data')
        fc1 = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=128)
        act1 = mx.symbol.Activation(data=fc1, name='relu1', act_type="relu")
    set_stage1 = set(act1.list_arguments())

    with mx.AttrScope(ctx_group='stage2'):
        fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=64)
        act2 = mx.symbol.Activation(data=fc2, name='relu2', act_type="relu")
        fc3 = mx.symbol.FullyConnected(data=act2, name='fc3', num_hidden=10)
        fc3 = mx.symbol.BatchNorm(fc3)
        mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
    set_stage2 = set(mlp.list_arguments()) - set_stage1

    group2ctx = {
        'stage1': mx.cpu(1),
        'stage2': mx.cpu(2)
    }
    texec = mlp.simple_bind(mx.cpu(0),
                            group2ctx=group2ctx,
                            data=(1, 200))

    for arr, name in zip(texec.arg_arrays, mlp.list_arguments()):
        if name in set_stage1:
            assert arr.context == group2ctx['stage1']
        else:
            assert arr.context == group2ctx['stage2']

def test_chain():
    n = 2
    data1 = mx.sym.Variable('data1')
    data2 = mx.sym.Variable('data2')
    with mx.AttrScope(ctx_group='dev1'):
        net = data1 + data2
        net = net * 3
    with mx.AttrScope(ctx_group='dev2'):
        net = net + data1

    with mx.Context(mx.cpu(0)):
        shape = (4, 5)
        arr = [mx.nd.empty(shape) for i in range(n)]
        arr_grad = [mx.nd.empty(shape) for i in range(n)]

    exec1 = net.bind(mx.cpu(),
                     args=arr,
                     args_grad=arr_grad,
                     group2ctx={'dev1': mx.cpu(0), 'dev2': mx.cpu(1)})
    arr[0][:] = 1.0
    arr[1][:] = 2.0
    arr2 = [a.copyto(mx.cpu()) for a in arr]
    arr_grad2 = [a.copyto(mx.cpu()) for a in arr_grad]
    exec2 = net.bind(mx.cpu(),
                     args=arr2,
                     args_grad=arr_grad2)

    # Show the execution plan that involves copynode
    print(exec1.debug_str())
    exec1.forward()
    exec2.forward()
    assert reldiff(exec1.outputs[0].asnumpy(), exec2.outputs[0].asnumpy()) < 1e-6

    out_grad = mx.nd.empty(shape, mx.cpu(1))
    out_grad[:] = 1.0
    exec1.backward([out_grad])
    exec2.backward([out_grad.copyto(mx.cpu())])
    for a, b in zip(arr_grad, arr_grad2):
        assert reldiff(a.asnumpy(), b.asnumpy()) < 1e-6

def test_attr_basic():
    with mx.AttrScope(group='4', data='great'):
        data = mx.symbol.Variable('data',
                                  attr={'dtype': 'data',
                                        'group': '1'})
        gdata = mx.symbol.Variable('data2')
    assert gdata.attr('group') == '4'
    assert data.attr('group') == '1'

    data2 = pkl.loads(pkl.dumps(data))
    assert data.attr('dtype') == data2.attr('dtype')

def test_operator():
    data = mx.symbol.Variable('data')
    with mx.AttrScope(group='4', data='great'):
        fc1 = mx.symbol.Activation(data, act_type='relu')
        with mx.AttrScope(init_bias='0.0'):
            fc2 = mx.symbol.FullyConnected(fc1, num_hidden=10, name='fc2')
    assert fc1.attr('data') == 'great'
    assert fc2.attr('data') == 'great'
    assert fc2.attr('init_bias') == '0.0'

    fc2copy = pkl.loads(pkl.dumps(fc2))
    assert fc2copy.tojson() == fc2.tojson()
    fc2weight = fc2.get_internals()['fc2_weight']

def _attr_scope_lr(lr_type, lr_owner):
    assert lr_type in ('alex', 'alex10', 'fixed', 'torch')
    # weight (lr_mult, wd_mult); bias (lr_mult, wd_mult)
    # alex: 1, 1; 2, 0
    if lr_type == 'alex':
        if lr_owner == 'weight':
            return mx.AttrScope()
        elif lr_owner == 'bias':
            return mx.AttrScope(lr_mult='2.', wd_mult='0.')
        else:
            assert False
    # alex10: 10, 1; 20, 0
    if lr_type == 'alex10':
        if lr_owner == 'weight':
            return mx.AttrScope(lr_mult='10.', wd_mult='1.')
        elif lr_owner == 'bias':
            return mx.AttrScope(lr_mult='20.', wd_mult='0.')
        else:
            assert False
    # fixed: 0, 0; 0, 0 (applied to both weight and bias)
    if lr_type == 'fixed':
        assert lr_owner in ('weight', 'bias')
        return mx.AttrScope(lr_mult='0.', wd_mult='0.')
    # torch: 1, 1; 1, 1 (the defaults, so do nothing)
    return mx.AttrScope()
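
# A brief sketch of how _attr_scope_lr() pairs with variable creation (assumes
# only `import mxnet as mx`; the variable names are illustrative). With the
# 'alex' schedule, weights keep the default multipliers while biases get
# lr_mult='2.' and wd_mult='0.':
import mxnet as mx

with _attr_scope_lr('alex', 'weight'):
    weight = mx.sym.Variable('conv1_weight')
with _attr_scope_lr('alex', 'bias'):
    bias = mx.sym.Variable('conv1_bias')
print(weight.attr('lr_mult'), bias.attr('lr_mult'), bias.attr('wd_mult'))  # None '2.' '0.'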

def get_dssm():
    doc_pos = mx.sym.Variable('doc_pos')
    doc_neg = mx.sym.Variable('doc_neg')
    data_usr = mx.sym.Variable("data_usr", stype='csr')
    # with mx.AttrScope(ctx_group="cpu"):
    w_usr = mx.sym.Variable('usr_weight', stype='row_sparse', shape=(USR_NUM, OUT_DIM))
    # shared weights
    w1 = mx.sym.Variable('fc1_doc_weight')
    w2 = mx.sym.Variable('fc2_doc_weight')
    w3 = mx.sym.Variable('fc3_doc_weight')
    b1 = mx.sym.Variable('fc1_doc_bias')
    b2 = mx.sym.Variable('fc2_doc_bias')
    b3 = mx.sym.Variable('fc3_doc_bias')

    def cosine(usr, doc):
        dot = usr * doc
        dot = mx.sym.sum_axis(dot, axis=1)
        return dot

    def doc_mlp(data):
        fc1 = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, name='fc1', weight=w1, bias=b1)
        fc1 = mx.sym.Activation(data=fc1, act_type='relu')
        fc2 = mx.sym.FullyConnected(data=fc1, num_hidden=num_hidden, name='fc2', weight=w2, bias=b2)
        fc2 = mx.sym.Activation(data=fc2, act_type='relu')
        fc3 = mx.sym.FullyConnected(data=fc2, num_hidden=OUT_DIM, name='fc3', weight=w3, bias=b3)
        fc3 = mx.sym.Activation(data=fc3, act_type='relu')
        fc3 = mx.sym.L2Normalization(data=fc3)
        return fc3

    # usr net
    # with mx.AttrScope(ctx_group="cpu"):
    usr1 = mx.sym.dot(data_usr, w_usr)
    usr = mx.sym.L2Normalization(data=usr1)
    # doc net
    mlp_pos = doc_mlp(doc_pos)
    mlp_neg = doc_mlp(doc_neg)
    cosine_pos = cosine(usr, mlp_pos)
    cosine_neg = cosine(usr, mlp_neg)
    exp = mx.sym.exp(data=(cosine_neg - cosine_pos))
    pred = mx.sym.log1p(data=exp)
    out = mx.sym.MAERegressionOutput(data=pred, name='mae')
    return out
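
# A small sanity check of the pairwise ranking loss used above,
# log(1 + exp(cos_neg - cos_pos)), on plain NDArrays. Nothing here depends on
# the DSSM symbol; the cosine values are made up for illustration.
import mxnet as mx

cos_pos = mx.nd.array([0.9, 0.2])
cos_neg = mx.nd.array([0.1, 0.8])
loss = mx.nd.log1p(mx.nd.exp(cos_neg - cos_pos))
print(loss.asnumpy())  # the pair where cos_pos exceeds cos_neg gets the smaller loss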

def lstm_unroll(num_lstm_layer,
                num_hidden, dropout=0.,
                concat_decode=True, use_loss=False):
    """unrolled lstm network"""
    # LSTMParam, LSTMState, lstm, MAX_LEN and n_classes are module-level
    # definitions in the original project.
    with mx.AttrScope(ctx_group='decode'):
        cls_weight = mx.sym.Variable("cls_weight")
        cls_bias = mx.sym.Variable("cls_bias")

    param_cells = []
    last_states = []
    for i in range(num_lstm_layer):
        with mx.AttrScope(ctx_group='layer%d' % i):
            param_cells.append(LSTMParam(i2h_weight=mx.sym.Variable("l%d_i2h_weight" % i),
                                         i2h_bias=mx.sym.Variable("l%d_i2h_bias" % i),
                                         h2h_weight=mx.sym.Variable("l%d_h2h_weight" % i),
                                         h2h_bias=mx.sym.Variable("l%d_h2h_bias" % i)))
            state = LSTMState(c=mx.sym.Variable("l%d_init_c" % i),
                              h=mx.sym.Variable("l%d_init_h" % i))
        last_states.append(state)

    # stack LSTM
    hidden = mx.sym.SliceChannel(data=mx.sym.Variable("data"), num_outputs=MAX_LEN, squeeze_axis=0)
    for i in range(num_lstm_layer):
        next_hidden = []
        for t in range(MAX_LEN):
            with mx.AttrScope(ctx_group='layer%d' % i):
                next_state = lstm(num_hidden, indata=hidden[t],
                                  prev_state=last_states[i],
                                  param=param_cells[i],
                                  layeridx=i, dropout=0.)
            next_hidden.append(next_state.h)
            last_states[i] = next_state
        hidden = next_hidden[:]

    sm = []
    labels = mx.sym.SliceChannel(data=mx.sym.Variable("labels"), num_outputs=MAX_LEN, squeeze_axis=0)
    for t in range(MAX_LEN):
        fc = mx.sym.FullyConnected(data=hidden[t],
                                   weight=cls_weight,
                                   bias=cls_bias,
                                   num_hidden=n_classes)
        sm.append(mx.sym.softmax_cross_entropy(fc, labels[t], name="sm"))

    # expose the final states so they can be fed back in for the next batch
    for i in range(num_lstm_layer):
        state = last_states[i]
        state = LSTMState(c=mx.sym.BlockGrad(state.c, name="l%d_last_c" % i),
                          h=mx.sym.BlockGrad(state.h, name="l%d_last_h" % i))
        last_states[i] = state

    unpack_c = [state.c for state in last_states]
    unpack_h = [state.h for state in last_states]
    list_all = sm + unpack_c + unpack_h
    return mx.sym.Group(list_all)
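
# Sketch of the device mapping such a grouped symbol is typically bound with;
# the contexts and layer count are illustrative (assumes `import mxnet as mx`).
# The resulting dict would be passed as group2ctx to bind/simple_bind, as in
# test_ctx_group above.
import mxnet as mx

num_lstm_layer = 2
group2ctx = {'decode': mx.cpu(0)}
group2ctx.update({'layer%d' % i: mx.cpu(i) for i in range(num_lstm_layer)})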