def torch_zeros_like(x):
    """
    Polyfill for `torch.zeros_like()`.
    """
    # Work around https://github.com/pytorch/pytorch/issues/2906
    if isinstance(x, Variable):
        return Variable(torch_zeros_like(x.data))
    # Support PyTorch before https://github.com/pytorch/pytorch/pull/2489
    try:
        return torch.zeros_like(x)
    except AttributeError:
        return torch.zeros(x.size()).type_as(x)
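A minimal usage sketch of the polyfill above, assuming the surrounding module already does `import torch` and `from torch.autograd import Variable`:

x = torch.randn(3, 4)
z = torch_zeros_like(x)            # old PyTorch hits the except branch, newer versions use zeros_like directly
assert z.size() == x.size()

v = Variable(torch.randn(2, 2))
zv = torch_zeros_like(v)           # result is wrapped back into a Variable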
Python zeros_like() example source code
def test_zeros_like(self):
    expected = torch.zeros(100, 100)

    res1 = torch.zeros_like(expected)
    self.assertEqual(res1, expected)

    res2 = torch.Tensor()
    torch.zeros_like(expected, out=res2)
    self.assertEqual(res2, expected)
def test_zeros_like_cuda(self):
    expected = torch.zeros(100, 100).cuda()

    res1 = torch.zeros_like(expected)
    self.assertEqual(res1, expected)

    res2 = torch.Tensor().cuda()
    torch.zeros_like(expected, out=res2)
    self.assertEqual(res2, expected)
def test_zeros_like_multiple_device(self):
    expected = torch.zeros(100, 100).cuda()
    x = torch.cuda.FloatTensor(100, 100, device=1)
    output = torch.zeros_like(x)
    self.assertEqual(output, expected)
def schedule_sampling(self, prev, dec_out):
    """
    Resample a subset of the inputs to the next iteration from the model
    itself. Each example in the batch is kept (i.e. not resampled)
    independently with probability equal to the model's `self.exposure_rate`,
    drawn from a Bernoulli.

    Parameters:
    -----------
    - prev: torch.LongTensor(batch_size)
    - dec_out: torch.Tensor(batch_size x hid_dim)

    Returns: partially resampled input
    --------
    - prev: torch.LongTensor(batch_size)
    """
    prev, dec_out = prev.data, dec_out.data  # don't register computation
    keep_mask = torch.bernoulli(
        torch.zeros_like(prev).float() + self.exposure_rate) == 1
    # return if no sampling is necessary
    if len(keep_mask.nonzero()) == len(prev):
        return prev
    sampled = self.decoder.project(
        Variable(dec_out, volatile=True)).max(1)[1].data
    if keep_mask.nonzero().dim() == 0:  # return all sampled
        return sampled
    keep_mask = keep_mask.nonzero().squeeze(1)
    sampled[keep_mask] = prev[keep_mask]
    return sampled
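A standalone sketch of the keep-mask mechanism above, with hypothetical tensors and an exposure rate of 0.75 (no decoder involved):

prev = torch.LongTensor([1, 2, 3, 4])        # ground-truth tokens
sampled = torch.LongTensor([9, 9, 9, 9])     # stand-in for model predictions
keep = torch.bernoulli(torch.zeros_like(prev).float() + 0.75)
idx = keep.nonzero()
if idx.numel() > 0:
    idx = idx.squeeze(1)
    sampled[idx] = prev[idx]                 # roughly 3 in 4 positions keep the ground truth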
def word_dropout_mask(X, dropout_rate, reserved_codes=()):
    """
    Computes a binary mask across batch examples based on a
    Bernoulli distribution with mean equal to `dropout_rate`.
    """
    probs = torch.zeros_like(X).float() + dropout_rate
    # zero out reserved_codes (avoid dropping reserved symbols)
    if len(reserved_codes) > 0:
        probs[sum((X == x) for x in reserved_codes)] = 0
    # return binary mask
    return torch.bernoulli(probs).byte()
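A usage sketch under the same pre-0.4 PyTorch byte-mask indexing semantics the snippet targets; the token ids (0 = <pad>, 1 = <bos>, 3 = <unk>) are hypothetical:

X = torch.LongTensor([[1, 5, 8, 0],
                      [1, 7, 0, 0]])
mask = word_dropout_mask(X, dropout_rate=0.2, reserved_codes=(0, 1))
X_dropped = X.clone()
X_dropped[mask] = 3                # dropped tokens replaced by <unk>, reserved ids untouched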
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad.data
            if grad.is_sparse:
                raise RuntimeError('Adadelta does not support sparse gradients')
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                state['square_avg'] = torch.zeros_like(p.data)
                state['acc_delta'] = torch.zeros_like(p.data)

            square_avg, acc_delta = state['square_avg'], state['acc_delta']
            rho, eps = group['rho'], group['eps']

            state['step'] += 1

            if group['weight_decay'] != 0:
                grad = grad.add(group['weight_decay'], p.data)

            square_avg.mul_(rho).addcmul_(1 - rho, grad, grad)
            std = square_avg.add(eps).sqrt_()
            delta = acc_delta.add(eps).sqrt_().div_(std).mul_(grad)
            p.data.add_(-group['lr'], delta)
            acc_delta.mul_(rho).addcmul_(1 - rho, delta, delta)

    return loss
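For reference, a sketch of the same Adadelta update written with out-of-place ops on a plain tensor (rho, eps, lr are the usual hyperparameters; values are illustrative):

rho, eps, lr = 0.9, 1e-6, 1.0
w, grad = torch.randn(5), torch.randn(5)
square_avg = torch.zeros_like(w)
acc_delta = torch.zeros_like(w)

square_avg = rho * square_avg + (1 - rho) * grad * grad              # running E[g^2]
delta = (acc_delta + eps).sqrt() / (square_avg + eps).sqrt() * grad  # RMS[dx] / RMS[g] * g
w = w - lr * delta
acc_delta = rho * acc_delta + (1 - rho) * delta * delta              # running E[dx^2]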
def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0):
    defaults = dict(lr=lr, lr_decay=lr_decay, weight_decay=weight_decay)
    super(Adagrad, self).__init__(params, defaults)

    for group in self.param_groups:
        for p in group['params']:
            state = self.state[p]
            state['step'] = 0
            state['sum'] = torch.zeros_like(p.data)
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']
        dampening = group['dampening']
        nesterov = group['nesterov']

        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad.data
            if weight_decay != 0:
                d_p.add_(weight_decay, p.data)
            if momentum != 0:
                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                    buf.mul_(momentum).add_(d_p)
                else:
                    buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_(1 - dampening, d_p)
                if nesterov:
                    d_p = d_p.add(momentum, buf)
                else:
                    d_p = buf

            p.data.add_(-group['lr'], d_p)

    return loss
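A sketch of the momentum branch above on a plain tensor (note that the code skips dampening on the very first step, when the buffer is created):

momentum, dampening, lr = 0.9, 0.0, 0.1
w, d_p = torch.randn(3), torch.randn(3)
buf = torch.zeros_like(w)

buf = momentum * buf + (1 - dampening) * d_p   # heavy-ball buffer
step_dir = d_p + momentum * buf                # nesterov=True; plain momentum applies buf itself
w = w - lr * step_dir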
def _get_parameters(self, module):
    params = []
    d_params = []
    for p in module.parameters():
        if p.grad is None:
            p._grad = torch.zeros_like(p)
        params.append(p.data)
        d_params.append(p.grad.data)
    return params, d_params
def _analytical_jacobian(self, module, input, jacobian_input=True, jacobian_parameters=True):
    output = self._forward(module, input)
    output_size = output.nelement()
    output_t = output.data if isinstance(output, Variable) else output
    if jacobian_input:
        jacobian_inp = self._jacobian(input, output_size)
        flat_jacobian_input = list(iter_tensors(jacobian_inp))
    if jacobian_parameters:
        num_param = sum(p.numel() for p in self._get_parameters(module)[0])
        jacobian_param = torch.zeros(num_param, output_size)

    for i in range(output_size):
        _, d_param = self._get_parameters(module)
        d_out = torch.zeros_like(output_t)
        flat_d_out = d_out.view(-1)
        flat_d_out[i] = 1

        if jacobian_parameters:
            self._zero_grad_parameters(module)
        # Variables will accumulate gradient from multiple steps
        if jacobian_input:
            self._zero_grad_input(input)
        d_input = self._backward(module, input, output, d_out)

        if jacobian_input:
            for jacobian_x, d_x in zip(flat_jacobian_input, iter_tensors(d_input)):
                jacobian_x[:, i] = d_x
        if jacobian_parameters:
            jacobian_param[:, i] = torch.cat(self._flatten_tensors(d_param), 0)

    res = tuple()
    if jacobian_input:
        res += jacobian_inp,
    if jacobian_parameters:
        res += jacobian_param,
    return res
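A standalone sketch of the same column-by-column idea using the modern autograd API (no Variable wrapper); the tiny linear layer is a hypothetical stand-in for `module`:

lin = torch.nn.Linear(3, 2)
x = torch.randn(1, 3, requires_grad=True)
y = lin(x)
jac = torch.zeros(x.numel(), y.numel())
for i in range(y.numel()):
    d_out = torch.zeros_like(y).view(-1)
    d_out[i] = 1                                # one-hot gradient on output element i
    g, = torch.autograd.grad(y, x, d_out.view_as(y), retain_graph=True)
    jac[:, i] = g.view(-1)                      # column i of d(output)/d(input)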
def _test_zeros_like(self, template_shape_i, template_shape_v=None):
    template_shape_v = template_shape_v or []
    template_shape = template_shape_i + template_shape_v
    for nnz in [9, 12]:
        t, _, _ = self._gen_sparse(len(template_shape_i), nnz, template_shape)
        res = torch.zeros_like(t)
        self.assertEqual(tuple(res.size()), tuple(template_shape))
        self.assertTrue(res._indices().numel() == res._values().numel() == 0)
        self.assertEqual(res._nnz(), 0)
        self.assertEqual(res._dimI(), len(template_shape_i))
        self.assertEqual(res._dimV(), len(template_shape_v))
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad.data
            if grad.is_sparse:
                raise RuntimeError('RMSprop does not support sparse gradients')
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                state['square_avg'] = torch.zeros_like(p.data)
                if group['momentum'] > 0:
                    state['momentum_buffer'] = torch.zeros_like(p.data)
                if group['centered']:
                    state['grad_avg'] = torch.zeros_like(p.data)

            square_avg = state['square_avg']
            alpha = group['alpha']

            state['step'] += 1

            if group['weight_decay'] != 0:
                grad = grad.add(group['weight_decay'], p.data)

            square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad)

            if group['centered']:
                grad_avg = state['grad_avg']
                grad_avg.mul_(alpha).add_(1 - alpha, grad)
                avg = square_avg.addcmul(-1, grad_avg, grad_avg).sqrt().add_(group['eps'])
            else:
                avg = square_avg.sqrt().add_(group['eps'])

            if group['momentum'] > 0:
                buf = state['momentum_buffer']
                buf.mul_(group['momentum']).addcdiv_(grad, avg)
                p.data.add_(-group['lr'], buf)
            else:
                p.data.addcdiv_(-group['lr'], grad, avg)

    return loss
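A sketch of the centered branch above with out-of-place ops: centered RMSprop divides by an estimate of sqrt(Var[g]) = sqrt(E[g^2] - E[g]^2) rather than sqrt(E[g^2]) (values are illustrative):

alpha, eps, lr = 0.99, 1e-8, 1e-2
w, grad = torch.randn(4), torch.randn(4)
square_avg = torch.zeros_like(w)
grad_avg = torch.zeros_like(w)

square_avg = alpha * square_avg + (1 - alpha) * grad * grad
grad_avg = alpha * grad_avg + (1 - alpha) * grad
avg = (square_avg - grad_avg * grad_avg).sqrt() + eps
w = w - lr * grad / avg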
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad.data
            if grad.is_sparse:
                raise RuntimeError('Adamax does not support sparse gradients')
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_inf'] = torch.zeros_like(p.data)

            exp_avg, exp_inf = state['exp_avg'], state['exp_inf']
            beta1, beta2 = group['betas']
            eps = group['eps']

            state['step'] += 1

            if group['weight_decay'] != 0:
                grad = grad.add(group['weight_decay'], p.data)

            # Update biased first moment estimate.
            exp_avg.mul_(beta1).add_(1 - beta1, grad)
            # Update the exponentially weighted infinity norm.
            norm_buf = torch.cat([
                exp_inf.mul_(beta2).unsqueeze(0),
                grad.abs().add_(eps).unsqueeze_(0)
            ], 0)
            torch.max(norm_buf, 0, keepdim=False, out=(exp_inf, exp_inf.new().long()))

            bias_correction = 1 - beta1 ** state['step']
            clr = group['lr'] / bias_correction

            p.data.addcdiv_(-clr, exp_avg, exp_inf)

    return loss
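The torch.cat/torch.max pair above implements the Adamax infinity-norm recursion u_t = max(beta2 * u_{t-1}, |g_t| + eps); a sketch of one step with out-of-place ops (values are illustrative):

beta1, beta2, eps, lr, step = 0.9, 0.999, 1e-8, 2e-3, 1
w, grad = torch.randn(4), torch.randn(4)
exp_avg = torch.zeros_like(w)
exp_inf = torch.zeros_like(w)

exp_avg = beta1 * exp_avg + (1 - beta1) * grad
exp_inf = torch.max(beta2 * exp_inf, grad.abs() + eps)   # element-wise max
clr = lr / (1 - beta1 ** step)                           # bias-corrected learning rate
w = w - clr * exp_avg / exp_inf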
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad.data
            if grad.is_sparse:
                raise RuntimeError('Rprop does not support sparse gradients')
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                state['prev'] = torch.zeros_like(p.data)
                state['step_size'] = grad.new().resize_as_(grad).fill_(group['lr'])

            etaminus, etaplus = group['etas']
            step_size_min, step_size_max = group['step_sizes']
            step_size = state['step_size']

            state['step'] += 1

            sign = grad.mul(state['prev']).sign()
            sign[sign.gt(0)] = etaplus
            sign[sign.lt(0)] = etaminus
            sign[sign.eq(0)] = 1

            # update stepsizes with step size updates
            step_size.mul_(sign).clamp_(step_size_min, step_size_max)

            # for dir<0, dfdx=0
            # for dir>=0 dfdx=dfdx
            grad = grad.clone()
            grad[sign.eq(etaminus)] = 0

            # update parameters
            p.data.addcmul_(-1, grad.sign(), step_size)

            state['prev'].copy_(grad)

    return loss
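A scalar sketch of the Rprop idea behind the masking above: only the sign of the gradient is used; the per-weight step size grows by etaplus while the sign is stable and shrinks by etaminus when it flips (the gradient sequence is hypothetical):

etaminus, etaplus = 0.5, 1.2
w, step_size, prev = 0.3, 0.01, 0.0
for g in [0.8, 0.5, -0.2]:
    if g * prev > 0:
        step_size = min(step_size * etaplus, 50.0)
    elif g * prev < 0:
        step_size = max(step_size * etaminus, 1e-6)
        g = 0.0                                   # skip the update after a sign flip
    w -= step_size * (1 if g > 0 else -1 if g < 0 else 0)
    prev = g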
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad.data
            if grad.is_sparse:
                raise RuntimeError('ASGD does not support sparse gradients')
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                state['eta'] = group['lr']
                state['mu'] = 1
                state['ax'] = torch.zeros_like(p.data)

            state['step'] += 1

            if group['weight_decay'] != 0:
                grad = grad.add(group['weight_decay'], p.data)

            # decay term
            p.data.mul_(1 - group['lambd'] * state['eta'])

            # update parameter
            p.data.add_(-state['eta'], grad)

            # averaging
            if state['mu'] != 1:
                state['ax'].add_(p.data.sub(state['ax']).mul(state['mu']))
            else:
                state['ax'].copy_(p.data)

            # update eta and mu
            state['eta'] = (group['lr'] /
                            math.pow((1 + group['lambd'] * group['lr'] * state['step']), group['alpha']))
            state['mu'] = 1 / max(1, state['step'] - group['t0'])

    return loss
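A sketch of the averaging schedule above on a plain tensor: after t0 steps, `ax` becomes a running (Polyak) average of the iterates via the incremental mean ax += (w - ax) * mu, with mu = 1 / max(1, step - t0) (values are illustrative):

lambd, alpha, lr, t0 = 1e-4, 0.75, 1e-2, 2
w = torch.randn(4)
ax = torch.zeros_like(w)
eta, mu = lr, 1.0
for step in range(1, 6):
    grad = torch.randn(4)                        # hypothetical gradients
    w = w * (1 - lambd * eta) - eta * grad       # decay term + SGD step
    if mu != 1:
        ax = ax + (w - ax) * mu                  # incremental average of the iterates
    else:
        ax = w.clone()
    eta = lr / (1 + lambd * lr * step) ** alpha
    mu = 1.0 / max(1, step - t0)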