def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    super(MultiHeadAttention, self).__init__()
    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v
    # One (d_model x d_k) / (d_model x d_v) projection matrix per attention head.
    self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
    self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
    self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v))
    self.attention = ScaledDotProductAttention(d_model)
    self.layer_norm = LayerNormalization(d_model)
    self.proj = Linear(n_head * d_v, d_model)
    self.dropout = nn.Dropout(dropout)
    init.xavier_normal(self.w_qs)
    init.xavier_normal(self.w_ks)
    init.xavier_normal(self.w_vs)
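The constructor above allocates one projection tensor per head and fills each with Xavier-normal values through the pre-0.4 `init.xavier_normal` call. A minimal sketch of the same initialisation against current PyTorch, where the in-place initialisers carry a trailing underscore; the sizes and standalone parameters below are illustrative, not taken from the project:

import torch
import torch.nn as nn

n_head, d_model, d_k, d_v = 8, 512, 64, 64

# One weight tensor per head, mirroring w_qs / w_ks / w_vs above.
w_qs = nn.Parameter(torch.empty(n_head, d_model, d_k))
w_ks = nn.Parameter(torch.empty(n_head, d_model, d_k))
w_vs = nn.Parameter(torch.empty(n_head, d_model, d_v))

# xavier_normal (no underscore) is the deprecated spelling; the in-place
# variant is xavier_normal_ in PyTorch 0.4 and later.
for w in (w_qs, w_ks, w_vs):
    nn.init.xavier_normal_(w)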
Python xavier_normal() examples (source code)
The MultiHeadAttention constructor shown at the top of this page comes from SubLayers.py (project: attention-is-all-you-need-pytorch, author: jadore801120).
def test_xavier_normal(self):
    for as_variable in [True, False]:
        for use_gain in [True, False]:
            for dims in [2, 4]:
                input_tensor = self._create_random_nd_tensor(dims, size_min=20, size_max=25,
                                                             as_variable=as_variable)
                gain = 1

                if use_gain:
                    gain = self._random_float(0.1, 2)
                    init.xavier_normal(input_tensor, gain=gain)
                else:
                    init.xavier_normal(input_tensor)

                if as_variable:
                    input_tensor = input_tensor.data

                fan_in = input_tensor.size(1)
                fan_out = input_tensor.size(0)
                if input_tensor.dim() > 2:
                    fan_in *= input_tensor[0, 0].numel()
                    fan_out *= input_tensor[0, 0].numel()

                expected_std = gain * math.sqrt(2.0 / (fan_in + fan_out))
                assert self._is_normal(input_tensor, 0, expected_std)
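The assertion above encodes the Xavier-normal standard deviation, std = gain * sqrt(2 / (fan_in + fan_out)); for tensors with more than two dimensions each fan is scaled by the receptive-field size, exactly as in the fan_in/fan_out bookkeeping above. A small worked example with arbitrarily chosen sizes:

import math
import torch
from torch.nn import init

t = torch.empty(20, 24, 3, 3)                        # (out_channels, in_channels, kH, kW)
receptive = t[0, 0].numel()                          # 3 * 3 = 9
fan_in = t.size(1) * receptive                       # 24 * 9 = 216
fan_out = t.size(0) * receptive                      # 20 * 9 = 180
expected_std = math.sqrt(2.0 / (fan_in + fan_out))   # ~0.0711 with gain = 1

init.xavier_normal_(t)
print(t.std().item(), expected_std)                  # the two values should be close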
def reset_parameters(self) -> None:
    # Because we are doing so many torch.bmm calls, which is fast but unstable,
    # it is critically important to initialise the parameters correctly such
    # that these matrix multiplications are well conditioned initially.
    # Without this initialisation, this (non-deterministically) produces
    # NaNs and overflows.
    init.xavier_normal(self._query_projections)
    init.xavier_normal(self._key_projections)
    init.xavier_normal(self._value_projections)
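The comment explains the motivation: these projections are consumed by batched matrix multiplies, and a badly scaled start lets the products drift towards overflow. A minimal sketch of that pattern, assuming 3-D projection tensors of shape (num_heads, d_model, d_k); the names and shapes here are illustrative rather than taken from the original class:

import torch
from torch import nn

num_heads, d_model, d_k = 8, 512, 64
batch, seq = 4, 16

query_projections = nn.Parameter(torch.empty(num_heads, d_model, d_k))
nn.init.xavier_normal_(query_projections)               # well-conditioned starting point

# Project the same input once per head with a single batched matmul.
x = torch.randn(batch * seq, d_model)
x_per_head = x.unsqueeze(0).repeat(num_heads, 1, 1)     # (H, B*S, d_model)
queries = torch.bmm(x_per_head, query_projections)      # (H, B*S, d_k)
print(queries.shape)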
def _init_weight(self):
    init.xavier_normal(self.w_qs)
    init.xavier_normal(self.w_ks)
    init.xavier_normal(self.w_vs)
    init.xavier_normal(self.w_o.weight)
def _init_weight(self):
    if self.share_linear:
        # Weight tying: reuse the decoder embedding matrix as the output projection.
        self.linear.weight = self.dec.dec_ebd.weight
    else:
        init.xavier_normal(self.linear.weight)
def _init_weight(self):
    init.xavier_normal(self._enc_mu.weight)
    init.xavier_normal(self._enc_log_sigma.weight)
def _init_weight(self):
    stdv = 1. / math.sqrt(self.hsz)
    self.gate.weight.data.uniform_(-stdv, stdv)
    self.gate.bias.data.fill_(-1)
    if active.__name__ == "relu":
        init.xavier_normal(self.h.weight)
    else:
        self.h.weight.data.uniform_(-stdv, stdv)
def reset_parameters(self):
    I.normal(self.embeddings.weight.data, mean=0, std=0.01)
    I.xavier_normal(self.W_i.weight.data)
    I.xavier_normal(self.W_o.weight.data)
    init_rnn_cell(self.encoder)
    for i in range(self.n_decoders):
        decoder = getattr(self, "decoder{}".format(i))
        init_rnn_cell(decoder)
Modules.py (project: attention-is-all-you-need-pytorch, author: jadore801120)
def __init__(self, d_in, d_out, bias=True):
    super(Linear, self).__init__()
    self.linear = nn.Linear(d_in, d_out, bias=bias)
    init.xavier_normal(self.linear.weight)
def _init_weights(self):
    for m in self.modules():
        if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
            nnInit.xavier_normal(m.weight)
            if m.bias is not None:
                m.bias.data.zero_()
def _init_weights(self):
    for m in self.modules():
        if isinstance(m, nn.Linear):
            nnInit.xavier_normal(m.weight)
            if m.bias is not None:
                m.bias.data.zero_()
babi_main.py (project: Dynamic-memory-networks-plus-Pytorch, author: dandelin)
def __init__(self, input_size, hidden_size):
    super(AttentionGRUCell, self).__init__()
    self.hidden_size = hidden_size
    self.Wr = nn.Linear(input_size, hidden_size)
    init.xavier_normal(self.Wr.state_dict()['weight'])
    self.Ur = nn.Linear(hidden_size, hidden_size)
    init.xavier_normal(self.Ur.state_dict()['weight'])
    self.W = nn.Linear(input_size, hidden_size)
    init.xavier_normal(self.W.state_dict()['weight'])
    self.U = nn.Linear(hidden_size, hidden_size)
    init.xavier_normal(self.U.state_dict()['weight'])
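The four linear layers are the reset-gate and candidate-state terms of the DMN+ attention GRU, in which the usual update gate is replaced by an externally supplied attention score g. A plausible sketch of the matching update step; the helper name and exact forward signature are assumptions, not taken from babi_main.py:

import torch

def attention_gru_step(cell, fact, h_prev, g):
    """One AttentionGRUCell update: fact (B, input_size), h_prev (B, hidden), g (B, 1)."""
    r = torch.sigmoid(cell.Wr(fact) + cell.Ur(h_prev))       # reset gate
    h_tilde = torch.tanh(cell.W(fact) + r * cell.U(h_prev))  # candidate state
    return g * h_tilde + (1 - g) * h_prev                    # attention score gates the blend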
babi_main.py (project: Dynamic-memory-networks-plus-Pytorch, author: dandelin)
def __init__(self, hidden_size):
    super(EpisodicMemory, self).__init__()
    self.AGRU = AttentionGRU(hidden_size, hidden_size)
    self.z1 = nn.Linear(4 * hidden_size, hidden_size)
    self.z2 = nn.Linear(hidden_size, 1)
    self.next_mem = nn.Linear(3 * hidden_size, hidden_size)
    init.xavier_normal(self.z1.state_dict()['weight'])
    init.xavier_normal(self.z2.state_dict()['weight'])
    init.xavier_normal(self.next_mem.state_dict()['weight'])
babi_main.py (project: Dynamic-memory-networks-plus-Pytorch, author: dandelin)
def __init__(self, vocab_size, hidden_size):
    super(InputModule, self).__init__()
    self.hidden_size = hidden_size
    self.gru = nn.GRU(hidden_size, hidden_size, bidirectional=True, batch_first=True)
    for name, param in self.gru.state_dict().items():
        if 'weight' in name:
            init.xavier_normal(param)
    self.dropout = nn.Dropout(0.1)
babi_main.py (project: Dynamic-memory-networks-plus-Pytorch, author: dandelin)
def __init__(self, vocab_size, hidden_size):
    super(AnswerModule, self).__init__()
    self.z = nn.Linear(2 * hidden_size, vocab_size)
    init.xavier_normal(self.z.state_dict()['weight'])
    self.dropout = nn.Dropout(0.1)
model_BiLSTM_1.py (project: cnn-lstm-bilstm-deepcnn-clstm-in-pytorch, author: bamtercelboo)
def __init__(self, args):
    super(BiLSTM_1, self).__init__()
    self.args = args
    self.hidden_dim = args.lstm_hidden_dim
    self.num_layers = args.lstm_num_layers
    V = args.embed_num
    D = args.embed_dim
    C = args.class_num
    self.dropout = nn.Dropout(args.dropout)
    self.dropout_embed = nn.Dropout(args.dropout_embed)
    if args.max_norm is not None:
        print("max_norm = {} ".format(args.max_norm))
        self.embed = nn.Embedding(V, D, max_norm=args.max_norm, scale_grad_by_freq=True)
    else:
        print("max_norm = {} |||||".format(args.max_norm))
        self.embed = nn.Embedding(V, D, scale_grad_by_freq=True)
    if args.word_Embedding:
        pretrained_weight = np.array(args.pretrained_weight)
        self.embed.weight.data.copy_(torch.from_numpy(pretrained_weight))
    self.bilstm = nn.LSTM(D, self.hidden_dim, num_layers=self.num_layers, bias=True, bidirectional=True,
                          dropout=self.args.dropout)
    print(self.bilstm)
    if args.init_weight:
        print("Initing W .......")
        init.xavier_normal(self.bilstm.all_weights[0][0], gain=np.sqrt(args.init_weight_value))
        init.xavier_normal(self.bilstm.all_weights[0][1], gain=np.sqrt(args.init_weight_value))
        init.xavier_normal(self.bilstm.all_weights[1][0], gain=np.sqrt(args.init_weight_value))
        init.xavier_normal(self.bilstm.all_weights[1][1], gain=np.sqrt(args.init_weight_value))
    self.hidden2label = nn.Linear(self.hidden_dim * 2, C)
    self.hidden = self.init_hidden(self.num_layers, args.batch_size)
    print("self.hidden", self.hidden)
model_LSTM.py (project: cnn-lstm-bilstm-deepcnn-clstm-in-pytorch, author: bamtercelboo)
def __init__(self, args):
    super(LSTM, self).__init__()
    self.args = args
    # print(args)
    self.hidden_dim = args.lstm_hidden_dim
    self.num_layers = args.lstm_num_layers
    V = args.embed_num
    D = args.embed_dim
    C = args.class_num
    if args.max_norm is not None:
        print("max_norm = {} ".format(args.max_norm))
        self.embed = nn.Embedding(V, D, max_norm=args.max_norm, scale_grad_by_freq=True)
    else:
        print("max_norm = {} |||||".format(args.max_norm))
        self.embed = nn.Embedding(V, D, scale_grad_by_freq=True)
    # word embedding
    if args.word_Embedding:
        pretrained_weight = np.array(args.pretrained_weight)
        self.embed.weight.data.copy_(torch.from_numpy(pretrained_weight))
    # lstm
    self.lstm = nn.LSTM(D, self.hidden_dim, dropout=args.dropout, num_layers=self.num_layers)
    if args.init_weight:
        print("Initing W .......")
        # n = self.lstm.input_size * self.lstm
        init.xavier_normal(self.lstm.all_weights[0][0], gain=np.sqrt(args.init_weight_value))
        init.xavier_normal(self.lstm.all_weights[0][1], gain=np.sqrt(args.init_weight_value))
    # linear
    self.hidden2label = nn.Linear(self.hidden_dim, C)
    # hidden
    self.hidden = self.init_hidden(self.num_layers, args.batch_size)
    # dropout
    self.dropout = nn.Dropout(args.dropout)
    self.dropout_embed = nn.Dropout(args.dropout_embed)
def weight_init(m):
    if isinstance(m, nn.Conv2d):
        init.xavier_normal(m.weight)
        init.constant(m.bias, 0)
def test_xavier_normal_errors_on_inputs_smaller_than_2d(self):
    for as_variable in [True, False]:
        for dims in [0, 1]:
            tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=1, as_variable=as_variable)
            with self.assertRaises(ValueError):
                init.xavier_normal(tensor)
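The constraint being tested can be reproduced directly: Xavier initialisation needs both a fan-in and a fan-out dimension, so 0-D and 1-D tensors are rejected with a ValueError.

import torch
from torch.nn import init

try:
    init.xavier_normal_(torch.empty(10))   # 1-D: fan_in / fan_out cannot be computed
except ValueError as exc:
    print(exc)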
def xavier_normal(w, gain=1):
    # Thin wrapper around torch.nn.init.xavier_normal.
    return nn.init.xavier_normal(w, gain=gain)
def xavier_init(net):
    '''Init layer parameters.'''
    for m in net.modules():
        if isinstance(m, nn.Conv2d):
            init.xavier_normal(m.weight)
            if m.bias is not None:
                init.constant(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            init.constant(m.weight, 1)
            init.constant(m.bias, 0)
        elif isinstance(m, nn.Linear):
            init.normal(m.weight, std=1e-3)
            if m.bias is not None:
                init.constant(m.bias, 0)
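A short usage sketch for the helper above; the model is an arbitrary stand-in chosen so that every branch (Conv2d, BatchNorm2d, Linear) is exercised:

import torch.nn as nn

net = nn.Sequential(
    nn.Conv2d(3, 16, kernel_size=3, padding=1),
    nn.BatchNorm2d(16),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(16 * 32 * 32, 10),
)
xavier_init(net)   # conv -> Xavier-normal, batchnorm -> (1, 0), linear -> N(0, 1e-3)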
def initWeights(net, scheme='orthogonal'):
    print('Initializing weights. Warning: may overwrite sensitive bias parameters (e.g. batchnorm)')
    for e in net.parameters():
        if scheme == 'orthogonal':
            if len(e.size()) >= 2:
                init.orthogonal(e)
        elif scheme == 'normal':
            init.normal(e, std=1e-2)
        elif scheme == 'xavier':
            init.xavier_normal(e)   # assumes e has at least 2 dimensions
def weights_init_xavier(m):
    classname = m.__class__.__name__
    # print(classname)
    if classname.find('Conv') != -1:
        init.xavier_normal(m.weight.data, gain=0.02)
    elif classname.find('Linear') != -1:
        init.xavier_normal(m.weight.data, gain=0.02)
    elif classname.find('BatchNorm2d') != -1:
        init.normal(m.weight.data, 1.0, 0.02)
        init.constant(m.bias.data, 0.0)
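Because the function dispatches on the class name of a single module, it is normally registered with `Module.apply`, which calls it on every submodule; the small model here is only an illustration:

import torch.nn as nn

model = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=4, stride=2, padding=1),
    nn.BatchNorm2d(64),
    nn.LeakyReLU(0.2),
)
model.apply(weights_init_xavier)   # visits Conv2d, BatchNorm2d and LeakyReLU in turn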