# SubLayers.py: constructor of the MultiHeadAttention module.
import torch
import torch.nn as nn
import torch.nn.init as init

class MultiHeadAttention(nn.Module):
    """Multi-head attention with n_head parallel heads over d_model-dimensional inputs."""

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        # One projection matrix per head: d_model -> d_k for queries and keys,
        # d_model -> d_v for values.
        self.w_qs = nn.Parameter(torch.empty(n_head, d_model, d_k))
        self.w_ks = nn.Parameter(torch.empty(n_head, d_model, d_k))
        self.w_vs = nn.Parameter(torch.empty(n_head, d_model, d_v))
        # ScaledDotProductAttention, LayerNormalization and Linear are custom
        # modules defined elsewhere in this repository.
        self.attention = ScaledDotProductAttention(d_model)
        self.layer_norm = LayerNormalization(d_model)
        self.proj = Linear(n_head * d_v, d_model)  # concatenated heads -> d_model
        self.dropout = nn.Dropout(dropout)
        # xavier_normal_ is the current in-place initializer; the original
        # snippet used the long-deprecated init.xavier_normal.
        init.xavier_normal_(self.w_qs)
        init.xavier_normal_(self.w_ks)
        init.xavier_normal_(self.w_vs)
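The constructor delegates the actual attention computation to ScaledDotProductAttention, whose source is not shown on this page. For context, the following is a minimal self-contained sketch of such a module, assuming the standard softmax(QK^T / temperature) formulation from "Attention Is All You Need"; the repository's real implementation may differ in details such as masking. Since the constructor above passes d_model to it, the sketch uses sqrt(d_model) as the scaling temperature.

# A minimal sketch (assumption: standard scaled dot-product attention;
# not necessarily the repository's exact code).
import torch
import torch.nn as nn
import torch.nn.functional as F

class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_model, attn_dropout=0.1):
        super(ScaledDotProductAttention, self).__init__()
        self.temperature = d_model ** 0.5  # scale factor, matching the caller above
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):
        # q: (batch, len_q, d), k: (batch, len_k, d), v: (batch, len_k, d_v)
        attn = torch.bmm(q, k.transpose(1, 2)) / self.temperature
        if mask is not None:
            # mask is a bool tensor; True marks positions to hide.
            attn = attn.masked_fill(mask, float('-inf'))
        attn = self.dropout(F.softmax(attn, dim=-1))
        output = torch.bmm(attn, v)  # (batch, len_q, d_v)
        return output, attn

With the base configuration from the paper (n_head=8, d_model=512, d_k=d_v=64), each head projects the 512-dimensional input down to 64 dimensions, and self.proj maps the 8*64=512 concatenated head outputs back to d_model.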