def reset_parameters(self) -> None:
    """Re-initialise the query, key and value projections in place.

    Applies Xavier (Glorot) normal initialisation to
    ``self._query_projections``, ``self._key_projections`` and
    ``self._value_projections``, in that order. Nothing is returned; the
    three attributes are overwritten in place.
    """
    # Because we are doing so many torch.bmm calls, which is fast but unstable,
    # it is critically important to initialise the parameters correctly such
    # that these matrix multiplications are well conditioned initially.
    # Without this initialisation, this (non-deterministically) produces
    # NaNs and overflows.
    #
    # NOTE(review): assumes ``init`` is ``torch.nn.init`` (the import is not
    # visible in this chunk). The trailing-underscore function is the
    # supported in-place initialiser; the un-suffixed ``xavier_normal`` alias
    # is deprecated and warns on every call.
    projections = (
        self._query_projections,
        self._key_projections,
        self._value_projections,
    )
    for projection in projections:
        init.xavier_normal_(projection)
评论列表
文章目录