seqdata.py 文件源码

python
阅读 16 收藏 0 点赞 0 评论 0

项目:sk-torch 作者: mattHawthorn 项目源码 文件源码
def from_id2token(cls, sequences: Map[int, Seq[H]], id2token: Dict[int, H],
                      max_len: int, pack_sequences: bool=False,
                      append_eos: bool=True, eos_token: Opt[H]=DEFAULT_EOS,
                      null_token: H=DEFAULT_NULL, oov_token: H=DEFAULT_OOV,
                      int_id_type: str='long', shuffle: bool=True):
        """
        Alternate constructor: build the vocabulary from an id -> token mapping, wrap it in a
        SequenceTensorEncoder, and construct an instance of this class around `sequences`.

        :param sequences: the token sequences to draw from; forwarded unchanged to the class constructor.
        :param id2token: mapping of int ids to tokens
        :param max_len: maximum length of sequences to sample
        :param pack_sequences: bool indicating whether to return regular Tensors or PackedSequence instances.
        :param append_eos: bool forwarded to the encoder; presumably controls whether `eos_token` is
            appended to each encoded sequence (confirm against SequenceTensorEncoder).
        :param eos_token: hashable to append to mark end-of-sequence in encoding
        :param null_token: hashable to use for padding sequences. Added to the vocab, unless none is passed
            and none is built, in which case this is considered to be an int id.
        :param oov_token: hashable to insert for out-of-vocab tokens when encoding
        :param int_id_type: string indicating the type of int ids to use. Must be a key of data.str_to_int_tensor_type.
            Numpy aliases for integer types are valid, as well as 'long', 'short', 'byte', 'char'.
            The default 'long' is recommended, as only LongTensors can be used to index Embeddings in pytorch.
        :param shuffle: bool forwarded to the class constructor; presumably controls whether sequences are
            shuffled when iterating (confirm against the constructor).
        :return: a new instance of `cls` using the encoder built here.
        """
        # Only the OOV token is handed to the vocabulary; EOS and padding tokens are the encoder's concern.
        vocab = Vocabulary.from_id2token(id2token, oov_token=oov_token)
        encoder = SequenceTensorEncoder(vocab, append_eos=append_eos, eos_token=eos_token, null_token=null_token,
                                        int_id_type=int_id_type)
        # null_token is passed to the constructor as well as to the encoder.
        return cls(sequences=sequences, encoder=encoder, max_len=max_len, pack_sequences=pack_sequences,
                   null_token=null_token, shuffle=shuffle)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号