seqdata.py 文件源码-python代码片段

def from_id2token(cls, sequences: Map[int, Seq[H]], id2token: Dict[H, int],
                      max_len: int, pack_sequences: bool=False,
                      append_eos: bool=True, eos_token: Opt[H]=DEFAULT_EOS,
                      null_token: H=DEFAULT_NULL, oov_token: H=DEFAULT_OOV,
                      int_id_type: str='long', shuffle: bool=True):
        """
        :param id2token: mapping of int ids to tokens
        :param max_len: maximum length of sequences to sample
        :param pack_sequences: bool indicating whether to return regular Tensors or PackedSequence instances.
        :param int_id_type: string indicating the type of int ids to use. Must be a key of data.str_to_int_tensor_type.
        :param oov_token: hashable to insert for out-of-vocab tokens when encoding
        :param eos_token: hashable to append to mark end-of-sequence in encoding
        :param null_token: hashable to use for padding sequences. Added to the vocab, unless none is passed
            and none is built, in which case this is considered to be an int id.
            Numpy aliases for integer types are valid, as well as 'long', 'short', 'byte', 'char'.
            The default 'long' is recommended, as only LongTensors can be used to index Embeddings in pytorch.
        """
        vocab = Vocabulary.from_id2token(id2token, oov_token=oov_token)
        encoder = SequenceTensorEncoder(vocab, append_eos=append_eos, eos_token=eos_token, null_token=null_token,
                                        int_id_type=int_id_type)
        return cls(sequences=sequences, encoder=encoder, max_len=max_len, pack_sequences=pack_sequences,
                   null_token=null_token, shuffle=shuffle)