@classmethod
def from_vocab(cls, sequences: Map[int, Seq[H]], vocab: Vocabulary, max_len: int,
               pack_sequences: bool = False, append_eos: bool = True,
               eos_token: Opt[H] = DEFAULT_EOS, null_token: H = DEFAULT_NULL,
               int_id_type: str = 'long', shuffle: bool = True):
"""
:param vocab: instance of Vocabulary to use for encoding/decoding tokens
:param max_len: maximum length of sequences to sample
:param pack_sequences: bool indicating whether to return regular Tensors or PackedSequence instances.
:param int_id_type: string indicating the type of int ids to use. Must be a key of data.str_to_int_tensor_type.
:param eos_token: string or hashable to append to mark end-of-sequence in encoding
:param null_token: Optional hashable to use for padding sequences. Added to the vocab, unless none is passed
and none is built, in which case this is considered to be an int id.
Numpy aliases for integer types are valid, as well as 'long', 'short', 'byte', 'char'.
The default 'long' is recommended, as only LongTensors can be used to index Embeddings in pytorch.
"""
    encoder = SequenceTensorEncoder(vocab, append_eos=append_eos, eos_token=eos_token,
                                    null_token=null_token, int_id_type=int_id_type)
    return cls(sequences=sequences, encoder=encoder, max_len=max_len,
               pack_sequences=pack_sequences, null_token=null_token, shuffle=shuffle)
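

# --- Usage sketch (not part of the original source) ---
# A minimal example of calling from_vocab. The enclosing class name
# (SequenceDataset) and Vocabulary's constructor taking an iterable of tokens
# are assumptions for illustration; the real library's API may differ.
sequences = {
    0: ['the', 'cat', 'sat'],              # token sequences keyed by int id
    1: ['a', 'dog', 'barked', 'loudly'],
}
vocab = Vocabulary(tok for seq in sequences.values() for tok in seq)  # assumed constructor
dataset = SequenceDataset.from_vocab(
    sequences, vocab, max_len=16,
    pack_sequences=True,   # return PackedSequence batches instead of padded Tensors
    int_id_type='long',    # LongTensor ids, needed to index an Embedding in pytorch
)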