@classmethod
def from_vocab(cls, sequences: Map[int, Seq[H]], vocab: Vocabulary, max_len: int,
               pack_sequences: bool = False, append_eos: bool = True,
               eos_token: Opt[H] = DEFAULT_EOS, null_token: H = DEFAULT_NULL,
               int_id_type: str = 'long', shuffle: bool = True):
"""
:param vocab: instance of Vocabulary to use for encoding/decoding tokens
:param max_len: maximum length of sequences to sample
:param pack_sequences: bool indicating whether to return regular Tensors or PackedSequence instances.
:param int_id_type: string indicating the type of int ids to use. Must be a key of data.str_to_int_tensor_type.
:param eos_token: string or hashable to append to mark end-of-sequence in encoding
:param null_token: Optional hashable to use for padding sequences. Added to the vocab, unless none is passed
and none is built, in which case this is considered to be an int id.
Numpy aliases for integer types are valid, as well as 'long', 'short', 'byte', 'char'.
The default 'long' is recommended, as only LongTensors can be used to index Embeddings in pytorch.
"""
    encoder = SequenceTensorEncoder(vocab, append_eos=append_eos, eos_token=eos_token,
                                    null_token=null_token, int_id_type=int_id_type)
    return cls(sequences=sequences, encoder=encoder, max_len=max_len,
               pack_sequences=pack_sequences, null_token=null_token, shuffle=shuffle)
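

# --- Usage sketch (not part of the original source) ---
# A minimal example of calling from_vocab. The enclosing class name
# (SequenceDataset) and Vocabulary's constructor taking an iterable of tokens
# are assumptions for illustration; the real library's API may differ.
sequences = {
    0: ['the', 'cat', 'sat'],              # token sequences keyed by int id
    1: ['a', 'dog', 'barked', 'loudly'],
}
vocab = Vocabulary(tok for seq in sequences.values() for tok in seq)  # assumed constructor
dataset = SequenceDataset.from_vocab(
    sequences, vocab, max_len=16,
    pack_sequences=True,   # return PackedSequence batches instead of padded Tensors
    int_id_type='long',    # LongTensor ids, needed to index an Embedding in pytorch
)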