def from_dataset(cls,
                 dataset,
                 min_count: int = 1,
                 max_vocab_size: Optional[Union[int, Dict[str, int]]] = None,
                 non_padded_namespaces: Sequence[str] = DEFAULT_NON_PADDED_NAMESPACES,
                 pretrained_files: Optional[Dict[str, str]] = None,
                 only_include_pretrained_words: bool = False) -> 'Vocabulary':
    """
    Constructs a vocabulary given a :class:`.Dataset` and some parameters.  We count all of the
    vocabulary items in the dataset, then pass those counts, and the other parameters, to
    :func:`__init__`.  See that method for a description of what the other parameters do.

    ``dataset`` must expose an iterable ``instances`` attribute, and every instance in it must
    implement ``count_vocab_items(counter)``, which is handed a nested
    ``namespace -> token -> count`` mapping to update in place.

    The vocabulary is built through ``cls`` rather than a hard-coded class name, so calling
    this constructor on a subclass yields an instance of that subclass.
    """
    logger.info("Fitting token dictionary from dataset.")
    # Two-level defaultdict: unseen namespaces and unseen tokens both start at a count of 0,
    # so instances can increment entries without initialising them first.
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for instance in tqdm.tqdm(dataset.instances):
        instance.count_vocab_items(namespace_token_counts)
    return cls(counter=namespace_token_counts,
               min_count=min_count,
               max_vocab_size=max_vocab_size,
               non_padded_namespaces=non_padded_namespaces,
               pretrained_files=pretrained_files,
               only_include_pretrained_words=only_include_pretrained_words)
评论列表
文章目录