vocabulary.py 文件源码-python代码片段

def from_dataset(cls,
                 dataset,
                 min_count: int = 1,
                 max_vocab_size: Optional[Union[int, Dict[str, int]]] = None,
                 non_padded_namespaces: Sequence[str] = DEFAULT_NON_PADDED_NAMESPACES,
                 pretrained_files: Optional[Dict[str, str]] = None,
                 only_include_pretrained_words: bool = False) -> 'Vocabulary':
    """
    Constructs a vocabulary given a :class:`.Dataset` and some parameters.

    Every instance in ``dataset.instances`` is asked to add its vocabulary items to a shared
    ``namespace -> token -> count`` mapping (via ``instance.count_vocab_items``).  Those
    counts, together with the remaining parameters, are then passed unchanged to
    :func:`__init__`.  See that method for a description of what the other parameters do.

    Parameters
    ----------
    dataset :
        Object exposing an iterable ``instances`` attribute; each element must provide a
        ``count_vocab_items(counter)`` method that updates the nested counter in place.
    min_count, max_vocab_size, non_padded_namespaces, pretrained_files, only_include_pretrained_words :
        Forwarded as-is to the constructor.

    Returns
    -------
    An instance of ``cls`` built from the collected counts.
    """
    logger.info("Fitting token dictionary from dataset.")

    # Nested defaultdicts let each instance do ``counter[namespace][token] += 1``
    # without having to initialise either level first.
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for instance in tqdm.tqdm(dataset.instances):
        instance.count_vocab_items(namespace_token_counts)

    # Build through ``cls`` rather than a hard-coded class name so that calling this
    # alternate constructor on a subclass yields an instance of that subclass.
    return cls(counter=namespace_token_counts,
               min_count=min_count,
               max_vocab_size=max_vocab_size,
               non_padded_namespaces=non_padded_namespaces,
               pretrained_files=pretrained_files,
               only_include_pretrained_words=only_include_pretrained_words)