# text_dataset.py -- source file, Python code snippet

def __init__(self,
                 source_lang: str, target_lang: str,
                 key: Any=None,
                 vocabulary: FrozenSet[str]=None,
                 observations: int=None,
                 validate: bool=False,
                 name: str='unamed',
                 tqdm: bool=True,
                 **kwargs) -> None:
        """Prepare corpus properties and encoding, then initialise the parent.

        Args:
            source_lang: stored on the instance as ``source_lang``.
            target_lang: stored on the instance as ``target_lang``.
            key: forwarded to ``_fetch_corpus_properties``.
            vocabulary: characters to expose through ``properties``; when
                ``None`` the vocabulary returned by
                ``_fetch_corpus_properties`` is used instead.
            observations: forwarded to ``_fetch_corpus_properties`` as the
                ``observations`` keyword.
            validate: when true, ``_validate_corpus_properties(name)`` is
                called before the parent constructor runs.
            name: forwarded to ``_fetch_corpus_properties``,
                ``_validate_corpus_properties`` and the parent constructor.
            tqdm: kept as ``_show_tqdm`` and forwarded to the parent
                constructor.
            **kwargs: forwarded unchanged to the parent constructor.

        Raises:
            ValueError: if the effective vocabulary contains one of the
                reserved entries checked below.
        """
        # remember the progress-bar preference before any helper is called
        self._show_tqdm = tqdm

        # derive the corpus properties
        corpus_stats = self._fetch_corpus_properties(
            name, key, observations=observations
        )
        # an explicitly supplied vocabulary wins over the derived one
        vocabulary = (
            corpus_stats.vocabulary if vocabulary is None else vocabulary
        )

        # expose the properties publicly; the histogram is always the
        # derived one, even when the vocabulary was supplied by the caller
        self.properties = CorpusProperties(
            vocabulary=vocabulary,
            histogram=corpus_stats.histogram
        )

        # reject vocabularies containing a reserved entry
        # http://unicode-search.net/unicode-namesearch.pl
        # NOTE(review): the '?' and '' entries look like they may have lost
        # a non-ASCII character somewhere in transit -- confirm upstream.
        reserved_entries = (
            ('^', 'a eos char (^) was found in the vocabulary'),
            ('?', 'a null char (?) was found in the vocabulary'),
            ('', 'an invalid char () was found in the vocabulary'),
        )
        for entry, complaint in reserved_entries:
            if entry in vocabulary:
                raise ValueError(complaint)

        # build the encoding schema
        self._setup_encoding()

        # record the language pair
        self.source_lang, self.target_lang = source_lang, target_lang

        # optional consistency check of the corpus properties
        if validate:
            self._validate_corpus_properties(name)

        # finish by initialising the parent class
        super().__init__(
            histogram=self.properties.histogram,
            dtype=self._encode_dtype,
            name=name,
            tqdm=tqdm,
            **kwargs
        )