def __init__(self,
             source_lang: str, target_lang: str,
             key: Any=None,
             vocabulary: FrozenSet[str]=None,
             observations: int=None,
             validate: bool=False,
             name: str='unamed',
             tqdm: bool=True,
             **kwargs) -> None:
    """Prepare corpus properties and encoding, then initialize the parent.

    Args:
        source_lang: Stored as ``self.source_lang``.
        target_lang: Stored as ``self.target_lang``.
        key: Forwarded to ``_fetch_corpus_properties``.
        vocabulary: Vocabulary to expose through ``self.properties``. When
            ``None``, the vocabulary returned by
            ``_fetch_corpus_properties`` is used instead.
        observations: Forwarded to ``_fetch_corpus_properties``.
        validate: When true, ``_validate_corpus_properties(name)`` is called
            before the parent initializer runs.
        name: Forwarded to ``_fetch_corpus_properties``,
            ``_validate_corpus_properties`` and the parent initializer.
        tqdm: Stored as ``self._show_tqdm`` and forwarded to the parent
            initializer.
        **kwargs: Forwarded unchanged to the parent initializer.

    Raises:
        ValueError: If the vocabulary contains one of the reserved
            characters checked below.
    """
    # Stored first, before any helper method is invoked.
    self._show_tqdm = tqdm

    # Fetch the computed properties (vocabulary and histogram).
    fetched = self._fetch_corpus_properties(
        name, key, observations=observations
    )

    # An explicitly supplied vocabulary takes precedence over the computed one.
    vocabulary = fetched.vocabulary if vocabulary is None else vocabulary

    # Publish the properties; the histogram always comes from the fetch.
    self.properties = CorpusProperties(
        vocabulary=vocabulary,
        histogram=fetched.histogram
    )

    # Reserved characters must not appear in the vocabulary. They are
    # checked in this order and the first hit raises.
    # http://unicode-search.net/unicode-namesearch.pl
    # NOTE(review): the '?' and '' literals look like placeholders for
    # special Unicode characters that may have been lost in transcription
    # -- confirm against the upstream source.
    reserved_chars = (
        ('^', 'a eos char (^) was found in the vocabulary'),
        ('?', 'a null char (?) was found in the vocabulary'),
        ('', 'an invalid char () was found in the vocabulary'),
    )
    for char, message in reserved_chars:
        if char in vocabulary:
            raise ValueError(message)

    # Build the encoding schema; this runs after self.properties is set.
    self._setup_encoding()

    # Language properties.
    self.source_lang = source_lang
    self.target_lang = target_lang

    # Optional validation of the corpus properties.
    if validate:
        self._validate_corpus_properties(name)

    # Hand over to the parent initializer to set up the pipeline.
    # presumably self._encode_dtype is provided by _setup_encoding -- verify.
    super().__init__(
        histogram=self.properties.histogram,
        dtype=self._encode_dtype,
        name=name,
        tqdm=tqdm,
        **kwargs
    )
评论列表
文章目录