def __iter__(self) -> Iterator[Tuple[str, str]]:
    """Yield detokenized (source, target) sentence pairs from comtrans.

    Aligned sentences are read via ``comtrans.aligned_sents`` for the
    identifier returned by ``self._comtrans_string()``, passed through
    ``self._comtrans_maybe_swap`` and detokenized with one
    ``MosesDetokenizer`` per language. Only pairs accepted by
    ``self._length_checker`` are yielded.

    NOTE(review): the original indentation was lost; this assumes the
    whole body, including the yielding loop, sits inside the ``NLTKEnv``
    context -- confirm against the upstream file.
    """
    with NLTKEnv() as env:
        # Fetch the required NLTK resources in the original order.
        for resource in ('perluniprops', 'comtrans'):
            env.download(resource)

        # Kept as local imports placed after the downloads, as originally
        # written.
        from nltk.corpus import comtrans
        from nltk.tokenize.moses import MosesDetokenizer

        aligned = comtrans.aligned_sents(self._comtrans_string())
        src_detok = MosesDetokenizer(lang=self._source_lang)
        tgt_detok = MosesDetokenizer(lang=self._target_lang)

        for src_tokens, tgt_tokens in self._comtrans_maybe_swap(aligned):
            src_text = src_detok.detokenize(src_tokens, return_str=True)
            tgt_text = tgt_detok.detokenize(tgt_tokens, return_str=True)
            # Skip pairs rejected by the length check.
            if not self._length_checker(src_text, tgt_text):
                continue
            yield (src_text, tgt_text)
# Web-page residue, not part of the code: "评论列表" (comment list), "文章目录" (table of contents).