def _prepare_corpora(self, corpora, bpe_encoder, src_vocab, trg_vocab):
    """Encode parallel corpora into index sequences ordered by source length.

    Every (source, target) pair read from ``corpora`` is encoded with
    ``bpe_encoder``; pairs where either side encodes to zero tokens are
    counted as ignored and dropped. The remaining pairs are converted with
    ``src_vocab.convertToIdx`` / ``trg_vocab.convertToIdx``, shuffled, and
    then sorted by the number of encoded source tokens (ascending, the
    ``torch.sort`` default).

    Args:
        corpora: iterable of objects exposing ``reader(langs)``, a context
            manager yielding an iterable of ``(source, target)`` pairs.
        bpe_encoder: object exposing ``encode_line(line, is_source=...)``.
        src_vocab: source vocabulary exposing ``convertToIdx``.
        trg_vocab: target vocabulary exposing ``convertToIdx``.

    Returns:
        A ``(src, trg)`` tuple of two parallel lists holding the values
        returned by ``convertToIdx`` for each kept sentence pair.
    """
    src, trg = [], []
    sizes = []
    count, ignored = 0, 0

    for corpus in corpora:
        with corpus.reader([self._source_lang, self._target_lang]) as reader:
            for source, target in reader:
                src_words = bpe_encoder.encode_line(source, is_source=True)
                trg_words = bpe_encoder.encode_line(target, is_source=False)

                if len(src_words) > 0 and len(trg_words) > 0:
                    src.append(src_vocab.convertToIdx(src_words,
                                                      onmt.Constants.UNK_WORD))
                    # Only the target side is wrapped in BOS/EOS markers.
                    trg.append(trg_vocab.convertToIdx(trg_words,
                                                      onmt.Constants.UNK_WORD,
                                                      onmt.Constants.BOS_WORD,
                                                      onmt.Constants.EOS_WORD))
                    sizes.append(len(src_words))
                else:
                    ignored += 1

                # Progress counts every pair read, kept or ignored.
                count += 1
                if count % 100000 == 0:
                    self._logger.info(' %d sentences prepared' % count)

    # With nothing to reorder there is no reason to call into torch at all;
    # the log lines are still emitted so the output stays the same.
    has_sentences = len(src) > 0

    # Shuffle first so that sentences of equal length end up in random
    # relative order after the sort below. Uses torch's global RNG.
    self._logger.info('Shuffling sentences')
    if has_sentences:
        # .tolist() yields plain ints: iterating the tensor directly would
        # produce one tensor object per element, which is slow for large
        # corpora when used as a list index.
        perm = torch.randperm(len(src)).tolist()
        src = [src[idx] for idx in perm]
        trg = [trg[idx] for idx in perm]
        sizes = [sizes[idx] for idx in perm]

    self._logger.info('Sorting sentences by size')
    if has_sentences:
        _, perm = torch.sort(torch.Tensor(sizes))
        perm = perm.tolist()
        src = [src[idx] for idx in perm]
        trg = [trg[idx] for idx in perm]

    self._logger.info('Prepared %d sentences (%d ignored due to length == 0)' % (len(src), ignored))

    return src, trg
评论列表
文章目录