neural.py 文件源码

python
阅读 28 收藏 0 点赞 0 评论 0

项目:NeuralMT 作者: hlt-mt 项目源码 文件源码
def _prepare_corpora(self, corpora, bpe_encoder, src_vocab, trg_vocab):
        src, trg = [], []
        sizes = []
        count, ignored = 0, 0

        for corpus in corpora:
            with corpus.reader([self._source_lang, self._target_lang]) as reader:
                for source, target in reader:
                    src_words = bpe_encoder.encode_line(source, is_source=True)
                    trg_words = bpe_encoder.encode_line(target, is_source=False)

                    if len(src_words) > 0 and len(trg_words) > 0:
                        src.append(src_vocab.convertToIdx(src_words,
                                                          onmt.Constants.UNK_WORD))
                        trg.append(trg_vocab.convertToIdx(trg_words,
                                                          onmt.Constants.UNK_WORD,
                                                          onmt.Constants.BOS_WORD,
                                                          onmt.Constants.EOS_WORD))
                        sizes.append(len(src_words))
                    else:
                        ignored += 1

                    count += 1
                    if count % 100000 == 0:
                        self._logger.info(' %d sentences prepared' % count)

        self._logger.info('Shuffling sentences')
        perm = torch.randperm(len(src))
        src = [src[idx] for idx in perm]
        trg = [trg[idx] for idx in perm]
        sizes = [sizes[idx] for idx in perm]

        self._logger.info('Sorting sentences by size')
        _, perm = torch.sort(torch.Tensor(sizes))
        src = [src[idx] for idx in perm]
        trg = [trg[idx] for idx in perm]

        self._logger.info('Prepared %d sentences (%d ignored due to length == 0)' % (len(src), ignored))

        return src, trg
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号