def wikiToTxt(self):
    """Dump the Chinese Wikipedia articles to a plain-text file.

    Reads ``./build/zhwiki-latest-pages-articles.xml.bz2`` through gensim's
    ``WikiCorpus`` and writes every article to ``./build/wiki_texts.txt``
    (UTF-8), one article per line with its tokens separated by single
    spaces. A progress message is logged after every 10000 articles.
    """
    # This function takes about 25 minutes
    from gensim.corpora import WikiCorpus

    # NOTE(review): the empty ``dictionary`` is presumably passed so that
    # WikiCorpus does not scan the whole dump to build a vocabulary first.
    wiki_corpus = WikiCorpus('./build/zhwiki-latest-pages-articles.xml.bz2', dictionary={})

    with open('./build/wiki_texts.txt', 'w', encoding='utf-8') as output:
        for texts_num, tokens in enumerate(wiki_corpus.get_texts(), start=1):
            # Depending on the gensim release, tokens arrive either as UTF-8
            # encoded ``bytes`` or as ``str``; normalise to ``str`` so the
            # join works in both cases instead of raising TypeError.
            line = ' '.join(
                tok.decode('utf-8') if isinstance(tok, bytes) else tok
                for tok in tokens
            )
            output.write(line + '\n')
            if texts_num % 10000 == 0:
                logging.info("??? %d ???", texts_num)
评论列表
文章目录