def wiki2texts(self, wiki_data_path, wiki_texts_path='./wiki_texts.txt'):
    """Dump the articles of a Wikipedia archive into a plain-text file.

    Each article yielded by ``WikiCorpus.get_texts()`` is written as one
    line of space-separated tokens, encoded as UTF-8.

    Arguments:
        wiki_data_path -- path to the Wikipedia dump handed to ``WikiCorpus``.
        wiki_texts_path -- path of the text file to create (overwritten if
            it already exists).

    Raises:
        SystemExit -- when ``wiki_data_path`` is empty or ``None``.

    NOTE(review): the user-facing message strings below appear mis-encoded
    ('?' placeholders) in this copy of the file; they are kept byte-for-byte
    and should be restored from the original source.
    """
    if not wiki_data_path:
        print("??? Wiki ?????????? https://dumps.wikimedia.org/zhwiki/ ??")
        # `exit()` is an interactive helper injected by the `site` module and
        # is not guaranteed to exist; raising SystemExit directly has the
        # same effect (and the same exit status) everywhere.
        raise SystemExit

    # An empty dictionary is passed so WikiCorpus does not build its own
    # vocabulary from the dump before we iterate over it.
    wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})

    with open(wiki_texts_path, 'w', encoding='utf-8') as output:
        for texts_num, text in enumerate(wiki_corpus.get_texts(), start=1):
            # Tokens may be bytes or str depending on the corpus
            # implementation, so normalise to str before joining.
            tokens = [
                token.decode('utf-8') if isinstance(token, bytes) else token
                for token in text
            ]
            output.write(' '.join(tokens) + '\n')
            if texts_num % 10000 == 0:
                # Progress report every 10,000 articles.
                logging.info("??? %d ???", texts_num)

    print("???????? OpenCC ??????")
wiki.py 文件源码
python
阅读 20
收藏 0
点赞 0
评论 0
评论列表
文章目录