def set_wiki_to_txt(self, wiki_data_path=None):
    """Convert a Wikipedia dump into a plain-text file, one article per line.

    Each article yielded by ``WikiCorpus.get_texts()`` is written to
    ``wiki_text.txt`` in the current working directory as a single line of
    space-separated tokens, encoded as UTF-8.

    Args:
        wiki_data_path: Path of the Wikipedia dump to read. When ``None``,
            the path is taken from ``sys.argv[1]``; the process must then
            have been started with exactly one command-line argument.

    Raises:
        SystemExit: If ``wiki_data_path`` is ``None`` and the command line
            does not carry exactly one argument. A usage message is printed
            first and the exit status is 1.
    """
    if wiki_data_path is None:
        # No explicit path: fall back to the single command-line argument.
        if len(sys.argv) != 2:
            print("Please Usage: python3 " + sys.argv[0] + " wiki_data_path")
            # sys.exit instead of the site-provided exit(), which is not
            # guaranteed to exist; non-zero status signals the usage error.
            sys.exit(1)
        wiki_data_path = sys.argv[1]

    # NOTE(review): the empty dictionary is presumably passed so WikiCorpus
    # does not build its own vocabulary up front -- confirm against the
    # gensim version in use.
    wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})

    # wiki.xml convert to wiki.txt
    with open("wiki_text.txt", 'w', encoding='utf-8') as output:
        for text_count, text in enumerate(wiki_corpus.get_texts(), 1):
            # Tokens may arrive as bytes or as str depending on the gensim /
            # Python combination; normalise to str so the join cannot raise
            # TypeError on mixed or already-decoded tokens.
            tokens = [
                token.decode('utf-8') if isinstance(token, bytes) else token
                for token in text
            ]
            output.write(' '.join(tokens) + '\n')
            if text_count % 10000 == 0:
                # Progress report every 10,000 articles (lazy %-formatting).
                logging.info("????? %d ???", text_count)
    # Reported only after the output file has been closed and flushed.
    print("????!")
评论列表
文章目录