wiki_to_txt.py 文件源码

python
阅读 17 收藏 0 点赞 0 评论 0

项目:Word2vec 作者: Alex-CHUN-YU 项目源码 文件源码
def set_wiki_to_txt(self, wiki_data_path=None):
    """Convert a Wikipedia dump into a plain-text file, one article per line.

    Each text yielded by ``WikiCorpus.get_texts()`` is written to
    ``wiki_text.txt`` (in the current working directory) as a single line
    of space-separated tokens, encoded as UTF-8.

    Args:
        wiki_data_path: Path handed to ``WikiCorpus``. When ``None``, the
            path is taken from ``sys.argv[1]``; if the command line does
            not carry exactly one argument, a usage message is printed
            and the process exits.

    Raises:
        SystemExit: If ``wiki_data_path`` is ``None`` and ``sys.argv``
            does not contain exactly one argument after the script name.
    """
    if wiki_data_path is None:
        # Fall back to the command-line argument when no path is passed in.
        if len(sys.argv) != 2:
            print("Please Usage: python3 " + sys.argv[0] + " wiki_data_path")
            # sys.exit() instead of the site-provided exit(), which is not
            # guaranteed to be available (e.g. under `python -S`).
            sys.exit()
        wiki_data_path = sys.argv[1]

    # An empty dictionary is passed, exactly as the original code did.
    # NOTE(review): presumably this skips gensim's vocabulary-building pass
    # -- confirm against the gensim version in use.
    wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})

    # wiki.xml convert to wiki.txt
    with open("wiki_text.txt", 'w', encoding='utf-8') as output:
        text_count = 0
        for text in wiki_corpus.get_texts():
            # Tokens may be bytes or str depending on the gensim release;
            # normalise to str so the join works in both cases.
            tokens = [
                token.decode('utf-8') if isinstance(token, bytes) else token
                for token in text
            ]
            output.write(' '.join(tokens) + '\n')
            text_count += 1
            if text_count % 10000 == 0:
                # Progress report every 10,000 texts (lazy %-formatting).
                logging.info("????? %d ???", text_count)
        print("????!")
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号