chinese_text_processor.py source code

python

Project: DataScience-And-MachineLearning-Handbook-For-Coders  Author: wxyyxc1992
import jieba
import logging

# Note: jieba_dictionary and jieba_stopwords are assumed to be module-level
# paths to the custom jieba dictionary and the stopword file; they are not
# shown in this snippet.

def tokenize_file(self, text_path, text_output_path='./tokenized_texts.txt'):
    """Tokenize the file at text_path with jieba and write the result to text_output_path."""

    # jieba custom setting.
    jieba.set_dictionary(jieba_dictionary)

    # load stopwords set
    stopwordset = set()
    with open(jieba_stopwords, 'r', encoding='utf-8') as sw:
        for line in sw:
            stopwordset.add(line.strip('\n'))

    # line counter
    texts_num = 0

    # output file (UTF-8 so Chinese text round-trips on any platform)
    output = open(text_output_path, 'w', encoding='utf-8')

    # read the input file line by line
    with open(text_path, 'r', encoding='utf-8') as content:
        for line in content:
            line = line.strip('\n')

            # jieba word segmentation (precise mode)
            words = jieba.cut(line, cut_all=False)
            for word in words:
                if word not in stopwordset:
                    output.write(word + ' ')

            output.write('\n')

            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("tokenized %d lines so far" % texts_num)
    output.close()
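
For context, a minimal usage sketch. The enclosing class name ChineseTextProcessor and the input file name corpus.txt are assumptions for illustration; the snippet above only shows the method itself:

python

# Hypothetical usage: class name and file paths are assumptions,
# not shown in the original snippet.
import logging

logging.basicConfig(level=logging.INFO)

processor = ChineseTextProcessor()  # assumed enclosing class
processor.tokenize_file('corpus.txt', text_output_path='tokenized_texts.txt')

Each input line is written out as space-separated tokens with stopwords removed, one line per input line, which is the plain-text format word-embedding tools such as gensim's word2vec expect.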