def tokenize_file(self, text_path, text_output_path='./tokenized_texts.txt'):
    """
    Tokenize a text file with jieba and write the segmented result to text_output_path.
    """
    # jieba custom setting.
    jieba.set_dictionary(jieba_dictionary)
    # Load the stopword set.
    stopwordset = set()
    with open(jieba_stopwords, 'r', encoding='utf-8') as sw:
        for line in sw:
            stopwordset.add(line.strip('\n'))
    # Line counter.
    texts_num = 0
    # Open the output file.
    output = open(text_output_path, 'w', encoding='utf-8')
    # Read the input file line by line.
    with open(text_path, 'r', encoding='utf-8') as content:
        for line in content:
            line = line.strip('\n')
            # Segment the line into words.
            words = jieba.cut(line, cut_all=False)
            for word in words:
                if word not in stopwordset:
                    output.write(word + ' ')
            output.write('\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Tokenized %d lines so far" % texts_num)
    output.close()
Source: chinese_text_processor.py
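A minimal usage sketch follows, assuming the method above lives in a class named ChineseTextProcessor inside chinese_text_processor.py, that the module imports jieba and logging at the top, and that jieba_dictionary and jieba_stopwords are module-level paths to the custom dictionary and stopword list; the class name and file paths here are assumptions, not part of the original source.

import logging

# Assumed module/class name; the original snippet only shows the method body.
from chinese_text_processor import ChineseTextProcessor

# Surface the periodic progress messages emitted via logging.info.
logging.basicConfig(level=logging.INFO)

processor = ChineseTextProcessor()
# './corpus.txt' is a placeholder input file; output defaults to './tokenized_texts.txt'.
processor.tokenize_file('./corpus.txt', text_output_path='./tokenized_texts.txt')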