def _collect_news_files(news_folder):
    """Return ``(file_path, news_class)`` pairs for every article file.

    Only directories directly under ``news_folder`` are treated as
    categories, and only regular files directly inside a category are
    returned. Both levels are sorted so the result is reproducible
    (``os.listdir`` makes no ordering guarantee).
    """
    labelled_files = []
    for news_class in sorted(os.listdir(news_folder)):
        class_dir = os.path.join(news_folder, news_class)
        if not os.path.isdir(class_dir):
            continue
        for file_name in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file_name)
            if os.path.isfile(file_path):
                labelled_files.append((file_path, news_class))
    return labelled_files


def words_extract(news_folder, encoding=None):
    """Tokenize every news article under ``news_folder`` with jieba.

    The folder is expected to contain one sub-directory per news category,
    each holding the text files that belong to that category::

        news_folder/
            <category A>/
            <category B>/
            <category C>/

    Args:
        news_folder: Path of the root directory laid out as shown above.
        encoding: Encoding handed to ``open`` for every article. The default
            ``None`` keeps the platform's locale encoding (the historical
            behaviour); pass e.g. ``"utf-8"`` to read the corpus the same way
            on every machine.

    Returns:
        A list of ``(word_list, news_class)`` tuples, one per article, where
        ``word_list`` is the list returned by ``jieba.lcut`` and
        ``news_class`` is the name of the category sub-directory. Entries are
        ordered by category name, then by file name. An empty list is
        returned when no article files are found.

    Raises:
        OSError: If ``news_folder`` or one of the articles cannot be read.
        UnicodeDecodeError: If an article cannot be decoded with ``encoding``.
    """
    labelled_files = _collect_news_files(news_folder)
    if not labelled_files:
        # Nothing to tokenize: do not spin up jieba's worker pool for nothing.
        return []

    parallel_enabled = True
    try:
        jieba.enable_parallel(4)
    except NotImplementedError:
        # jieba only offers parallel mode on POSIX systems; tokenize serially
        # elsewhere instead of failing.
        parallel_enabled = False

    data_list = []  # element: ([word1, word2, ...], news_class)
    try:
        for file_path, news_class in labelled_files:
            with open(file_path, 'r', encoding=encoding) as f:
                content = f.read()
            # NOTE(review): confirm that jieba.lcut is actually routed through
            # the parallel pool; enable_parallel may only swap jieba.cut /
            # jieba.cut_for_search - TODO verify against the installed jieba.
            word_list = jieba.lcut(content)
            data_list.append((word_list, news_class))
    finally:
        # Always restore jieba's global state, even when a file fails to
        # open or decode part-way through the corpus.
        if parallel_enabled:
            jieba.disable_parallel()
    return data_list
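# 评论列表 (comment list) - leftover web-page navigation text, not code
# 文章目录 (table of contents) - leftover web-page navigation text, not code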