def get_feature_words(news_folder, size=1000, stopwords_file="stopwords.txt"):
    """Build the feature vocabulary from a folder of categorized news files.

    Every immediate subdirectory of ``news_folder`` is treated as one news
    class, and every regular file inside such a subdirectory as one
    document::

        news_folder/
            class_a/
            class_b/
            class_c/

    Each document is segmented with ``jieba.lcut``. A token is counted only
    when it is longer than one character, does not start with an ASCII
    letter or digit, and is not contained in the stop words returned by
    ``get_stopwords(stopwords_file)``.

    Args:
        news_folder: Path of the root folder holding one subfolder per class.
        size: Upper bound on the number of words kept; the most frequent
            words are kept first.
        stopwords_file: Path handed to ``get_stopwords``.

    Returns:
        A set with at most ``size`` of the most frequent surviving words.
        An empty set is returned when no word survives the filters (for
        example when the folder contains no documents).
    """
    # Local import: the file's import block is not visible from this
    # function, so the dependency is brought into scope here.
    from collections import Counter

    news_classes = [
        subfolder for subfolder in os.listdir(news_folder)
        if os.path.isdir(os.path.join(news_folder, subfolder))
    ]
    stopwords = get_stopwords(stopwords_file)

    # Compiled once instead of being looked up for every token.
    starts_with_ascii_alnum = re.compile("[a-z0-9A-Z]").match
    word_counts = Counter()

    # Turn on jieba's parallel segmentation (4 processes) while the corpus
    # is tokenized. The ``finally`` clause guarantees the global switch is
    # turned off again even if reading or segmenting a file fails.
    jieba.enable_parallel(4)
    try:
        for news_class in news_classes:
            subfolder = os.path.join(news_folder, news_class)
            news_list = [
                os.path.join(subfolder, news) for news in os.listdir(subfolder)
                if os.path.isfile(os.path.join(subfolder, news))
            ]
            for news in news_list:
                # NOTE(review): no explicit encoding is given, so decoding
                # depends on the platform default -- confirm the corpus
                # encoding before pinning one here.
                with open(news, 'r') as f:
                    content = f.read()
                for word in jieba.lcut(content):
                    if (len(word) > 1
                            and not starts_with_ascii_alnum(word)
                            and word not in stopwords):
                        word_counts[word] += 1
    finally:
        jieba.disable_parallel()

    # Stable sort by descending frequency; ties keep first-seen order.
    ranked = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    # Slicing copes with both "fewer than size words" and "no words at all",
    # where the previous zip(*...)[0] unpacking raised IndexError.
    return {word for word, _ in ranked[:size]}
Multinomial_NaiveBayes_News_Classifier.py 文件源码
python
阅读 25
收藏 0
点赞 0
评论 0
评论列表
文章目录