def get_probability(news_folder, feature_words):
    """Estimate Bernoulli naive-Bayes parameters from a folder of news files.

    Every sub-directory of ``news_folder`` is treated as one class and every
    regular file inside it as one document of that class::

        news_folder/
            class_a/
            class_b/
            class_c/

    Each document is tokenised with ``jieba.lcut``. For every feature word
    the number of documents of the class that contain it at least once is
    counted, then smoothed as ``(count + 1) / (docs_in_class + 2)``.

    Args:
        news_folder: Path of the directory holding one sub-directory per
            class.
        feature_words: Iterable of words used as features (the index of the
            returned matrix).

    Returns:
        A ``(prob_matrix, prob_classes)`` tuple.

        * ``prob_matrix`` is a ``pandas.DataFrame`` indexed by
          ``feature_words`` with one column per class; each cell holds the
          smoothed fraction of that class's documents containing the word.
        * ``prob_classes`` maps each class name to the fraction of all
          documents that belong to it. If no document is found at all,
          every class maps to ``0.0``.
    """
    news_classes = [
        subfolder for subfolder in os.listdir(news_folder)
        if os.path.isdir(os.path.join(news_folder, subfolder))
    ]
    prob_matrix = pd.DataFrame(index=feature_words, columns=news_classes)
    class_counts = {cls: 0 for cls in news_classes}
    num_of_all_news = 0

    # Switch jieba to parallel mode (4 processes) for the tokenising below;
    # the ``finally`` guarantees it is switched back off even when reading
    # or tokenising a file fails.
    jieba.enable_parallel(4)
    try:
        for news_class in news_classes:
            # Start every count at 1: the "+1" of the add-one smoothing.
            prob_count = {word: 1 for word in feature_words}

            subfolder = os.path.join(news_folder, news_class)
            news_list = [
                os.path.join(subfolder, news) for news in os.listdir(subfolder)
                if os.path.isfile(os.path.join(subfolder, news))
            ]

            for news in news_list:
                # NOTE(review): the file is decoded with the platform default
                # encoding, exactly as before -- confirm the corpus encoding
                # and pass ``encoding=`` explicitly if it is known.
                with open(news, 'r') as f:
                    content = f.read()
                # Only presence matters, so a set gives O(1) membership
                # tests instead of scanning the token list for every word.
                words_in_news = set(jieba.lcut(content))
                for word in prob_count:
                    if word in words_in_news:
                        prob_count[word] += 1

            news_nums = len(news_list)
            num_of_all_news += news_nums
            class_counts[news_class] = news_nums

            # "+2" in the denominator: one pseudo-document with the word and
            # one without it, matching the initial count of 1 above.
            for word, count in prob_count.items():
                prob_matrix.loc[word, news_class] = count / (news_nums + 2)
    finally:
        jieba.disable_parallel()

    # Class priors; guard against an empty corpus to avoid dividing by zero.
    prob_classes = {
        cls: (count / num_of_all_news if num_of_all_news else 0.0)
        for cls, count in class_counts.items()
    }
    return prob_matrix, prob_classes
Bernoulli_NaiveBayes_News_Classifier.py 文件源码
python
阅读 22
收藏 0
点赞 0
评论 0
评论列表
文章目录