def gen_idf_file(self):
    """Compute inverse document frequencies for the corpus and write them out.

    Every entry returned by ``listdir(self.doc_dir_path)`` is parsed as XML.
    The text of its ``title`` and ``body`` elements is segmented with jieba,
    tokens present in ``self.stop_words`` are dropped, and the remaining
    tokens are stripped and lower-cased.  Empty tokens and tokens for which
    ``self.is_number`` returns a true value are skipped.

    For each surviving term the number of documents containing it (df) is
    counted, and one line ``"<term> <log(N / df)>"`` (nine decimals) is
    written to ``self.idf_path`` in UTF-8, where N is the number of files.

    Returns:
        None.  The result is the file written at ``self.idf_path``.
    """
    # Local import: only ``listdir`` is known to be in scope at file level.
    from os.path import join

    files = listdir(self.doc_dir_path)
    n = float(len(files))
    doc_freq = {}
    for name in files:
        # join() works whether or not doc_dir_path ends with a separator.
        root = ET.parse(join(self.doc_dir_path, name)).getroot()
        # findtext() yields '' for a missing or empty element instead of
        # None, so the concatenation below cannot raise TypeError.
        title = root.findtext('title', default='')
        body = root.findtext('body', default='')
        # NOTE(review): '?' separates title and body so their edge tokens do
        # not merge; confirm this is the intended separator character.
        tokens = set(jieba.lcut(title + '?' + body, cut_all=False))
        tokens -= self.stop_words

        # Normalise first, then de-duplicate: a term must count at most once
        # per document, even if it appeared in several case/whitespace
        # variants.  Otherwise df could exceed N and the IDF turn negative.
        terms = set()
        for token in tokens:
            term = token.strip().lower()
            if term == '' or self.is_number(term):
                continue
            terms.add(term)

        for term in terms:
            doc_freq[term] = doc_freq.get(term, 0) + 1

    # The context manager closes the file even if a write fails midway.
    with open(self.idf_path, 'w', encoding='utf-8') as idf_file:
        for term, df in doc_freq.items():
            idf_file.write('%s %.9f\n' % (term, math.log(n / df)))
评论列表
文章目录