def idf(tf_dic_list,global_idf_dic,silent=1):
    """
    Build a {word: idf} dictionary for every document in tf_dic_list.

    For a word that is not yet in global_idf_dic the value is computed as
    log(number_of_documents / (1.0 + number_of_documents_containing_word))
    and cached in global_idf_dic; words already present there keep their
    existing value.

    Input:
        tf_dic_list    = [{word: tf}, ...]  # one container of words per document
        global_idf_dic = {}  # word -> idf, which may be updated in place
        silent         = 0 to print progress messages, anything else to stay quiet

    Output:
        idf_dic_list   = [{word: idf}, ...]  # same order as tf_dic_list

    Note: `log` is resolved from the enclosing module (presumably math.log or
    an equivalent -- confirm against the file's imports).
    """
    from collections import Counter
    from itertools import chain

    if silent==0:
        print("idf ...")
    doc_len = len(tf_dic_list)
    # Document frequencies are built lazily, and only once, the first time an
    # unknown word shows up; rescanning every document per new word would be
    # quadratic in the corpus size.
    doc_freq = None
    # Pre-bind so the progress message cannot hit an undefined name when every
    # document processed so far was empty.
    word = None
    idf_dic_list = [] # [{word:idf} for each sample]
    for c,tf_dic in enumerate(tf_dic_list):
        idf_dic = {}
        for word in tf_dic:
            if word not in global_idf_dic:
                if doc_freq is None:
                    # set() so each document counts a word at most once, even
                    # if a document is not a dict.
                    doc_freq = Counter(chain.from_iterable(set(doc) for doc in tf_dic_list))
                # The 1.0 keeps the division in floating point and smooths the
                # denominator.
                global_idf_dic[word] = log(doc_len/(1.0+doc_freq[word]))
            idf_dic[word] = global_idf_dic[word]
        idf_dic_list.append(idf_dic)
        if silent == 0 and c>0 and c%100 == 0:
            # `word` is the most recently visited word (None if there was none).
            print("{} documents done, total {}, word {}, idf {}".format(c,len(tf_dic_list),word,global_idf_dic.get(word)))
    return idf_dic_list
评论列表
文章目录