def merge_TFIDF(N, vocab_path, TF_DF_prefix, total_doc):
    """Merge N per-shard TF/DF pickles and write the combined vocabulary.

    For each ``i`` in ``range(N)`` the files ``<TF_DF_prefix><i>TF.pkl`` and
    ``<TF_DF_prefix><i>DF.pkl`` are unpickled and folded into running ``TF``
    and ``DF`` dicts via ``merge_dict``.  The merged dicts are then pickled
    to ``<TF_DF_prefix>TF.pkl`` and ``<TF_DF_prefix>DF.pkl``, and a copy of
    the merged TF dict is handed to ``save_vocab_txt`` together with the
    path ``<vocab_path>_tfidf.txt``.

    NOTE: the TF-IDF weighting step is currently disabled (see the comment
    near the end of the function), so the ``_tfidf.txt`` file receives the
    raw merged TF values, not TF-IDF scores.

    Args:
        N: number of shards to merge; shard indices run from 0 to N - 1.
        vocab_path: path prefix for the text vocabulary output.
        TF_DF_prefix: path prefix shared by the per-shard input pickles and
            the merged output pickles.
        total_doc: currently unused; it is only referenced by the disabled
            IDF weighting.

    Raises:
        IOError / OSError: if a shard pickle is missing or an output file
            cannot be written.
    """
    t0 = time.time()
    TF = {}
    DF = {}
    # Tiny non-zero start value; presumably guards the (disabled) division
    # by total_TF against a zero denominator -- TODO confirm.
    total_TF = 1e-10
    for i in range(N):
        # Pickle streams must be read in binary mode, and `with` closes each
        # handle promptly instead of leaving it to the garbage collector.
        # NOTE(review): pickle.load can execute arbitrary code -- the shard
        # files are assumed to come from a trusted producer.
        with open(TF_DF_prefix + str(i) + 'TF.pkl', 'rb') as tf_file:
            new_TF = pk.load(tf_file)
        with open(TF_DF_prefix + str(i) + 'DF.pkl', 'rb') as df_file:
            new_DF = pk.load(df_file)
        # merge_dict is defined elsewhere; its return value is accumulated
        # as a number here (presumably the term count it added -- verify).
        total_TF += merge_dict(TF, new_TF)
        merge_dict(DF, new_DF)
        t = time.time() - t0
        # Single-argument parenthesised form prints the same text on
        # Python 2 and Python 3.
        print('%d / %d merged. time %fs' % (i + 1, N, t))
    # Binary mode for writing as well, so the output is flushed and closed
    # before the function moves on.
    with open(TF_DF_prefix + 'TF.pkl', 'wb') as tf_out:
        pk.dump(TF, tf_out)
    with open(TF_DF_prefix + 'DF.pkl', 'wb') as df_out:
        pk.dump(DF, df_out)
    TFIDF = TF.copy()
    # Weighting intentionally left disabled, as in the original source:
    # for word, value in TFIDF.iteritems():
    #     TFIDF[word] = TF[word] * 1.0 / total_TF * np.log2(total_doc*1.0/DF[word])
    save_vocab_txt(TFIDF, vocab_path + '_tfidf.txt')
merge_vocab.py 文件源码
python
阅读 17
收藏 0
点赞 0
评论 0
评论列表
文章目录