import os

from gensim import corpora, models

# path_doc_root, path_dictionary, path_tmp_tfidf, os_path.LoadFiles,
# convert_doc_to_wordlist and svm_lsi are expected to be defined elsewhere
# in this project.


def reduce_tfidf(dictionary, weibo_test):
    corpus_tfidf = None
    # Stage 2: convert the segmented documents into tfidf vectors
    if not os.path.exists(path_tmp_tfidf):
        print('=== no tfidf folder detected, generating tfidf vectors ===')
        # if stage 1 (dictionary building) was skipped, load the dictionary from disk
        if not dictionary:
            dictionary = corpora.Dictionary.load(path_dictionary)
        os.makedirs(path_tmp_tfidf)
        files = os_path.LoadFiles(path_doc_root)
        tfidf_model = models.TfidfModel(dictionary=dictionary)
        corpus_tfidf = {}
        for msg in files:
            catg = msg[0]
            file = msg[1]
            word_list = convert_doc_to_wordlist(file, cut_all=False)
            file_bow = dictionary.doc2bow(word_list)
            file_tfidf = tfidf_model[file_bow]
            # group tfidf vectors by category; appending to the cached list
            # updates the dict entry in place, so only assign on first use
            tmp = corpus_tfidf.get(catg, [])
            tmp.append(file_tfidf)
            if len(tmp) == 1:
                corpus_tfidf[catg] = tmp
        # persist the tfidf corpus of each category to disk
        catgs = list(corpus_tfidf.keys())
        for catg in catgs:
            corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg),
                                       corpus_tfidf.get(catg),
                                       id2word=dictionary)
            print('catg {c} has been transformed into tfidf vector'.format(c=catg))
        print('=== tfidf vectors generated ===')
    else:
        print('=== existing tfidf vectors detected, skipping this stage ===')
    # hand off to the LSI + SVM stage; corpus_tfidf stays None when the
    # vectors already exist on disk and were not rebuilt in this run
    svm_lsi.reduce_lsi(dictionary, corpus_tfidf, weibo_test)
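For reference, below is a minimal driver sketch showing how reduce_tfidf() might be wired up. The directory layout, file names and the test string are assumptions made for illustration only; just reduce_tfidf and the gensim calls come from the code above.

# A minimal driver sketch, assuming a project layout like the one used above.
# path_doc_root, path_tmp, path_dictionary and path_tmp_tfidf are hypothetical
# values shown only to illustrate the module-level configuration this code expects.
import os

from gensim import corpora

path_doc_root = './data/corpus'          # hypothetical: one sub-folder per category
path_tmp = './data/tmp'                  # hypothetical working directory
path_dictionary = os.path.join(path_tmp, 'corpus.dict')
path_tmp_tfidf = os.path.join(path_tmp, 'tfidf_corpus')

if __name__ == '__main__':
    # reuse the dictionary from stage 1 if it is already on disk,
    # otherwise reduce_tfidf() will load it itself when rebuilding tfidf
    dictionary = None
    if os.path.exists(path_dictionary):
        dictionary = corpora.Dictionary.load(path_dictionary)
    weibo_test = 'a raw weibo post to classify in the later SVM stage'  # hypothetical input
    reduce_tfidf(dictionary, weibo_test)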