def TFIDF_result(top_n=100):
    """Rank the test document's terms by TF-IDF weight and save the top ones.

    The corpus is a copy of the documents returned by ``read_handel_list()``
    with the single document returned by ``read_test_list()`` appended as the
    last entry. Term counts are computed with ``CountVectorizer`` and turned
    into TF-IDF weights with ``TfidfTransformer``; the weights of the last
    corpus row (the test document) are then sorted in descending order.

    The leading ``top_n`` entries are written, one ``term score`` pair per
    line (score rounded to 10 decimal places), to the hard-coded
    ``result_tfidf.txt`` path below.

    Args:
        top_n: Maximum number of terms to keep. Defaults to 100. If fewer
            terms are available, all of them are kept.

    Returns:
        A list of ``(term, weight)`` tuples ordered by descending weight,
        containing at most ``top_n`` entries.
    """
    # Presumably a list of document strings and one document string;
    # verify against read_handel_list / read_test_list.
    reference_docs = read_handel_list()
    test_doc = read_test_list()

    # Work on a copy so the list returned by read_handel_list() is not mutated.
    corpus = reference_docs[:]
    corpus.append(test_doc)  # the test document is always the LAST corpus row
    print("TF-IDF corpus building success...")

    # Term-count matrix -> TF-IDF matrix (rows: documents, columns: terms).
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in
    # favour of get_feature_names_out(); kept to match the library version
    # this file was written against.
    word = vectorizer.get_feature_names()

    # Select the test document's row by position instead of a hard-coded
    # index, so the function works for any number of reference documents.
    test_weights = tfidf.toarray()[len(corpus) - 1]
    print("TF-IDF score is calcuated success...")

    results = []
    for term, score in zip(word, test_weights):
        # Skip two specific unwanted terms and every single-character term.
        # NOTE(review): both literals read '??' in this copy of the file,
        # presumably mangled non-ASCII stop words -- confirm the originals.
        if term == '??' or term == '??' or len(term) == 1:
            continue
        results.append((term, score))
    sorted_results = sorted(results, key=lambda result: result[1], reverse=True)

    # Slicing tolerates a vocabulary smaller than top_n (no IndexError).
    tfidf_results = sorted_results[:top_n]

    # The context manager closes the file even if a write fails.
    with open("f://emotion/mysite/Label_extract/result_tfidf.txt", 'w+') as fp_tfidf_result:
        for term, score in tfidf_results:
            fp_tfidf_result.write(term + ' ' + str(round(score, 10)))
            fp_tfidf_result.write('\n')
    return tfidf_results
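# (web page navigation residue, not code) Comment list
# (web page navigation residue, not code) Table of contents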