def get_topic_idf(self, sentences):
    """Return the vocabulary words whose centroid TF-IDF weight is high.

    The sentences are vectorized with ``CountVectorizer``, re-weighted with
    an un-normalized, un-smoothed ``TfidfTransformer``, and the per-word
    weights are summed over all sentences to form a centroid vector. The
    centroid is scaled by its largest entry, and every word whose scaled
    weight is strictly greater than ``self.topic_threshold`` is returned.

    Args:
        sentences: Iterable of raw sentence strings, passed unchanged to
            ``CountVectorizer.fit_transform``.

    Returns:
        A list of feature names (words), in vocabulary-index order, whose
        scaled centroid weight exceeds ``self.topic_threshold``.
    """
    vectorizer = CountVectorizer()
    sent_word_counts = vectorizer.fit_transform(sentences)

    transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
    tfidf = transformer.fit_transform(sent_word_counts)

    # Sum the columns on the sparse matrix directly; only the resulting
    # 1-D vector is made dense, not the whole sentence-by-word matrix.
    centroid_vector = np.asarray(tfidf.sum(axis=0)).ravel()
    centroid_vector = np.divide(centroid_vector, centroid_vector.max())

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the legacy name on old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    selected = np.flatnonzero(centroid_vector > self.topic_threshold)
    return [feature_names[i] for i in selected]
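# Comment list (page-navigation text left over from the source web page)
# Table of contents (page-navigation text left over from the source web page)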