import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


def get_word_clouds(tweets, users, words_n=50, lang='english'):
    # Combine NLTK's built-in stopword list with a custom one.
    default_stopwords = set(nltk.corpus.stopwords.words(lang))
    stopwords_file = '../data/stopwords.txt'
    with open(stopwords_file, 'r') as f:
        custom_stopwords = set(f.read().splitlines())
    all_stopwords = default_stopwords | custom_stopwords

    # TF-IDF over the whole corpus; each element of tweets is one document.
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2,
                                 stop_words=list(all_stopwords))
    X = vectorizer.fit_transform(tweets)
    terms = vectorizer.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0

    word_cloud_per_person = {}
    for doc in range(len(tweets)):
        # Column indices of the non-zero TF-IDF entries for this document.
        feature_index = X[doc, :].nonzero()[1]
        doc_terms = [(terms[i], X[doc, i]) for i in feature_index]
        # Keep the words_n highest-scoring terms.
        important_terms = sorted(doc_terms, key=lambda x: x[1],
                                 reverse=True)[:words_n]
        word_cloud_per_person[users[doc]] = important_terms
    return word_cloud_per_person
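A minimal usage sketch. The inputs here are illustrative: it assumes `tweets` holds one aggregated tweet string per user and `users` is the matching list of usernames, and that the NLTK stopword corpus has been downloaded once:

nltk.download('stopwords')  # one-time NLTK setup
tweets = ["machine learning on tweet data", "deep learning for text"]  # hypothetical corpus
users = ["alice", "bob"]  # hypothetical usernames, aligned with tweets
clouds = get_word_clouds(tweets, users, words_n=10)
print(clouds["alice"])  # list of (term, tf-idf score) pairs, highest score first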