def close_words(W, X, labels, top_n=6):
    '''
    Find the words that lie closest to each cluster label.

    For every distinct value in `labels`, the rows of `X` carrying that
    label are averaged into a centroid, every word vector in `W` is scored
    against that centroid with a plain dot product, and the `top_n`
    highest-scoring words are joined into one space-separated string.

    W is a gensim.word2vec model (read through `W.wv.syn0` and
    `W.wv.index2word`).
    X is the document vectors, indexed here with a boolean mask built
    from `labels`.
    labels are predetermined cluster labels.
    top_n is the number of words kept per label.

    Returns a list of strings, one per unique label, in the order produced
    by `np.unique(labels)`. A real list (rather than a lazy `map` object)
    is returned so the result can be indexed, measured with `len` and
    iterated more than once on Python 3.
    '''
    # Coerce to an array so that `labels == label` is an element-wise mask
    # even when a plain Python list is passed in.
    labels = np.asarray(labels)

    phrases = []
    for label in np.unique(labels):
        label_idx = labels == label
        mu = X[label_idx].mean(axis=0)

        # Score every vocabulary vector against the cluster centroid.
        dist = W.wv.syn0.dot(mu)
        # Highest scores first, truncated to the requested count.
        idx = np.argsort(dist)[::-1][:top_n]
        words = [W.wv.index2word[i] for i in idx]
        phrases.append(' '.join(words))

    # Map unicode to simple ASCII, then strip the 'PHRASE_' marker.
    return [unidecode(phrase).replace('PHRASE_', '') for phrase in phrases]
评论列表
文章目录