cluster.py 文件源码-python代码片段

cluster.py 文件源码

python

阅读 23 收藏 0 点赞 0 评论 0

项目：word2vec_pipeline 作者: NIHOPA 项目源码文件源码

def close_words(W, X, labels, top_n=6):
    '''
    Find the words that are closest to each cluster of documents.

    For every distinct value in `labels`, the rows of X carrying that
    label are averaged, and the vocabulary of W is ranked by the dot
    product of each word vector with that mean (largest first).

    W is a gensim.word2vec model; W.wv.syn0 and W.wv.index2word are read.
    X is the document vectors, indexed row-wise by a boolean mask over labels.
    labels are predetermined cluster labels.
    top_n is the number of words kept per label.

    Returns a list with one string per unique label, in np.unique order.
    Each string is the top_n words joined by single spaces, passed through
    unidecode and with every 'PHRASE_' occurrence removed.
    '''

    # Coerce so that `labels == label` is an element-wise mask even when a
    # plain Python list is passed (list == scalar would be a single bool).
    labels = np.asarray(labels)

    # Loop-invariant lookups.
    # NOTE(review): syn0 / index2word look like the pre-4.0 gensim attribute
    # names -- confirm against the gensim version this project pins.
    vectors = W.wv.syn0
    vocab = W.wv.index2word

    described = []
    for label in np.unique(labels):
        # Mean document vector of the cluster.
        mu = X[labels == label].mean(axis=0)

        # Score every vocabulary word against the cluster mean and keep
        # the top_n highest-scoring indices, best first.
        scores = vectors.dot(mu)
        idx = np.argsort(scores)[::-1][:top_n]
        phrase = ' '.join(vocab[i] for i in idx)

        # Map unicode to simple ASCII, then remove the PHRASE_ marker.
        described.append(unidecode(phrase).replace('PHRASE_', ''))

    # Return a concrete list: chained map() calls would yield a lazy,
    # single-use iterator on Python 3.
    return described