wmd.py 文件源码-python代码片段

wmd.py 文件源码

python

阅读 30 收藏 0 点赞 0 评论 0

项目：Word_Mover_Distance 作者: PragmaticLab 项目源码文件源码

def get_wmd_distance(d1, d2, min_vocab=7, verbose=False):
    """Return a normalised Word Mover's Distance between two documents.

    The shared vocabulary is every lower-cased, whitespace-separated token of
    either document that is present in ``model.vocab`` and is not an English
    stop word.  Each document is turned into a bag-of-words histogram over
    that vocabulary (normalised to sum to 1), and the result is ``emd`` of
    the two histograms under the pairwise Euclidean distances between the
    word vectors, scaled so the largest pairwise distance is 1.

    Args:
        d1: First document, as a string.
        d2: Second document, as a string.
        min_vocab: Minimum number of usable vocabulary words required to
            attempt the comparison.
        verbose: When true, print the vocabulary and both histograms.

    Returns:
        The value returned by ``emd``, or the sentinel ``1`` when the
        documents cannot be compared: fewer than ``min_vocab`` usable words,
        or one document has no word counted in the shared vocabulary.

    Note:
        Relies on names defined outside this function: ``model`` (supports
        ``model.vocab`` and ``model[word]``; presumably a gensim word2vec
        model -- confirm), ``stop_words``, ``CountVectorizer``,
        ``euclidean_distances``, ``emd`` and ``np``.
    """
    tokens = set(d1.lower().split() + d2.lower().split())
    vocabulary = [
        w for w in tokens
        if w in model.vocab and w not in stop_words.ENGLISH_STOP_WORDS
    ]
    if len(vocabulary) < min_vocab:
        return 1

    vect = CountVectorizer(vocabulary=vocabulary).fit([d1, d2])
    v_1, v_2 = vect.transform([d1, d2])
    # pyemd needs double precision input
    v_1 = v_1.toarray().ravel().astype(np.double)
    v_2 = v_2.toarray().ravel().astype(np.double)

    # A document with no counted vocabulary word has an all-zero histogram;
    # normalising it would divide by zero and feed NaNs to emd.  Treat it
    # like the "not enough vocabulary" case instead.
    total_1 = v_1.sum()
    total_2 = v_2.sum()
    if total_1 == 0 or total_2 == 0:
        return 1
    v_1 /= total_1
    v_2 /= total_2

    # The vectorizer was given a fixed vocabulary list, so its feature
    # columns follow that list's order.  Building the embedding matrix from
    # the same list keeps distance-matrix rows aligned with the histogram
    # entries and avoids the get_feature_names() call that newer
    # scikit-learn releases no longer provide.
    W_ = np.array([model[w] for w in vocabulary])
    D_ = euclidean_distances(W_).astype(np.double)
    max_distance = D_.max()
    if max_distance > 0:
        # just for comparison purposes; skipped when every pairwise distance
        # is zero so the matrix is not turned into NaNs
        D_ /= max_distance

    if verbose:
        # Single-argument print() calls behave the same on Python 2 and 3.
        print(vocabulary)
        print("%s %s" % (v_1, v_2))
    return emd(v_1, v_2, D_)

# Example usage:
# d1 = "Government speaks to the media in Illinois"
# d2 = "The state addresses the press in Chicago"
# print(get_wmd_distance(d1, d2))