def get_wmd_distance(d1, d2, min_vocab=7, verbose=False):
    """Return the Word Mover's Distance between two documents.

    Both documents are turned into normalised bag-of-words histograms over
    their shared vocabulary, and the earth mover's distance between the two
    histograms is computed using pairwise Euclidean distances between the
    word embeddings as the ground cost.

    Relies on module-level names: ``model`` (assumed to be a gensim-style
    embedding model exposing ``.vocab``, ``in`` and item lookup -- confirm
    against the loading code), ``stop_words``, ``CountVectorizer``,
    ``euclidean_distances``, ``emd`` and ``np``.

    Args:
        d1: First document, a whitespace-separated string.
        d2: Second document, a whitespace-separated string.
        min_vocab: Minimum number of distinct usable words (known to the
            model and not English stop words) required across both
            documents; below this the comparison is skipped.
        verbose: When true, print the vocabulary and both histograms.

    Returns:
        The value returned by ``emd`` for the two histograms, or ``1`` when
        the documents cannot be compared: too few usable words, or one of
        the documents contains none of the vocabulary words.
    """
    tokens = set(d1.lower().split() + d2.lower().split())
    vocabulary = [
        w for w in tokens
        if w in model.vocab and w not in stop_words.ENGLISH_STOP_WORDS
    ]
    # An empty vocabulary can never be vectorised, whatever min_vocab says.
    if not vocabulary or len(vocabulary) < min_vocab:
        return 1

    vect = CountVectorizer(vocabulary=vocabulary).fit([d1, d2])
    W_ = np.array([model[w] for w in vect.get_feature_names() if w in model])

    # pyemd needs double precision input
    D_ = euclidean_distances(W_).astype(np.double)
    max_dist = D_.max()
    # Rescale to [0, 1] just for comparison purposes; skip when every
    # embedding coincides, otherwise 0 / 0 would fill the matrix with NaN.
    if max_dist > 0:
        D_ /= max_dist

    counts = vect.transform([d1, d2]).toarray().astype(np.double)
    v_1, v_2 = counts
    total_1 = v_1.sum()
    total_2 = v_2.sum()
    # A document with no vocabulary hits has an all-zero histogram that
    # cannot be normalised; treat it like the "too few words" case.
    if total_1 == 0 or total_2 == 0:
        return 1
    v_1 /= total_1
    v_2 /= total_2

    if verbose:
        # Formatted so the output is the same under Python 2 and Python 3.
        print(vocabulary)
        print("%s %s" % (v_1, v_2))
    return emd(v_1, v_2, D_)
# Example usage:
# d1 = "Government speaks to the media in Illinois"
# d2 = "The state addresses the press in Chicago"
# print(get_wmd_distance(d1, d2))
# (Web-page navigation residue, not part of the code: "comment list" / "table of contents".)