def compute_similarity(self, doc1, doc2):
"""
Calculates the similarity between two spaCy documents. Extracts the
nBOW from them and evaluates the WMD.
:return: The calculated similarity.
:rtype: float.
"""
doc1 = self._convert_document(doc1)
doc2 = self._convert_document(doc2)
vocabulary = {
w: i for i, w in enumerate(sorted(set(doc1).union(doc2)))}
w1 = self._generate_weights(doc1, vocabulary)
w2 = self._generate_weights(doc2, vocabulary)
evec = numpy.zeros((len(vocabulary), self.nlp.vocab.vectors_length),
dtype=numpy.float32)
for w, i in vocabulary.items():
evec[i] = self.nlp.vocab[w].vector
evec_sqr = (evec * evec).sum(axis=1)
dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, numpy.newaxis]
dists[dists < 0] = 0
dists = numpy.sqrt(dists)
return libwmdrelax.emd(w1, w2, dists)
评论列表
文章目录