__init__.py 文件源码-python代码片段

__init__.py 文件源码
python
阅读 81 收藏 0 点赞 0 评论 0
def compute_similarity(self, doc1, doc2):
    """
    Evaluate the Word Mover's Distance (WMD) between two spaCy documents.

    Each document is passed through ``self._convert_document`` (the nBOW
    extraction step). The distinct entries of both results are sorted and
    numbered to form a joint vocabulary, weights over that vocabulary are
    obtained from ``self._generate_weights``, and the pairwise Euclidean
    distances between the vocabulary's word vectors are handed to
    ``libwmdrelax.emd`` together with the two weight vectors.

    NOTE(review): despite the method name, the returned value is the raw
    output of ``libwmdrelax.emd`` - presumably a distance, so smaller
    means more alike; confirm against callers.

    :param doc1: The first document, in the form accepted by \
                 ``self._convert_document``.
    :param doc2: The second document, in the same form as ``doc1``.
    :return: The result of ``libwmdrelax.emd`` for the two documents.
    :rtype: float.
    """
    bow1 = self._convert_document(doc1)
    bow2 = self._convert_document(doc2)

    # Sorting gives every distinct entry a deterministic row index.
    entries = sorted(set(bow1) | set(bow2))
    vocabulary = dict(zip(entries, range(len(entries))))

    weights1 = self._generate_weights(bow1, vocabulary)
    weights2 = self._generate_weights(bow2, vocabulary)

    # One embedding row per vocabulary entry, placed at its assigned index.
    shape = (len(vocabulary), self.nlp.vocab.vectors_length)
    embeddings = numpy.zeros(shape, dtype=numpy.float32)
    for entry, row in vocabulary.items():
        embeddings[row] = self.nlp.vocab[entry].vector

    # ||a - b||^2 = ||a||^2 - 2 * a.b + ||b||^2, for all pairs at once.
    sq_norms = numpy.sum(embeddings * embeddings, axis=1)
    gram = numpy.dot(embeddings, embeddings.T)
    sq_dists = sq_norms - 2 * gram
    sq_dists += sq_norms[:, numpy.newaxis]

    # Rounding can leave slightly negative squared distances; clamp them
    # so the square root below stays real.
    sq_dists[sq_dists < 0] = 0
    return libwmdrelax.emd(weights1, weights2, numpy.sqrt(sq_dists))