# simpleDrQA.py source file - Python code snippet

def releventScore(self, text, ques, tfidf=None):
        """Score how relevant ``text`` is to the question ``ques``.

        Both strings are tokenised with ``jieba.lcut_for_search`` and tokens
        found in ``STOPWORDS`` are discarded. Every (text token, question
        token) pair then contributes to the score:

        * identical tokens add ``rate * 2.5``;
        * otherwise, tokens whose similarity (see ``sims`` below) exceeds
          0.4 add ``similarity * rate``.

        ``rate`` is the weight of the text token taken from ``tfidf``, or 1
        when the token has no entry. The accumulated score is divided by the
        number of text tokens and the number of question tokens, then
        multiplied by 100, so longer inputs get no advantage.

        Args:
            text: Candidate passage to score.
            ques: Question to compare the passage against.
            tfidf: Optional mapping from token to weight. ``None`` (the
                default) means every token has weight 1.

        Returns:
            The normalised relevance score, or 0 when either ``text`` or
            ``ques`` has no tokens left after stop-word filtering.
        """
        # A None default avoids sharing one mutable dict between calls.
        if tfidf is None:
            tfidf = {}

        def filtWord(li):
            # Drop stop words, keeping the remaining tokens in order.
            return [tok for tok in li if tok not in STOPWORDS]

        def sims(t, q):
            # Similarity of two tokens: cosine similarity of their vectors
            # when both are present in self.dic, otherwise a similarity
            # derived from the edit distance and scaled by 0.7.
            if t in self.dic and q in self.dic:
                dot_product = 0.0
                normA = 0.0
                normB = 0.0
                for a, b in zip(self.dic[t], self.dic[q]):
                    dot_product += a * b
                    normA += a**2
                    normB += b**2
                # A zero-length vector has no direction; report no similarity
                # instead of dividing by zero.
                if normA == 0.0 or normB == 0.0:
                    return 0
                return dot_product / ((normA * normB)**0.5)

            longest = max(len(t), len(q))
            # Compute the edit distance once and reuse it.
            dist = Levenshtein.distance(t, q)
            if dist < longest:
                return (longest - dist) / longest * 0.7
            return 0

        ttoks = filtWord(jieba.lcut_for_search(text))
        qtoks = filtWord(jieba.lcut_for_search(ques))

        # Nothing to compare on one side: also protects the final division
        # from a ZeroDivisionError when the question has no usable tokens.
        if not ttoks or not qtoks:
            return 0

        score = 0
        for tword in ttoks:
            # The weight depends only on the text token, so look it up once
            # per text token rather than once per pair.
            rate = tfidf.get(tword, 1)
            for qword in qtoks:
                if tword == qword:
                    # exact match
                    score += rate * 2.5
                    continue
                similarity = sims(tword, qword)
                if similarity > 0.4:
                    # similar
                    score += similarity * rate
        # remove advantage of length
        return score / len(ttoks) / len(qtoks) * 100