def releventScore(self, text, ques, tfidf=None):
    """Score how relevant ``text`` is to the question ``ques``.

    Both strings are tokenized with ``jieba.lcut_for_search`` and stripped
    of stop words (``STOPWORDS``). Every (text token, question token) pair
    then contributes to the score:

    * identical tokens add ``2.5 * weight``;
    * otherwise, tokens whose similarity (see ``sims``) exceeds 0.4 add
      ``similarity * weight``.

    ``weight`` is the text token's entry in ``tfidf``, or 1 when the token
    is missing from it. The accumulated score is divided by both token
    counts so that longer inputs gain no advantage, then scaled by 100.

    Args:
        text: Candidate text to score.
        ques: Question the text is compared against.
        tfidf: Optional mapping from token to weight. ``None`` (the
            default) means every token has weight 1.

    Returns:
        0 when either input has no tokens left after stop-word filtering
        (this also avoids dividing by a zero token count); otherwise the
        length-normalized score described above.
    """
    def filtWord(li):
        """Return the tokens of ``li`` that are not stop words."""
        return [tok for tok in li if tok not in STOPWORDS]

    def sims(t, q):
        """Return a similarity for tokens ``t`` and ``q``.

        When both tokens are keys of ``self.dic``, their entries are
        treated as numeric vectors and the cosine similarity is returned
        (0 if either vector has zero norm). Otherwise the result falls
        back to an edit-distance ratio scaled by 0.7.
        """
        # NOTE(review): assumes self.dic is a mapping of token -> sequence
        # of numbers (presumably word embeddings) -- confirm with the class.
        vectors = self.dic
        if t in vectors and q in vectors:
            dot_product = 0.0
            norm_a = 0.0
            norm_b = 0.0
            for a, b in zip(vectors[t], vectors[q]):
                dot_product += a * b
                norm_a += a ** 2
                norm_b += b ** 2
            if norm_a == 0.0 or norm_b == 0.0:
                # Cosine similarity is undefined for a zero vector.
                return 0
            return dot_product / ((norm_a * norm_b) ** 0.5)

        # At least one token has no vector: fall back to edit distance.
        longest = max(len(t), len(q))
        distance = Levenshtein.distance(t, q)
        if distance < longest:
            # The 0.7 factor makes a string-based match count for less
            # than a vector-based one.
            return (longest - distance) / longest * 0.7
        return 0

    weights = {} if tfidf is None else tfidf
    ttoks = filtWord(jieba.lcut_for_search(text))
    qtoks = filtWord(jieba.lcut_for_search(ques))
    # Nothing to compare on one side: no relevance, and the normalization
    # below would otherwise divide by zero.
    if not ttoks or not qtoks:
        return 0

    score = 0
    for tword in ttoks:
        # The weight depends only on the text token, so look it up once.
        rate = weights.get(tword, 1)
        for qword in qtoks:
            if tword == qword:
                # exact match
                score += rate * 2.5
                continue
            similarity = sims(tword, qword)
            if similarity > 0.4:
                # similar
                score += similarity * rate
    # remove advantage of length
    return score / len(ttoks) / len(qtoks) * 100
# Scraped web-page residue, not code: "评论列表" (comment list), "文章目录" (article table of contents)