def filter_hits_by_distance(hits, source_text,
                            min_similarity=DEFAULT_MIN_SIMILARITY):
    """Return ES `hits` filtered by Levenshtein similarity to `source_text`.

    For each hit, the text stored under ``hit['_source']['source']`` is
    compared with `source_text` and a similarity value is computed as
    ``1 - distance / length_of_the_longer_string``. Two empty strings are
    considered identical (similarity ``1.0``).

    It's assumed that `hits` is already sorted from higher to lower score,
    so iteration stops at the first hit whose similarity falls below
    `min_similarity`; that hit and every hit after it are discarded.

    :param hits: iterable of hit mappings, each providing
        ``hit['_source']['source']``.
    :param source_text: the text every hit is compared against.
    :param min_similarity: exclusive-range threshold; values outside the
        open interval (0, 1) are replaced by `DEFAULT_MIN_SIMILARITY`.
    :return: list with the leading hits that meet the threshold, in their
        original order.
    """
    if min_similarity <= 0 or min_similarity >= 1:
        min_similarity = DEFAULT_MIN_SIMILARITY

    # Loop-invariant: the source text does not change between hits.
    source_length = len(source_text)

    filtered_hits = []
    for hit in hits:
        hit_source_text = hit['_source']['source']
        distance = Levenshtein.distance(source_text, hit_source_text)
        longest = max(source_length, len(hit_source_text))
        if longest:
            similarity = 1 - distance / float(longest)
        else:
            # Both strings are empty: they are identical, and dividing by
            # their (zero) length would raise ZeroDivisionError.
            similarity = 1.0

        logger.debug(
            'Similarity: %.2f (distance: %d)\nOriginal:\t%s\nComparing with:\t%s',
            similarity, distance, source_text, hit_source_text
        )

        if similarity < min_similarity:
            # Hits are expected in descending score order, so the remaining
            # ones are not inspected once one falls below the threshold.
            break

        filtered_hits.append(hit)

    return filtered_hits
评论列表
文章目录