def similarity(c1, c2):
    """Score the lexical overlap between two texts.

    Each text is tokenized with NLTK's ``word_tokenize``, English stop
    words (low-information words like "it" and "the") are removed, and
    the number of *distinct* words the two texts share is normalized by
    ``log(|vocab1|) + log(|vocab2|)`` to damp the effect of text length.

    Args:
        c1: First text (string).
        c2: Second text (string).

    Returns:
        float: shared distinct-word count divided by the sum of log
        vocabulary sizes. Returns 0.0 when either text has no non-stop
        words or when both have exactly one distinct word — cases that
        previously raised ``ValueError`` (``log(0)``) or
        ``ZeroDivisionError`` (``log(1) + log(1) == 0``).
    """
    # Use a set: membership tests in the comprehensions below are O(1),
    # whereas the original list made each test O(len(stop_words)).
    stop_words = set(stopwords.words("english"))

    # Distinct non-stop words of each text. Order and counts are
    # irrelevant to the final score, so plain sets replace the original
    # dedupe() + Counter + Counter-addition pipeline.
    c1_vocab = {w for w in word_tokenize(c1) if w not in stop_words}
    c2_vocab = {w for w in word_tokenize(c2) if w not in stop_words}

    # A word contributes iff it appears in both texts — i.e. the size of
    # the vocabulary intersection.
    shared = len(c1_vocab & c2_vocab)

    # Guard the degenerate denominators the original crashed on.
    if not c1_vocab or not c2_vocab:
        return 0.0
    denominator = log(len(c1_vocab)) + log(len(c2_vocab))
    return shared / denominator if denominator > 0 else 0.0
# NOTE(review): stray non-code text left over from a web scrape —
# "评论列表" ("comment list") and "文章目录" ("article table of contents").
# Commented out so the module stays importable; original text preserved above.