def find_n_most_similar_articles(self):
    """
    Record, for every article, the titles of its n most similar articles.

    For each row label in ``self.df_article_vectors`` the scores stored in
    ``self.similarity_score_dict[label]`` (a mapping of article label -> score)
    are ranked from highest to lowest. The best ``self.n_most_similar``
    entries are written into the columns ``most_similar_1`` ..
    ``most_similar_<n>`` of that row, each formatted as
    ``"<title> (<score with two decimals>)"``.

    ``self.similarity_score_dict`` is only read, never modified. When an
    article has fewer than n scored candidates, only the available ranks are
    written for that row.

    :return: None; the results are stored in ``self.df_article_vectors``.
    """
    articles = self.df_article_vectors
    # A non-positive n selects nothing (same as an empty range()).
    n_wanted = max(self.n_most_similar, 0)

    for index in articles.index:
        # Similarity scores of the current article compared to the other articles.
        similarity_scores = self.similarity_score_dict[index]

        # Highest score first, i.e. highest cosine similarity.
        # Note: for a distance measure such as Euclidean distance the order
        # would have to be ascending instead.
        # sorted() is stable, so ties keep the mapping's iteration order, and
        # ranking a sorted copy leaves the stored scores untouched.
        ranked = sorted(similarity_scores, key=similarity_scores.get, reverse=True)

        for rank, similar_index in enumerate(ranked[:n_wanted], start=1):
            title = articles.at[similar_index, 'title']
            # Work with text, not bytes: formatting a bytes object on
            # Python 3 would render it as "b'...'".
            if isinstance(title, bytes):
                title = title.decode('utf-8')
            title_plus_score = u"{} ({:.2f})".format(
                title, similarity_scores[similar_index])
            # .at replaces the removed DataFrame.set_value and creates the
            # column on first use.
            articles.at[index, 'most_similar_' + str(rank)] = title_plus_score
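# Web-page residue, not part of the program: "评论列表" (comment list)
# Web-page residue, not part of the program: "文章目录" (article table of contents)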