def _compute_author_similarity(self, paired_authors):
    """Append pairwise author-similarity features to ``paired_authors``.

    Each row describes one pair of authors. The row must expose the columns
    ``author_name``, ``author_email`` and ``email_name`` for the first
    author, and ``author_name_other``, ``author_email_other``,
    ``email_name_other`` and ``name_from_email_other`` for the second.

    Args:
        paired_authors: DataFrame with one row per author pair.

    Returns:
        A new DataFrame: ``paired_authors`` joined (on its index) with four
        extra columns -- ``same_email``, ``name_similarity``,
        ``email_name_similarity`` and ``name_to_email_similarity``. A frame
        with zero rows yields the same columns and zero rows.
    """
    feature_names = ['same_email', 'name_similarity',
                     'email_name_similarity', 'name_to_email_similarity']

    def row_similarity(row):
        # Exact match on the full e-mail address.
        same_email = row.author_email == row.author_email_other
        # Fuzzy scores come from the external ``fuzz`` module
        # (presumably fuzzywuzzy/thefuzz, 0-100 scale -- confirm).
        name_similarity = fuzz.token_set_ratio(row.author_name,
                                               row.author_name_other)
        email_name_similarity = fuzz.ratio(row.email_name,
                                           row.email_name_other)
        # One direction only: this author's name against the name derived
        # from the *other* author's e-mail.
        name_to_email_similarity = fuzz.token_set_ratio(
            row.author_name, row.name_from_email_other)
        return pd.Series(
            [same_email, name_similarity, email_name_similarity,
             name_to_email_similarity])

    if len(paired_authors) == 0:
        # With no rows, DataFrame.apply cannot call row_similarity and hands
        # back a copy of the input, so the column assignment below would
        # fail with a length mismatch. Build the empty feature columns
        # directly instead.
        newcols = pd.DataFrame(columns=feature_names,
                               index=paired_authors.index)
    else:
        newcols = paired_authors.apply(row_similarity, axis=1)
        newcols.columns = feature_names

    return paired_authors.join(newcols)
# Comment list
# Table of contents