def getSignificantItems(item_list):
    """Cluster similar items and return one representative (exemplar) per cluster.

    Each item is tokenised via ``tokeniseUrl``, pairwise Levenshtein distances
    between the tokenised forms are computed (negated, since
    AffinityPropagation expects *similarities*), and affinity propagation
    groups the items. The original (un-tokenised) item at each cluster
    centre is returned.

    Args:
        item_list: iterable of strings (presumably URLs — tokenised by
            ``tokeniseUrl``; confirm against caller).

    Returns:
        list: one original item per discovered cluster. Empty list for
        empty input.

    Note:
        Distance computation is O(n^2) in both time and memory, so this is
        only suitable for moderately sized inputs.
    """
    # Guard: fitting AffinityPropagation on an empty matrix raises; short-circuit.
    if not item_list:
        return []

    logging.info('Tokenising input data.')
    tokenised_list = [tokeniseUrl(item) for item in item_list]

    items = np.asarray(item_list)
    tokenised_items = np.asarray(tokenised_list)

    logging.info('Calculating Levenshtein distances between items.')
    # Negate distances to turn them into similarities (larger = more similar),
    # as required by affinity='precomputed'.
    lev_similarity = -1 * np.array(
        [[Levenshtein.distance(i1, i2) for i1 in tokenised_items]
         for i2 in tokenised_items]
    )

    logging.info('Applying affinity propagation to data.')
    aff_prop = sklearn.cluster.AffinityPropagation(affinity='precomputed', damping=0.7)
    aff_prop.fit(lev_similarity)

    logging.info('Completed! Assembling list.')
    # cluster_centers_indices_[k] is the index of cluster k's exemplar;
    # map it back to the original (un-tokenised) items.
    return [items[aff_prop.cluster_centers_indices_[cluster_id]]
            for cluster_id in np.unique(aff_prop.labels_)]
# NOTE(review): removed scraped-page residue ("评论列表" / "文章目录" — comment
# list / article TOC) that was accidentally pasted here; it was not valid Python.