def centroid_similarity(X, internal_ids, nn_metric='cosine'):
    """Compute how similar the documents of a cluster are to its centroid.

    The rows of ``X`` selected by ``internal_ids`` are averaged into a
    centroid. The cosine similarity between every selected row and that
    centroid is computed and then passed through
    ``_scale_cosine_similarity``.

    Parameters
    ----------
    X : array-like or sparse matrix
        2D feature matrix; it must support ``X[internal_ids, :]`` indexing
        and the result must provide ``.mean(axis=0)``
    internal_ids : list
        a list of internal ids, used as row indices into ``X``
    nn_metric : str
        forwarded as ``metric`` to ``_scale_cosine_similarity``
        (a rescaling of the metric if needed)

    Returns
    -------
    S_sim_mean : float
        mean of the scaled similarities to the centroid
    S_sim : array
        first column of the scaled similarity matrix (one entry per selected
        document, assuming the rescaling preserves the array shape)
    """
    from ..metrics import _scale_cosine_similarity
    from sklearn.metrics.pairwise import pairwise_distances

    X_sl = X[internal_ids, :]

    # ``mean`` on a scipy sparse matrix returns an ``np.matrix``; recent
    # scikit-learn versions reject that type, so coerce to a plain ndarray.
    centroid = np.asarray(X_sl.mean(axis=0))

    # Dense input yields a 1D mean; pairwise_distances needs a 2D array
    # holding a single row.
    if centroid.ndim == 1:
        centroid = centroid[None, :]

    # cosine similarity = 1 - cosine distance
    S_cos = 1 - pairwise_distances(X_sl, centroid, metric='cosine')
    S_sim = _scale_cosine_similarity(S_cos, metric=nn_metric)
    S_sim_mean = np.mean(S_sim)
    return float(S_sim_mean), S_sim[:, 0]
评论列表
文章目录