def distortion_score(X, labels, metric='euclidean'):
    """
    Compute the distortion of a clustering.

    For every distinct label, the centroid of the samples carrying that label
    is computed as their feature-wise mean. The squared distances between
    those samples and the centroid are averaged within the cluster, and the
    per-cluster averages are summed to give the returned score. Logically,
    this is closely related to the quantity that K-Means attempts to minimize
    as it is fitting the model.

    .. seealso:: http://kldavenport.com/the-cost-function-of-k-means/

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        A feature array for computing distances against the cluster centers.
        It must support boolean-mask row selection and ``mean(axis=0)``.
        Because the centroid is the mean of rows of ``X``, a precomputed
        distance matrix is not a meaningful input here.

    labels : array, shape = [n_samples]
        Predicted labels for each sample. Labels may be of any type accepted
        by ``LabelEncoder`` (e.g. strings or non-contiguous integers).

    metric : string
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by `sklearn.metrics.pairwise.pairwise_distances
        <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html#sklearn.metrics.pairwise.pairwise_distances>`_

    Returns
    -------
    distortion : number
        The sum, over all clusters, of the mean squared distance between the
        cluster's samples and the cluster's centroid.

    .. todo:: add sample_size and random_state kwds similar to silhouette_score
    """
    # Local import so the block does not rely on a module-level numpy import.
    import numpy as np

    # Encode labels as the integers 0..n_clusters-1. The mask below must be
    # built from these encoded ids, NOT from ``classes_`` (which holds the
    # original label values and only coincides with the codes when the labels
    # already are 0..k-1).
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(labels)
    n_clusters = len(encoder.classes_)

    # Running total of the per-cluster distortions
    distortion = 0

    # Loop through each encoded label (center) to compute the centroid
    for cluster_id in range(n_clusters):
        # Mask the instances that belong to the current cluster
        mask = encoded_labels == cluster_id
        instances = X[mask]

        # Compute the center of these instances. ``mean(axis=0)`` yields a
        # 1-D array for dense input but a 2-D single-row result for scipy
        # sparse matrices; normalise both to one 2-D row so that
        # pairwise_distances always receives shape (1, n_features).
        center = np.asarray(instances.mean(axis=0)).reshape(1, -1)

        # Compute the square distances from the instances to the center
        distances = pairwise_distances(instances, center, metric=metric)
        distances = distances ** 2

        # Add the mean square distance of this cluster to the distortion
        distortion += distances.mean()

    return distortion
##########################################################################
## Elbow Method
##########################################################################
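# NOTE: stray web-page navigation text ("Comment list", "Table of contents")
# was captured here during extraction; it is not part of this module.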