def generateEvaluation(self, output_dir, assigned_clusters, quick=False):
    """Compute per-sample and average silhouette values for a clustering.

    Args:
        output_dir: Forwarded unchanged to ``self.printSilhouette``.
        assigned_clusters: Cluster label of every sample, passed as the
            labels argument of ``silhouette_samples``.
        quick: When true, skip the computation entirely: only
            ``self.silhouette_avg`` is set (to 0) and nothing is printed.

    Side effects:
        Sets ``self.sample_silhouette_values`` and ``self.silhouette_avg``
        and then calls ``self.printSilhouette`` (full mode only).
    """
    if quick:
        self.silhouette_avg = 0
        return

    if self.distances is None:
        # No distance matrix available: let scikit-learn compute the
        # distances from the raw feature vectors.
        per_sample = silhouette_samples(
            self.instances.getFeatures(),
            assigned_clusters)
    else:
        # self.distances is handed over as a precomputed distance matrix.
        per_sample = silhouette_samples(
            self.distances,
            assigned_clusters,
            metric='precomputed')

    self.sample_silhouette_values = per_sample
    self.silhouette_avg = np.mean(self.sample_silhouette_values)
    # NOTE(review): presumably renders the silhouette plot into output_dir;
    # printSilhouette is defined outside this excerpt.
    self.printSilhouette(output_dir, assigned_clusters)
# Code from a scikit-learn example:
# Selecting the number of clusters with silhouette analysis on KMeans clustering
# Python example source code using silhouette_samples()
def fit(self, X, y=None, **kwargs):
    """
    Fit the wrapped estimator on X and draw the silhouette visualization.

    Parameters: X is the data passed to the estimator (its first ``shape``
    entry is recorded as ``n_samples``), y is forwarded to the estimator's
    ``fit`` and may be None, and kwargs are extra keyword arguments
    forwarded to the estimator's ``fit``.

    Sets ``n_samples``, ``n_clusters``, ``silhouette_score_`` and
    ``silhouette_samples_`` on the instance, then returns ``self``.

    TODO: decide to use this method or the score method to draw.
    NOTE: Probably this would be better in score, but the standard score
    is a little different and I'm not sure how it's used.
    """
    wrapped = self.estimator

    # Train the underlying clustering model first; everything below
    # depends on it being fitted.
    wrapped.fit(X, y, **kwargs)

    # Record basic facts about the dataset and the clustering.
    self.n_samples = X.shape[0]
    self.n_clusters = wrapped.n_clusters

    # Silhouette metrics are derived from the labels predicted for X:
    # one overall score plus one coefficient per sample.
    predicted = wrapped.predict(X)
    self.silhouette_score_ = silhouette_score(X, predicted)
    self.silhouette_samples_ = silhouette_samples(X, predicted)

    # Render the figure from the predicted labels.
    self.draw(predicted)

    return self
def ex3_kmeans(X, y):
    """Find the K in [2, 20] whose KMeans clustering of X scores best.

    A model's score is the fraction of samples whose silhouette
    coefficient is negative. A lower fraction is better, so the K with
    the smallest fraction is selected; ties keep the smallest K.

    Args:
        X: Data to cluster, passed to ``KMeans.fit`` and
            ``metrics.silhouette_samples``.
        y: Forwarded to ``KMeans(k).fit`` as its second argument.

    Returns:
        best_k: the value of K that gives the best (lowest) score.
        best_score: the score associated with best_k.
    """
    best_k = 1
    # Start from +infinity so the first model always becomes the incumbent;
    # the previous initial value of -1 together with a ">" comparison
    # selected the K with the MOST negative silhouettes.
    best_score = float("inf")
    for k in range(2, 20 + 1):
        model = KMeans(k).fit(X, y)
        scores = metrics.silhouette_samples(X, model.labels_)
        negative_scores_count = sum(1 for s in scores if s < 0)
        model_score = negative_scores_count / float(len(scores))
        # Single parenthesized argument: prints identically on Python 2
        # and is valid syntax on Python 3.
        print("K=%d, score=%f" % (k, model_score))
        if model_score < best_score:
            best_score = model_score
            best_k = k
    # Unsurprisingly the best K is usually 2 because we have two classes of
    # messages: spams and hams.
    return best_k, best_score
# Ex 4
def plot_silhouettes(X, y):
    """Show a silhouette plot of the samples in X grouped by the labels y.

    Each cluster is drawn as a band of horizontal bars, one bar per sample,
    sorted by silhouette coefficient. A dashed red vertical line marks the
    mean coefficient over all samples. The figure is displayed with
    ``plt.show()``; nothing is returned.

    Args:
        X: Data passed to ``silhouette_samples`` (Euclidean metric).
        y: Cluster label per sample. Tick labels are ``label + 1``.
    """
    cluster_labels = np.unique(y)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = silhouette_samples(X, y, metric='euclidean')

    band_start = 0
    tick_positions = []
    for idx, label in enumerate(cluster_labels):
        # Sorted copy of this cluster's coefficients.
        band = np.sort(silhouette_vals[y == label])
        band_end = band_start + len(band)
        band_color = cm.jet(idx / n_clusters)
        plt.barh(
            range(band_start, band_end),
            band,
            height=1.0,
            edgecolor='none',
            color=band_color,
        )
        # Put the cluster's tick in the middle of its band.
        tick_positions.append((band_start + band_end) / 2)
        # The next band starts where this one ended.
        band_start = band_end

    plt.axvline(np.mean(silhouette_vals), color='red', linestyle='--')
    plt.yticks(tick_positions, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')
    plt.show()