def ex3_kmeans(X, y):
    """Find the K in [2, 20] whose KMeans clustering of X scores best.

    There are multiple ways to score a model; here, for every candidate K, a
    KMeans model is fitted and the Silhouette coefficient of each sample is
    computed. The model's score is the ratio of samples whose coefficient is
    negative. Lower is better, so the K with the smallest ratio is selected;
    ties are resolved in favour of the smallest K.

    The score of every candidate K is printed as it is computed.

    Args:
        X: the samples to cluster; handed as is to KMeans.fit and to
            metrics.silhouette_samples.
        y: forwarded to KMeans.fit alongside X; not otherwise used here.

    Returns:
        best_k: the value of K that gives the best (lowest) score.
        best_score: the score associated with best_k, a ratio in [0, 1].
    """
    best_k = None
    best_score = None
    for k in range(2, 20 + 1):
        model = KMeans(k).fit(X, y)
        scores = metrics.silhouette_samples(X, model.labels_)
        negative_scores_count = sum(1 for score in scores if score < 0)
        # float() keeps this a true division under Python 2 as well.
        model_score = negative_scores_count / float(len(scores))
        print("K=%d, score=%f" % (k, model_score))
        # The ratio of negative samples is minimized: a smaller ratio is a
        # better clustering. The strict comparison keeps the first (smallest)
        # K when several candidates reach the same ratio.
        if best_score is None or model_score < best_score:
            best_score = model_score
            best_k = k
    # Unsurprisingly the best K is usually 2 because we have two classes of
    # messages: spams and hams.
    return best_k, best_score
# Ex 4
评论列表
文章目录