def silhouette_score(series, clusters):
    distances = np.zeros((series.shape[0], series.shape[0]))
    for idx_a, metric_a in enumerate(series):
        for idx_b, metric_b in enumerate(series):
            distances[idx_a, idx_b] = _sbd(metric_a, metric_b)[0]
    labels = np.zeros(series.shape[0])
    for i, (cluster, indices) in enumerate(clusters):
        for index in indices:
            labels[index] = i
    # silhouette is only defined if there are at least 2 clusters
    # with assignments and fewer clusters than samples
    if len(np.unique(labels)) == 1 or (len(np.unique(labels)) >= distances.shape[0]):
        #if len(np.unique(labels)) == 1:
        return labels, -1
    else:
        return labels, _silhouette_score(distances, labels, metric='precomputed')
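A minimal usage sketch for the snippet above, under the assumption that `_sbd` is a shape-based distance returning a `(distance, aligned_series)` pair and `_silhouette_score` is scikit-learn's silhouette_score imported under an alias; the toy `_sbd` below is only a stand-in, not the original implementation, and the `(centroid, member indices)` structure of `clusters` is inferred from the loop above.
import numpy as np
from sklearn.metrics import silhouette_score as _silhouette_score

def _sbd(a, b):
    # stand-in distance; the original code uses a shape-based distance
    return np.linalg.norm(a - b), b

series = np.random.rand(6, 20)                      # six series of length 20
clusters = [(None, [0, 1, 2]), (None, [3, 4, 5])]   # (centroid, member indices) pairs
labels, score = silhouette_score(series, clusters)
print(labels, score)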
def spectral(data):
    spectral = SpectralClustering(
        eigen_solver='arpack',
        affinity='rbf',
        assign_labels='discretize'
    ).fit(data)
    print 'Spectral'
    print collections.Counter(spectral.labels_)
    print metrics.silhouette_score(data, spectral.labels_)
    reduced_data = reduce_with_pca(data, 2)
    plot_2d_data(reduced_data, spectral.labels_)
def best_shape_clustering(mols, nb_layers, k_range=range(3, 20), train_ratio=0.8, cluster_key='shape_cid'):
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import silhouette_score
    shape_df = mols['dynamic'].apply(lambda x: temporal_shape(x, nb_layers))
    train_idx, test_idx = train_test_split(shape_df.index.values, train_size=train_ratio)
    train_mat = np.array(list(shape_df[shape_df.index.isin(train_idx)].values))
    full_mat = np.array(list(shape_df.values))
    centroids = None
    labels = None
    best_score = 0
    for k in k_range:
        res = cluster_shapes(train_mat, full_mat, k)
        score = silhouette_score(full_mat, res[1])
        if score > best_score:
            centroids = res[0]
            labels = res[1]
            best_score = score
    mols[cluster_key] = labels
    return mols, centroids
def _find_optimal_clustering(self, clusterings):
    max_score = float('-inf')
    max_clustering = None
    for clustering in clusterings:
        labeled_vectors = [(node.vector, cluster_idx)
                           for cluster_idx in range(len(clustering))
                           for node in _get_cluster_nodes(clustering[cluster_idx][1])]
        vectors, labels = [np.array(x) for x in zip(*labeled_vectors)]
        if np.in1d([1], labels)[0]:
            score = silhouette_score(vectors, labels, metric='cosine')
        else:
            continue  # silhouette doesn't work with just one cluster
        if score > max_score:
            max_score = score
            max_clustering = clustering
    return zip(*max_clustering)[1] if max_clustering else zip(*clusterings[0])[1]
def evaluate_kmeans(X, model):
    """ Evaluate a K-Means model that has been trained on X using the
    Silhouette score.
    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
        model: the KMeans model trained on X.
    Returns:
        A double that corresponds to the Silhouette score of the model.
    """
    return silhouette_score(X, model.labels_)
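A usage sketch for evaluate_kmeans; random dense data stands in here for the TF-IDF matrix from the TP2.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X = np.random.rand(50, 8)          # stand-in for the TF-IDF matrix
model = KMeans(n_clusters=3).fit(X)
print(evaluate_kmeans(X, model))   # silhouette score in [-1, 1]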
# Ex2
def fit(self, X, y=None, **kwargs):
    """
    Fits the model and generates the silhouette visualization.
    TODO: decide to use this method or the score method to draw.
    NOTE: Probably this would be better in score, but the standard score
    is a little different and I'm not sure how it's used.
    """
    # Fit the wrapped estimator
    self.estimator.fit(X, y, **kwargs)
    # Get the properties of the dataset
    self.n_samples = X.shape[0]
    self.n_clusters = self.estimator.n_clusters
    # Compute the scores of the cluster
    labels = self.estimator.predict(X)
    self.silhouette_score_ = silhouette_score(X, labels)
    self.silhouette_samples_ = silhouette_samples(X, labels)
    # Draw the silhouette figure
    self.draw(labels)
    # Return the estimator
    return self
def clustering(points, k=2, name='kmeans'):
    '''
    points: N_samples * N_features
    k: number of clusters
    '''
    if name == 'kmeans':
        kmeans = KMeans(n_clusters=k, n_init=100).fit(points)
        ## print within_variance
        #cluster_distance = kmeans.transform(points)
        #within_variance = sum(np.min(cluster_distance, axis=1)) / float(points.shape[0])
        #print("AvgWithinSS:" + str(within_variance))
        if len(np.unique(kmeans.labels_)) > 1:
            si = silhouette_score(points, kmeans.labels_)
            #print("Silhouette:" + str(si))
        else:
            si = 0
        print("Silhouette:" + str(si))
        return kmeans.labels_, si
    if name == 'spec':
        spec = SpectralClustering(n_clusters=k, affinity='cosine').fit(points)
        si = silhouette_score(points, spec.labels_)
        print("Silhouette:" + str(si))
        return spec.labels_, si
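A usage sketch for clustering() above on random data; both the k-means and the spectral branches return a (labels, silhouette) pair.
import numpy as np
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import silhouette_score

points = np.random.rand(100, 5)
labels_km, si_km = clustering(points, k=3, name='kmeans')
labels_sp, si_sp = clustering(points, k=3, name='spec')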
def k_means(data, nc, req_info=None):
    means = np.mean(data, axis=0)
    stds = np.std(data, axis=0)
    sdata = (data - means) / stds
    km = KMeans(init='k-means++', n_clusters=nc, n_init=10)
    km.fit(sdata)
    if req_info == 'all':
        req_info = ['silhouette', 'inertia', 'centers']
    elif req_info is None:
        req_info = []
    info = {}
    if 'silhouette' in req_info:
        info['silhouette'] = metrics.silhouette_score(data, km.labels_)
    if 'inertia' in req_info:
        info['inertia'] = km.inertia_
    if 'centers' in req_info:
        info['centers'] = km.cluster_centers_ * stds + means
    return km.labels_, info
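A usage sketch for k_means above; note that clustering runs on the z-scored copy of the data while the silhouette reported in `info` is computed on the raw data.
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics

data = np.random.rand(200, 4)
labels, info = k_means(data, nc=3, req_info='all')
print(info['silhouette'], info['inertia'], info['centers'])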
def internal_silhouette(self, idea_id, base_labels=None):
    labels = self.labels_for_idea(idea_id, True, False, base_labels)
    self.remove_singletons(labels, idea_id)
    idea_post_ids = self.get_posts_of_idea(idea_id)
    if base_labels:
        idea_post_ids = set(idea_post_ids)
        idea_post_ids.update(list(base_labels.keys()))
        idea_post_ids = np.array(list(idea_post_ids))
    idea_post_ids.sort()
    idea_post_ids = np.array(idea_post_ids)
    idea_post_nums = self.post_ids.searchsorted(idea_post_ids)
    # slicing one axis at a time,
    # because a simultaneous slice is interpreted as a diagonal selection
    distances = self.distance_matrix
    sub_distance = distances[idea_post_nums][:, idea_post_nums]
    sub_labels = labels[idea_post_nums]
    if len(set(sub_labels)) < 2:
        return 0
    return metrics.silhouette_score(sub_distance, sub_labels, 'precomputed')
def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))
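bench_k_means reads `labels` (ground-truth classes) and `sample_size` from the enclosing scope, in the style of the scikit-learn digits clustering example; a sketch of how it might be driven, with those globals assumed:
from time import time
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale

digits = load_digits()
data = scale(digits.data)
labels = digits.target        # global consumed by bench_k_means
sample_size = 300             # global consumed by bench_k_means
bench_k_means(KMeans(init='k-means++', n_clusters=10, n_init=10),
              name="k-means++", data=data)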
def db_scan(data, eps, min_samples, metric):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(data)
    print 'DBSCAN'
    print metrics.silhouette_score(data, dbscan.labels_)
    print collections.Counter(dbscan.labels_)
    reduced_data = reduce_with_pca(data)
    plot_2d_data(reduced_data, dbscan.labels_)
def mean_shift(data):
    mean_shift = MeanShift(cluster_all=False, n_jobs=1).fit(data)
    print 'Mean Shift'
    print metrics.silhouette_score(data, mean_shift.labels_)
    print collections.Counter(mean_shift.labels_)
def affinity_prop(data):
    af = AffinityPropagation(damping=0.5, convergence_iter=15, affinity='euclidean').fit(data)
    print 'Affinity Propagation'
    print metrics.silhouette_score(data, af.labels_)
    print collections.Counter(af.labels_)
# mean_shift(np.array(data))
# affinity_prop(np.array(data))
def cluster2d(data, n_clusters):
    reduced_data = reduce_with_pca(data)
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(reduced_data)
    print 'K-Means'
    print collections.Counter(kmeans.labels_)
    print metrics.silhouette_score(data, kmeans.labels_)
    plot_2d_data(reduced_data, kmeans.labels_)
def em(data):
    gmm = GaussianMixture(
        n_components=6,
        covariance_type="tied"
    ).fit(data)
    predicted_data = gmm.predict(data)
    print collections.Counter(predicted_data)
    print metrics.silhouette_score(data, predicted_data)
    reduced_data = reduce_with_pca(data, 2)
    plot_2d_data(reduced_data, predicted_data)
def kmeans(reduced_data, n_clusters):
    #----Do KMeans clustering and return relevant graphing/performance data
    kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=42)
    kmeans = kmeans.fit(reduced_data)
    sil_score = metrics.silhouette_score(reduced_data, kmeans.labels_, metric='euclidean')
    data_dictionary = {
        "labels": kmeans.labels_,
        "centroids": kmeans.cluster_centers_,
        "silhouette_score": sil_score
    }
    return data_dictionary
def agglom(reduced_data, n_clusters):
    #----Do Agglomerative clustering and return relevant performance data
    clustering = cluster.AgglomerativeClustering(n_clusters=n_clusters)
    clustering = clustering.fit(reduced_data)
    sil_score = metrics.silhouette_score(reduced_data, clustering.labels_, metric='euclidean')
    return {
        "labels": clustering.labels_,
        "silhouette_score": sil_score
    }
def find_best_cluster(cluster_type, data, a, b):
    #----Print silhouette scores for every number of clusters in range(a, b)
    scores = []
    for i in range(a, b):
        if cluster_type.lower() == "kmeans":
            i_clusters = kmeans(data, i)
        elif cluster_type.lower() == "agglom":
            i_clusters = agglom(data, i)
        sil_score_i = i_clusters['silhouette_score']
        scores.append(sil_score_i)
    print(scores)
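A usage sketch tying find_best_cluster to the kmeans and agglom helpers above, assuming the sklearn `cluster` and `metrics` modules are imported as in the snippets' source; the data is random stand-in input and the range bounds are illustrative.
import numpy as np

reduced_data = np.random.rand(150, 3)
find_best_cluster("kmeans", reduced_data, a=2, b=8)   # prints one silhouette score per cluster count
find_best_cluster("agglom", reduced_data, a=2, b=8)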
def clustering(docs, n_clusters):  # cluster the documents into n_clusters groups
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=1).fit(docs)  # k-means clustering
    labels = kmeans_model.labels_
    # hmodel = AgglomerativeClustering(n_clusters=n_clusters).fit(docs)  # hierarchical alternative
    # labels = hmodel.labels_
    score = metrics.silhouette_score(np.array(docs), labels, metric='euclidean')  # euclidean distance
    return labels, score
def analyze_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print(" %9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f"
          % (name, time() - t0, estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean', sample_size=samples)))
def evaluate(k):
    km = kmeans[k]
    score = silhouette_score(train_offsets, km.labels_, metric='euclidean',
                             random_state=RANDOM_SEED)
    print('Silhouette score for k=%d is %f.' % (k, score))
    return (k, score)
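evaluate assumes module-level `kmeans` (a dict of fitted models keyed by k), `train_offsets`, and `RANDOM_SEED`; a hedged sketch of that setup with stand-in data:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

RANDOM_SEED = 42
train_offsets = np.random.rand(500, 2)             # stand-in feature matrix
kmeans = {k: KMeans(n_clusters=k, random_state=RANDOM_SEED).fit(train_offsets)
          for k in range(2, 6)}
results = [evaluate(k) for k in sorted(kmeans)]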
def try_kmeans(X):
    """ Run the K-Means algorithm on X with different values of K, and return
    the one that gives the best score.
    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
    """
    best_k = 1
    best_score = -1
    for k in range(2, 20 + 1):
        model = KMeans(n_clusters=k)
        model.fit(X)
        labels = model.predict(X)
        score = silhouette_score(model.transform(X), labels)
        print(k, "->", score)
        if score > best_score:
            best_k = k
            best_score = score
    print("The best K is", best_k)
    return best_k
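A usage sketch for try_kmeans; note that the snippet scores the silhouette on model.transform(X) (the cluster-distance space) rather than on X itself. Random data stands in for the TF-IDF matrix.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X = np.random.rand(200, 30)    # stand-in for the TF-IDF matrix
best_k = try_kmeans(X)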
# Ex3
def ex2_kmeans(X, y):
    """ Applies the KMeans algorithm on X, y using K=10 and prints the
    silhouette score of this model. X and y are returned by transform_text
    above.
    """
    model = KMeans(10).fit(X, y)
    print "Silhouette score: %f" % metrics.silhouette_score(X, model.labels_)
# Ex 3
def ex4_agglomerative_clustering(X, y):
    """ This does the same thing as ex2_kmeans but with an agglomerative
    clustering and K=2.
    """
    # AgglomerativeClustering needs a non-sparse (dense) matrix
    X = X.toarray()
    k = 2
    model = AgglomerativeClustering(k).fit(X, y)
    print "Silhouette score: %f" % metrics.silhouette_score(X, model.labels_)
# Ex 5
def sk_kmeans(core):  #, kval=3
    solrURL = "http://localhost:8983/solr/" + core
    solrInstance = Solr(solrURL)
    list_of_points = []
    docs = solrInstance.query_iterator(query="*:*", start=0)
    for doc in docs:
        list_of_points.append(Vector(doc['id'], doc))
    list_of_Dicts = (point.features for point in list_of_points)
    df = pd.DataFrame(list_of_Dicts)
    df = df.fillna(0)
    silhouettes = {}
    for k in range(2, 10):
        kmeans = KMeans(n_clusters=k,
                        init='k-means++',
                        max_iter=300,  # k-means convergence
                        n_init=10,     # find global minima
                        n_jobs=-2,     # parallelize
                        )
        labels = kmeans.fit_predict(df)
        silhouettes[k] = silhouette_score(df, labels)
    return str(silhouettes)
def nmf_test(df):
    X = df.drop(['Year', 'zipcode'], axis=1).values
    scaler = MinMaxScaler()
    X_sca = scaler.fit_transform(X)
    scores = []
    for k in xrange(2, 11):
        model = NMF(n_components=k)
        W = model.fit_transform(X_sca)
        labels = W.argmax(axis=1)
        score = silhouette_score(X_sca, labels)
        scores.append(score)
    plt.plot(xrange(2, 11), scores, 'b*-')
    plt.show()
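A usage sketch for nmf_test on a hypothetical DataFrame that has the Year and zipcode columns the function drops; the remaining columns must be non-negative for NMF.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(100, 6),
                  columns=['Year', 'zipcode', 'f1', 'f2', 'f3', 'f4'])
nmf_test(df)    # plots silhouette score against the number of NMF components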
def silhouette(self):
    """
    Calculate the silhouette score for a certain clustering.
    Input: None
    Output: silhouette score (float)
    """
    return silhouette_score(self.features, self.cluster_labels)
def h_cluster(wordlist, sims, distmat, thresh=0.01):
    B_, Bs, Ms, Ts, As = hgfc(sims, thresh=thresh)
    sil_coefs = []
    for i, a in enumerate(As):
        l = labels(a)
        if len(set(l)) > 2 and len(set(l)) < len(wordlist) - 1:
            sil_coefs.append(silhouette_score(distmat, labels(a), metric='precomputed'))
        else:
            sil_coefs.append(0.0)
    ld = [labeldict(a, wordlist) for a in As]
    return ld, sil_coefs
def scores(dmat, cluster_labels):
    try:
        silhouette_avg = silhouette_score(dmat, cluster_labels, metric='precomputed', sample_size=100)
        return silhouette_avg
    except:
        return None
def silhcoeff(data, labels):
    arrdata = array(data)
    print("Silhouette coefficient: ", metrics.silhouette_score(arrdata, labels, metric='euclidean'))
################################### PHOTOS ###########################################
# LOAD PHOTOS FROM FOLDER & SAVE IN A LIST [FILENAME,PHOTO,GRAYSCALE_PHOTO]