def bench_k_means(estimator, name, data):
    # Fit the estimator, then print timing, inertia, and a battery of
    # clustering metrics. The free names `time`, `metrics`, `labels`, and
    # `sample_size` are expected at module level (see the driver below).
    t0 = time()
    estimator.fit(data)
    print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))
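The snippet above assumes several module-level names (time, metrics, labels, sample_size), as in scikit-learn's k-means digits benchmark. A minimal driver under that assumption:

from time import time
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale

digits = load_digits()
data = scale(digits.data)
labels = digits.target      # ground truth, consumed by the supervised metrics
sample_size = 300           # subsample used when scoring the silhouette

bench_k_means(KMeans(init='k-means++', n_clusters=10, n_init=10),
              name='k-means++', data=data)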
def get_all_results(self):
    # Compute cluster info for every idea in the discussion, plus a
    # discussion-wide entry keyed by None; drop empty results.
    discussion = self.discussion
    idea_ids = discussion.db.query(Idea.id).filter_by(
        discussion_id=discussion.id).all()
    results = {id: self.get_cluster_info(id)
               for (id,) in idea_ids}
    results[None] = self.get_cluster_info()
    posres = {id: r for (id, r) in results.items() if r is not None}
    # for id, (silhouette_score, compare_with_ideas, clusters, post_info) in posres.items():
    #     log.debug(" ".join((id, silhouette_score, repr([len(x['cluster']) for x in clusters]))))
    return posres
def silhouette_score(self):
    # Lazily compute and cache the silhouette score for the OPTICS
    # clustering over the model matrix.
    if self._silhouette_score is None:
        self._silhouette_score = metrics.silhouette_score(
            self.model_matrix,
            self.optics.as_labels(self.optics_clusters),
            metric=self.metric)
    return self._silhouette_score
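For context, a standalone call to the metric this property caches looks like the following (hypothetical data; the class attributes above merely memoize the result):

import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(100, 4)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
score = metrics.silhouette_score(X, labels, metric='euclidean')  # in [-1, 1]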
def elbow_test(X, max_cluster):
    """
    Perform the elbow test to determine the number of clusters for
    k-means clustering.

    Parameters
    ----------
    X : numpy array
        2D array of floats.

    max_cluster : int
        The maximum number of clusters to consider.

    Returns
    -------
    number of clusters : int
        The number of clusters to use for k-means clustering.
    """
    from sklearn.cluster import KMeans
    from sklearn import metrics

    inertia_list = []
    s_list = []
    # Fit k-means for k = 2 .. max_cluster, recording silhouette and inertia
    for cluster_cnt in range(max_cluster - 1):
        k_means = KMeans(n_clusters=cluster_cnt + 2)
        k_means.fit(X)
        k_means_labels = k_means.labels_
        s_factor = metrics.silhouette_score(X, k_means_labels, metric='euclidean')
        s_list.append(s_factor)
        kmeans_inertia = k_means.inertia_
        inertia_list.append(kmeans_inertia)

    # Look at the drop in inertia between each consecutive cluster count
    inertia_cnt = 0
    i_diff_list = []
    for inertia in inertia_list:
        if inertia_cnt != len(inertia_list) - 1:
            i_diff = inertia - inertia_list[inertia_cnt + 1]
            i_diff_list.append(i_diff)
        inertia_cnt = inertia_cnt + 1

    # The elbow is the largest drop; the best count is the one *after* it.
    # i_diff_list[j] compares k = j + 2 against k = j + 3, hence the + 3.
    max_diff = max(i_diff_list)
    max_diff_index = i_diff_list.index(max_diff)
    best_no_cluster = max_diff_index + 3
    return best_no_cluster
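A quick sketch of how elbow_test might be called, using synthetic blobs (the data and cluster counts here are purely illustrative):

import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=4, random_state=42)
n_clusters = elbow_test(X, max_cluster=10)
print(n_clusters)  # expected to land near 4 for well-separated blobs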
def distortion_score(X, labels, metric='euclidean'):
    """
    Compute the mean distortion of all samples.

    The distortion is computed as the sum of the squared distances between
    each observation and its closest centroid. Logically, this is the metric
    that K-Means attempts to minimize as it is fitting the model.

    .. seealso:: http://kldavenport.com/the-cost-function-of-k-means/

    Parameters
    ----------
    X : array, shape = [n_samples, n_features] or [n_samples_a, n_samples_a]
        Array of pairwise distances between samples if metric == "precomputed",
        or a feature array for computing distances against the labels.

    labels : array, shape = [n_samples]
        Predicted labels for each sample.

    metric : string
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by `sklearn.metrics.pairwise.pairwise_distances
        <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html#sklearn.metrics.pairwise.pairwise_distances>`_

    .. todo:: add sample_size and random_state kwds similar to silhouette_score
    """
    # Encode labels to get unique centers and groups
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    unique_labels = le.classes_

    # Sum of the distortions
    distortion = 0

    # Loop through each encoded label (center) to compute the centroid;
    # iterate over the *encoded* values so the mask matches the transformed
    # labels even when the originals were strings.
    for current_label in le.transform(unique_labels):
        # Mask the instances that belong to the current label
        mask = labels == current_label
        instances = X[mask]

        # Compute the center of these instances
        center = instances.mean(axis=0)

        # Compute the squared distances from the instances to the center
        distances = pairwise_distances(instances, [center], metric=metric)
        distances = distances ** 2

        # Add the mean squared distance to the distortion
        distortion += distances.mean()

    return distortion
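A usage sketch, assuming LabelEncoder and pairwise_distances have been imported into the module namespace as the function expects:

import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
print(distortion_score(X, labels))  # lower means tighter clusters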
##########################################################################
## Elbow Method
##########################################################################
def runClustering(cluster_df):
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    # Model on every column whose name does not contain 'NOTMODEL'
    Xcols = [col for col in cluster_df.columns if 'NOTMODEL' not in col.upper()]

    # Convert character columns to dummy variables
    X = cluster_df[Xcols]
    cols = X.columns
    num_cols = X._get_numeric_data().columns
    char_cols = list(set(cols) - set(num_cols))
    for col in char_cols:
        if len(X[col].unique()) <= 20:
            dummy = pd.get_dummies(X[col], prefix='dm' + col)
            column_name = X.columns.values.tolist()
            column_name.remove(col)
            X = X[column_name].join(dummy)
        else:
            # If more than 20 distinct values then delete
            if col in X.columns:
                del X[col]

    # Standardize (z-score normalize) all continuous variables
    from scipy.stats import zscore
    for col in X:
        if len(X[col].unique()) > 2:  # Standardize non-dummy variables
            col_zscore = 'z_' + col
            X[col_zscore] = zscore(X[col])
            del X[col]

    # Fill missing values with 0 = the mean in the z-normalized data.
    # Obviously missing values can be handled in many different ways.
    X.fillna(0, inplace=True)

    # Convert to a NumPy array for the KMeans clustering class
    # (DataFrame.as_matrix() was removed from pandas; use .to_numpy())
    data_for_clustering_matrix = X.to_numpy()

    number_of_Clusters = []
    silhouette_value = []

    # Loop over 2 through 20 clusters and record each silhouette score
    k = range(2, 21)
    for i in k:
        clustering_method = KMeans(n_clusters=i)
        clustering_method.fit(data_for_clustering_matrix)
        labels = clustering_method.predict(data_for_clustering_matrix)
        silhouette_average = silhouette_score(data_for_clustering_matrix, labels)
        silhouette_value.append(silhouette_average)
        number_of_Clusters.append(int(i))

    # Fit the k-means model with the cluster count that had the highest
    # silhouette score
    max_value = max(silhouette_value)
    indexMaxValue = silhouette_value.index(max_value)
    clustering_method = KMeans(n_clusters=number_of_Clusters[indexMaxValue])
    clustering_method.fit(data_for_clustering_matrix)
    labels = clustering_method.predict(data_for_clustering_matrix)

    # Score the dataframe
    cluster_df['cluster'] = labels
    return cluster_df
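An illustrative call with a small pandas DataFrame (hypothetical columns; anything whose name contains 'NOTMODEL' is kept out of the model):

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'age': np.random.randint(18, 80, 200),
    'income': np.random.normal(50000, 15000, 200),
    'segment': np.random.choice(['a', 'b', 'c'], 200),
    'id_NOTMODEL': range(200),
})
scored = runClustering(df)
print(scored['cluster'].value_counts())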
def compute_affinity_propagation(preference_, X):
    # DATA FILLING
    centers = [[1, 1], [-1, -1], [1, -1]]
    n_samples = 300
    # make_blobs is used to generate the labels_true array; note the
    # `X is None` check (== against a NumPy array would be ambiguous)
    if X is None:
        X, labels_true = make_blobs(n_samples=n_samples, centers=centers,
                                    cluster_std=1, random_state=0)
        print("Data is none!!!")
        print("Generating " + str(n_samples) + " samples")
    else:
        data, labels_true = make_blobs(n_samples=len(X), centers=centers,
                                       cluster_std=1, random_state=0)

    af = AffinityPropagation(preference=preference_).fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
    # print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    print("Fowlkes Mallows Score: %0.3f" % metrics.fowlkes_mallows_score(labels_true, labels))

    # Plot each cluster in its own color, with a line from every exemplar
    # to its members
    plt.close('all')
    plt.figure(1)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = X[cluster_centers_indices[k]]
        plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o',
                 markerfacecolor=col, markeredgecolor='k', markersize=14)
        for x in X[class_members]:
            plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
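The function generates its own blobs when no data is passed, so a smoke test needs only the imports its free names rely on (the preference value below is illustrative):

import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import metrics
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs

compute_affinity_propagation(-50, None)  # prints metrics and shows the plot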
def expectation_maximization(data, nc, cv_type='full', req_info=None):
    # Uses the legacy sklearn.mixture.GMM API (removed in scikit-learn 0.20;
    # its successor GaussianMixture takes `tol` instead of `thresh` and
    # exposes `covariances_` instead of `covars_`).
    gmm = GMM(n_components=nc, covariance_type=cv_type, thresh=1.0E-4, n_init=10)
    gmm.fit(data)
    labels = gmm.predict(data)

    if req_info == 'all':
        req_info = ['aic', 'bic', 'converged', 'weights', 'means', 'covars',
                    'silhouette', 'proba']
    elif req_info is None:
        req_info = []

    info = {}
    if 'aic' in req_info:
        info['aic'] = gmm.aic(data)
    if 'bic' in req_info:
        info['bic'] = gmm.bic(data)
    if 'converged' in req_info:
        info['converged'] = gmm.converged_
    if 'weights' in req_info:
        info['weights'] = gmm.weights_
    if 'means' in req_info:
        info['means'] = gmm.means_
    if 'covars' in req_info:
        # Normalize covariances to a full (nc, n_features, n_features) array
        if cv_type == 'full':
            info['covars'] = gmm.covars_
        elif cv_type == 'tied':
            # Tied covariance is a single (n_features, n_features) matrix;
            # replicate it once per component
            cov = np.empty((nc, gmm.covars_.shape[0], gmm.covars_.shape[1]))
            for i in range(nc):
                cov[i] = gmm.covars_.copy()
            info['covars'] = cov
        else:
            # Diag/spherical covariances have shape (nc, n_features); expand
            # each row into an (n_features, n_features) diagonal matrix
            cov = np.empty((nc, gmm.covars_.shape[1], gmm.covars_.shape[1]))
            for i in range(nc):
                cov[i] = np.diag(gmm.covars_[i])
            info['covars'] = cov
    if 'silhouette' in req_info:
        info['silhouette'] = metrics.silhouette_score(data, labels)
    if 'proba' in req_info:
        info['proba'] = gmm.predict_proba(data).T

    return labels, info
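A usage sketch; note the legacy GMM import only resolves on old scikit-learn releases (before 0.20), so treat this as illustrative:

import numpy as np
from sklearn import metrics
from sklearn.mixture import GMM  # legacy API; GaussianMixture in modern releases
from sklearn.datasets import make_blobs

data, _ = make_blobs(n_samples=300, centers=3, random_state=0)
labels, info = expectation_maximization(data, nc=3, req_info=['bic', 'silhouette'])
print(info['bic'], info['silhouette'])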
def calculate_cluster_scores(x, cluster_labels, output):
    with open("%s_scores.log" % output, "w+") as fh:
        # Filter out the singleton "cluster" (labeled as -1)
        filtered_x, filtered_cluster_labels, singletons = ([] for _ in range(3))
        cluster_groups = defaultdict(list)
        for vec, lab in zip(x, cluster_labels):
            if lab != -1:
                filtered_x.append(vec)
                filtered_cluster_labels.append(lab)
                cluster_groups[lab].append(vec)
            else:
                singletons.append(vec)

        total = len(filtered_x) + len(singletons)
        ln = "Number of clustered events: %d/%d (%f%%)\n" % (
            len(filtered_x), total, (len(filtered_x) / total) * 100)
        print(ln.strip("\n"))
        fh.write(ln)

        for group in cluster_groups:
            n_events = len(cluster_groups[group])
            ln = "Cluster %d contains %d events\n" % (group, n_events)
            print(ln.strip("\n"))
            fh.write(ln)

        # RMSSTD per cluster (calculate_rmsstd is defined elsewhere in the module)
        rmsstd_scores = []
        for group in cluster_groups:
            rmsstd = calculate_rmsstd(np.array(cluster_groups[group]))
            ln = "The RMSSTD score for cluster %d is %f\n" % (group, rmsstd)
            print(ln.strip("\n"))
            fh.write(ln)
            rmsstd_scores.append(rmsstd)

        try:
            silhouette_avg = silhouette_score(np.array(filtered_x),
                                              np.array(filtered_cluster_labels))
            ln = "The average silhouette score is : %f\n" % silhouette_avg
            print(ln.strip("\n"))
            fh.write(ln)
        except ValueError:
            # Raised when fewer than 2 distinct cluster labels remain
            silhouette_avg = float("nan")
            ln = "Impossible to calculate silhouette score. Only 1 cluster group identified.\n"
            print(ln.strip("\n"))
            fh.write(ln)
    return silhouette_avg, rmsstd_scores
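An illustrative driver using DBSCAN, which labels noise points -1 exactly as this function expects; calculate_rmsstd is a project-local helper assumed to be defined alongside the function above:

import numpy as np
from collections import defaultdict
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.6, random_state=0)
labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(X)
avg, rmsstd = calculate_cluster_scores(X, labels, output='run1')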