# NOTE: `davies_bouldin` and `gap` are helper functions defined elsewhere in the
# original source; hedged sketches of both are given after this function.
from math import sqrt

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from sklearn import cluster
from sklearn.cluster import KMeans, AgglomerativeClustering


def optimal_clustering(df, patch, method='kmeans', statistic='gap', max_K=5):
    # A single-element patch cannot be split any further.
    if len(patch) == 1:
        return [patch]

    if statistic == 'db':
        if method == 'kmeans':
            # Cap the number of clusters at half the patch size (and at max_K).
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                kmeans = cluster.KMeans(n_clusters=k).fit(X)
                clustering[k] = pd.DataFrame(kmeans.predict(X), index=patch)
                # Pairwise distances between cluster centres and per-cluster dispersion.
                dist_mu = squareform(pdist(kmeans.cluster_centers_))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            # Davies-Bouldin: lower is better; k starts at 2.
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index)
                    for i in range(k_optimal)]

        elif method == 'agglomerative':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                agglomerative = cluster.AgglomerativeClustering(n_clusters=k, linkage='average')
                clustering[k] = pd.DataFrame(agglomerative.fit_predict(X), index=patch)
                # Agglomerative clustering has no centroids, so compute them explicitly.
                tmp = [list(clustering[k][clustering[k][0] == i].index) for i in range(k)]
                centers = np.array([np.mean(X.loc[c, :], axis=0) for c in tmp])
                dist_mu = squareform(pdist(centers))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index)
                    for i in range(k_optimal)]

    elif statistic == 'gap':
        # The gap statistic is only implemented for k-means here.
        X = np.array(df.loc[patch, :])
        if method == 'kmeans':
            f = cluster.KMeans
            gaps = gap(X, ks=range(1, min(max_K, len(patch))), method=f)
            # Gap statistic: higher is better; k starts at 1.
            k_optimal = list(gaps).index(max(gaps)) + 1
            clustering = pd.DataFrame(f(n_clusters=k_optimal).fit_predict(X), index=patch)
            return [list(clustering[clustering[0] == i].index) for i in range(k_optimal)]

    else:
        raise ValueError('only the "db" and "gap" statistics are supported')
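# Minimal sketch of the `davies_bouldin` helper called above; this is an assumption,
# since the original source defines its own version elsewhere. It takes the matrix of
# pairwise centroid distances and the per-cluster dispersions and returns the
# Davies-Bouldin index: the mean over clusters of the worst-case ratio
# (sigma_i + sigma_j) / d(mu_i, mu_j), so lower values mean better-separated clusters.
def davies_bouldin(dist_mu, sigma):
    k = len(sigma)
    ratios = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            if i != j:
                ratios[i, j] = (sigma[i] + sigma[j]) / dist_mu[i, j]
    # For each cluster keep its worst (largest) ratio, then average over clusters.
    return np.mean(ratios.max(axis=1))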
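# Minimal sketch of a gap-statistic helper with the call signature used above
# (gap(X, ks=..., method=...)); again an assumption, as the original source ships its
# own implementation. Following Tibshirani et al. (2001), it compares log(W_k) on the
# data with the mean log(W_k) over uniform reference samples drawn from the bounding
# box of X, and returns one gap value per k (a larger gap favours that k).
def gap(X, ks, method=cluster.KMeans, n_refs=10, random_state=0):
    rng = np.random.RandomState(random_state)
    mins, maxs = X.min(axis=0), X.max(axis=0)

    def within_dispersion(data, k):
        # Sum of squared distances from each point to its cluster centroid.
        if k == 1:
            return np.sum((data - data.mean(axis=0)) ** 2)
        labels = method(n_clusters=k).fit(data).labels_
        return sum(np.sum((data[labels == i] - data[labels == i].mean(axis=0)) ** 2)
                   for i in range(k))

    gaps = []
    for k in ks:
        ref_log_disps = [np.log(within_dispersion(rng.uniform(mins, maxs, size=X.shape), k))
                         for _ in range(n_refs)]
        gaps.append(np.mean(ref_log_disps) - np.log(within_dispersion(X, k)))
    return np.array(gaps)


# Usage sketch for optimal_clustering (hypothetical data, not part of the original
# source): `patch` is a list of row labels of `df`, and the function returns those
# labels regrouped into sub-clusters.
# df = pd.DataFrame(np.random.rand(20, 3), index=['s%d' % i for i in range(20)])
# sub_patches = optimal_clustering(df, list(df.index), method='kmeans', statistic='gap')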
def plot_cluster(reduced_data, cluster_type, k_clusters, plot_title):
    if cluster_type.lower() == "kmeans":
        clus = KMeans(init='k-means++', n_clusters=k_clusters, n_init=10)
    elif cluster_type.lower() == "agglom":
        clus = AgglomerativeClustering(n_clusters=k_clusters)
    else:
        raise ValueError('cluster_type must be "kmeans" or "agglom"')
    clus.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max] x [y_min, y_max].

    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1

    plt.figure(1, figsize=(15, 10))
    plt.clf()

    if cluster_type.lower() == "kmeans":
        # Plot the decision boundary by assigning a colour to each point of a mesh.
        # Only k-means supports predict(); AgglomerativeClustering does not.
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        Z = clus.predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a colour plot.
        Z = Z.reshape(xx.shape)
        plt.imshow(Z, interpolation='nearest',
                   extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                   cmap=plt.cm.Paired,
                   aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=10)

    if cluster_type.lower() == "kmeans":
        # Plot the centroids as a white X.
        centroids = clus.cluster_centers_
        plt.scatter(centroids[:, 0], centroids[:, 1],
                    marker='x', s=169, linewidths=3,
                    color='w', zorder=10)

    plt.title(plot_title)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
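# Usage sketch (hypothetical data, not part of the original source). plot_cluster
# expects a 2-column array, e.g. data projected to 2-D with PCA, so that the mesh used
# for the decision-boundary image lines up with the scatter of points.
if __name__ == '__main__':
    from sklearn.datasets import make_blobs
    from sklearn.decomposition import PCA

    blobs, _ = make_blobs(n_samples=300, centers=4, n_features=6, random_state=0)
    reduced = PCA(n_components=2).fit_transform(blobs)
    plot_cluster(reduced, 'kmeans', 4, 'k-means on PCA-reduced blobs')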