python类AgglomerativeClustering()的实例源码

network.py 文件源码 项目:sakmapper 作者: szairis 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def optimal_clustering(df, patch, method='kmeans', statistic='gap', max_K=5):
    if len(patch) == 1:
        return [patch]

    if statistic == 'db':
        if method == 'kmeans':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) / 2, max_K)
            clustering = {}
            db_index = []
            X = df.ix[patch, :]
            for k in range(2, K_max + 1):
                kmeans = cluster.KMeans(n_clusters=k).fit(X)
                clustering[k] = pd.DataFrame(kmeans.predict(X), index=patch)
                dist_mu = squareform(pdist(kmeans.cluster_centers_))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.ix[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

        elif method == 'agglomerative':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) / 2, max_K)
            clustering = {}
            db_index = []
            X = df.ix[patch, :]
            for k in range(2, K_max + 1):
                agglomerative = cluster.AgglomerativeClustering(n_clusters=k, linkage='average').fit(X)
                clustering[k] = pd.DataFrame(agglomerative.fit_predict(X), index=patch)
                tmp = [list(clustering[k][clustering[k][0] == i].index) for i in range(k)]
                centers = np.array([np.mean(X.ix[c, :], axis=0) for c in tmp])
                dist_mu = squareform(pdist(centers))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.ix[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

    elif statistic == 'gap':
        X = np.array(df.ix[patch, :])
        if method == 'kmeans':
            f = cluster.KMeans
        gaps = gap(X, ks=range(1, min(max_K, len(patch))), method=f)
        k_optimal = list(gaps).index(max(gaps))+1
        clustering = pd.DataFrame(f(n_clusters=k_optimal).fit_predict(X), index=patch)
        return [list(clustering[clustering[0] == i].index) for i in range(k_optimal)]

    else:
        raise 'error: only db and gat statistics are supported'
cluster.py 文件源码 项目:NBAPlayerValue 作者: TWanish 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def plot_cluster(reduced_data, cluster_type, k_clusters, plot_title):
    if cluster_type.lower() == "kmeans":
        clus = KMeans(init='k-means++', n_clusters=k_clusters, n_init=10)

    elif cluster_type.lower() == "agglom":
        clus = AgglomerativeClustering(n_clusters = k_clusters)

    clus.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = clus.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(15,10))
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=10)

    if cluster_type.lower() == "kmeans":
        # Plot the centroids as a white X
        centroids = clus.cluster_centers_
        plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)

    plt.title(plot_title)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()


问题


面经


文章

微信
公众号

扫码关注公众号