python类silhouette_score()的实例源码

plot_kmeans_digits.py 文件源码 项目:Parallel-SGD 作者: angadgill 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def bench_k_means(estimator, name, data):
    """Fit ``estimator`` on ``data`` and print one row of clustering scores.

    The printed row contains, in order: ``name``, the wall-clock fit time in
    seconds, the estimator's inertia, homogeneity, completeness, V-measure,
    adjusted Rand index, adjusted mutual information and silhouette score.

    The reference ``labels`` and the silhouette ``sample_size`` are not
    parameters; they are looked up in the enclosing module scope.
    """
    started = time()
    estimator.fit(data)
    elapsed = time() - started

    predicted = estimator.labels_

    # Metrics that compare the predicted clustering with the reference labels.
    reference_scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    reference_scores = tuple(scorer(labels, predicted)
                             for scorer in reference_scorers)

    # The silhouette needs no reference labels, only the data itself.
    silhouette = metrics.silhouette_score(data, predicted,
                                          metric='euclidean',
                                          sample_size=sample_size)

    row = (name, elapsed, estimator.inertia_) + reference_scores + (silhouette,)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f' % row)
clusters.py 文件源码 项目:idealoom 作者: conversence 项目源码 文件源码 阅读 41 收藏 0 点赞 0 评论 0
def get_all_results(self):
    """Return cluster info for every idea of the discussion, plus a global entry.

    The result maps each idea id of ``self.discussion`` to whatever
    ``self.get_cluster_info(idea_id)`` returns, and maps ``None`` to the
    result of ``self.get_cluster_info()`` called without an id. Entries whose
    cluster info is ``None`` are left out.
    """
    discussion = self.discussion
    id_rows = discussion.db.query(Idea.id).filter_by(
        discussion_id=discussion.id).all()

    collected = {}
    # Each row is a one-element tuple holding the idea id.
    for (idea_id,) in id_rows:
        collected[idea_id] = self.get_cluster_info(idea_id)
    # The ``None`` key holds the result of the call made without an idea id.
    collected[None] = self.get_cluster_info()

    return {key: info for key, info in collected.items() if info is not None}
clusters.py 文件源码 项目:idealoom 作者: conversence 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def silhouette_score(self):
    """Return the silhouette score of the current clustering, computing it once.

    The value is cached in ``self._silhouette_score``; it is only computed
    while that attribute is still ``None``.
    """
    cached = self._silhouette_score
    if cached is not None:
        return cached

    cluster_labels = self.optics.as_labels(self.optics_clusters)
    self._silhouette_score = metrics.silhouette_score(
        self.model_matrix, cluster_labels, metric=self.metric)
    return self._silhouette_score
analyse_xml.py 文件源码 项目:py4design 作者: chenkianwee 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def elbow_test(X, max_cluster):
    """
    This function performs the elbow test to determine the number of clusters for k-means clustering.

    K-means is fitted for every cluster count from 2 up to ``max_cluster``
    (inclusive). The returned count is the one reached by the largest drop in
    inertia between two consecutive cluster counts.

    Parameters
    ----------
    X : numpy array
        2d list of floats.

    max_cluster : int
        The maximum number of clusters to consider. Must be at least 3 so
        that there are two consecutive cluster counts to compare.

    Returns
    -------
    number of clusters : int
        The number of clusters for kmeans clustering

    Raises
    ------
    ValueError
        If ``max_cluster`` is smaller than 3.
    """
    # Fail early with a clear message instead of letting max() choke on an
    # empty list after the models have already been fitted.
    if max_cluster < 3:
        raise ValueError(
            "max_cluster must be at least 3 to compare consecutive cluster "
            "counts, got %r" % (max_cluster,))

    from sklearn.cluster import KMeans

    # Inertia for k = 2, 3, ..., max_cluster. No silhouette score is computed
    # here: the elbow decision only looks at inertia.
    inertia_list = [KMeans(n_clusters=cluster_cnt).fit(X).inertia_
                    for cluster_cnt in range(2, max_cluster + 1)]

    # i_diff_list[i] is the inertia drop when going from i + 2 to i + 3 clusters.
    i_diff_list = [current - following
                   for current, following in zip(inertia_list, inertia_list[1:])]

    # +3 because index 0 is the step that arrives at 3 clusters.
    return i_diff_list.index(max(i_diff_list)) + 3
elbow.py 文件源码 项目:yellowbrick 作者: DistrictDataLabs 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def distortion_score(X, labels, metric='euclidean'):
    """
    Compute the distortion of a clustering.

    For every cluster the centroid is taken as the mean of its instances, the
    distances from those instances to the centroid are squared and averaged,
    and the per-cluster averages are summed. Logically, this is closely
    related to the quantity that K-Means attempts to minimize as it is
    fitting the model.

    .. seealso:: http://kldavenport.com/the-cost-function-of-k-means/

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        Feature array. Rows are selected with a boolean mask and averaged to
        obtain centroids.
        NOTE(review): a precomputed distance matrix does not look meaningful
        here because centroids are computed from the rows of X - confirm
        before passing ``metric="precomputed"``.

    labels : array, shape = [n_samples]
        Predicted labels for each sample. Any label values accepted by
        ``LabelEncoder`` may be used; they do not need to be 0..k-1.

    metric : string
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by `sklearn.metrics.pairwise.pairwise_distances
        <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html#sklearn.metrics.pairwise.pairwise_distances>`_

    Returns
    -------
    distortion : float
        Sum over clusters of the mean squared distance to the centroid.

    .. todo:: add sample_size and random_state kwds similar to silhouette_score
    """
    # Encode labels as integer codes 0..k-1. The codes (not the original label
    # values stored in ``classes_``) are what must be compared below, since
    # ``encoded`` no longer contains the original values.
    le = LabelEncoder()
    encoded = le.fit_transform(labels)
    n_clusters = len(le.classes_)

    # Sum of the distortions
    distortion = 0

    # Loop through each label code (center) to compute the centroid
    for code in range(n_clusters):
        # Mask the instances that belong to the current cluster
        mask = encoded == code
        instances = X[mask]

        # Compute the center of these instances
        center = instances.mean(axis=0)

        # Compute the square distances from the instances to the center
        distances = pairwise_distances(instances, [center], metric=metric)
        distances = distances ** 2

        # Add the mean square distance to the distortion
        distortion += distances.mean()

    return distortion


##########################################################################
## Elbow Method
##########################################################################
ml_algorithms.py 文件源码 项目:webdataconnector_ml 作者: DoubleEE 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def runClustering(cluster_df):
    """Cluster the rows of ``cluster_df`` with K-Means and label each row.

    Columns whose name contains ``NOTMODEL`` (case-insensitive) are ignored.
    Non-numeric columns with at most 20 distinct values are turned into dummy
    variables, other non-numeric columns are dropped. Columns with more than
    two distinct values are z-score standardized and missing values are
    replaced by 0. K-Means is then run for 2 to 20 clusters (never more than
    ``n_samples - 1``) and the clustering with the highest silhouette score
    is kept; on ties the smallest number of clusters wins.

    Parameters
    ----------
    cluster_df : pandas.DataFrame
        Data to cluster. It is modified in place: a ``'cluster'`` column with
        the cluster label of each row is added.

    Returns
    -------
    pandas.DataFrame
        The same ``cluster_df`` object, with the ``'cluster'`` column set.

    Raises
    ------
    ValueError
        If there are fewer than 3 rows, since the silhouette score needs at
        least 2 clusters and fewer clusters than samples.
    """
    from scipy.stats import zscore
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    Xcols = [col for col in cluster_df.columns if 'NOTMODEL' not in col.upper()]

    # Work on a copy so that adding/removing columns never touches (or warns
    # about) the caller's frame.
    X = cluster_df[Xcols].copy()

    # Convert character columns to dummy variables
    num_cols = X._get_numeric_data().columns
    char_cols = list(set(X.columns) - set(num_cols))
    for col in char_cols:
        if len(X[col].unique()) <= 20:
            dummy = pd.get_dummies(X[col], prefix='dm' + col)
            X = X.drop(col, axis=1).join(dummy)
        else:
            # If more than 20 distinct values then delete
            X = X.drop(col, axis=1)

    # Standardize (Z-score normalize) all continuous variables.
    # Iterate over a snapshot because columns are replaced inside the loop.
    for col in list(X.columns):
        if len(X[col].unique()) > 2:    # Standardize non-dummy variables
            X['z_' + col] = zscore(X[col])
            del X[col]

    # Fill missing values with 0 = the mean in the z-normalize data
    # Obviously missing values can be handled in many different ways
    X = X.fillna(0)

    # convert to numpy array to use in KMeans clustering class
    # (DataFrame.as_matrix() no longer exists in current pandas)
    data_for_clustering_matrix = X.values

    # The silhouette score is only defined for 2 <= k <= n_samples - 1.
    n_samples = data_for_clustering_matrix.shape[0]
    max_clusters = min(20, n_samples - 1)
    if max_clusters < 2:
        raise ValueError(
            "At least 3 rows are required to select a number of clusters, "
            "got %d" % n_samples)

    # Try every candidate number of clusters and remember the labels of the
    # clustering with the highest silhouette score.
    best_score = None
    best_labels = None
    for n_clusters in range(2, max_clusters + 1):
        clustering_method = KMeans(n_clusters=n_clusters)
        clustering_method.fit(data_for_clustering_matrix)
        labels = clustering_method.predict(data_for_clustering_matrix)
        silhouette_average = silhouette_score(data_for_clustering_matrix, labels)
        # Strict comparison keeps the first (smallest) k on ties.
        if best_score is None or silhouette_average > best_score:
            best_score = silhouette_average
            best_labels = labels

    # SCORE THE DATAFRAME
    cluster_df['cluster'] = best_labels
    return cluster_df
AffinityPropagation.py 文件源码 项目:ProjectOfDataMining 作者: IljaNovo 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def compute_affinity_propagation(preference_, X):
    """Run Affinity Propagation on ``X``, print clustering metrics and plot.

    Parameters
    ----------
    preference_ :
        Value passed as ``preference`` to ``AffinityPropagation``.
    X : array or None
        Samples to cluster. The plot uses the first two columns. When
        ``None``, 300 synthetic samples are generated with ``make_blobs``
        around three fixed centers.

    Notes
    -----
    The reference labels used by the printed metrics always come from
    ``make_blobs``. NOTE(review): when ``X`` is supplied by the caller these
    synthetic labels have no visible relation to ``X``, so the printed
    scores are presumably not meaningful in that case - confirm with the
    caller.
    """
    centers = [[1, 1], [-1, -1], [1, -1]]
    n_samples = 300
    # make_blobs is used to generate the labels_true array.
    # ``is None`` is required: ``X == None`` on an array compares elementwise
    # and cannot be used as a condition.
    if X is None:
        X, labels_true = make_blobs(n_samples = n_samples, centers=centers, cluster_std=1, random_state=0)
        print("Data is none!!!")
        print("Generating " + str(n_samples) + " samples")
    else:
        # Only the labels are needed; the generated samples are discarded.
        _, labels_true = make_blobs(n_samples=len(X), centers=centers, cluster_std=1, random_state=0)

    af = AffinityPropagation(preference=preference_).fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Fowlkes Mallows Score: %0.3f" % metrics.fowlkes_mallows_score(labels_true, labels))

    plt.close('all')
    plt.figure(1)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = X[cluster_centers_indices[k]]
        # Members as dots, the exemplar as a large circle.
        plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14)
        # One line from the exemplar to each of its members.
        for x in X[class_members]:
            plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
Clustering.py 文件源码 项目:GRIPy 作者: giruenf 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def expectation_maximization(data, nc, cv_type='full', req_info=None):
    """Fit a Gaussian mixture to ``data`` and return labels plus optional info.

    Parameters
    ----------
    data : array
        Samples passed to ``GMM.fit`` / ``GMM.predict``.
    nc : int
        Number of mixture components.
    cv_type : str
        Covariance type passed to ``GMM`` as ``covariance_type``.
    req_info : list of str, 'all' or None
        Names of the extra results to compute. Recognised names are
        ``'aic'``, ``'bic'``, ``'converged'``, ``'weights'``, ``'means'``,
        ``'covars'``, ``'silhouette'`` and ``'proba'``. ``'all'`` requests
        every one of them, ``None`` requests none.

    Returns
    -------
    labels : array
        Component predicted for each sample.
    info : dict
        One entry per requested name. ``'covars'`` is always expanded to one
        full square matrix per component; ``'proba'`` is the transposed
        result of ``predict_proba``.
    """
    gmm = GMM(n_components=nc, covariance_type=cv_type, thresh=1.0E-4, n_init=10)
    gmm.fit(data)

    labels = gmm.predict(data)

    if req_info == 'all':
        req_info = ['aic', 'bic', 'converged', 'weights', 'means', 'covars',
                    'silhouette', 'proba']
    elif req_info is None:
        req_info = []

    info = {}
    if 'aic' in req_info:
        info['aic'] = gmm.aic(data)
    if 'bic' in req_info:
        info['bic'] = gmm.bic(data)
    if 'converged' in req_info:
        info['converged'] = gmm.converged_
    if 'weights' in req_info:
        info['weights'] = gmm.weights_
    if 'means' in req_info:
        info['means'] = gmm.means_
    if 'covars' in req_info:
        if cv_type == 'full':
            info['covars'] = gmm.covars_
        elif cv_type == 'tied':
            # A single matrix is shared by all components: repeat it nc times.
            info['covars'] = np.array([gmm.covars_.copy() for _ in range(nc)])
        else:
            # Here covars_[i] is treated as the variance vector of component
            # i (the legacy GMM documents shape (n_components, n_features)
            # for 'diag' and 'spherical'). Each vector becomes an
            # (n_features, n_features) diagonal matrix, so the result has
            # shape (nc, n_features, n_features). Sizing the buffer from
            # covars_.shape directly would give (nc, nc, n_features).
            info['covars'] = np.array([np.diag(gmm.covars_[i])
                                       for i in range(nc)])
    if 'silhouette' in req_info:
        info['silhouette'] = metrics.silhouette_score(data, labels)
    if 'proba' in req_info:
        info['proba'] = gmm.predict_proba(data).T

    return labels, info
cluster_tools.py 文件源码 项目:SUPPA 作者: comprna 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def calculate_cluster_scores(x, cluster_labels, output):
    """Log cluster sizes, per-cluster RMSSTD and the average silhouette score.

    Every line is printed and also written to ``<output>_scores.log``.
    Samples labelled ``-1`` are treated as unclustered and excluded from all
    scores.

    Parameters
    ----------
    x : iterable
        One vector per event, aligned with ``cluster_labels``.
    cluster_labels : iterable
        Cluster label of each event; ``-1`` marks an unclustered event.
    output : str
        Prefix of the log file path.

    Returns
    -------
    silhouette_avg : float
        Average silhouette score of the clustered events, or ``nan`` when
        ``silhouette_score`` rejects the input with a ``ValueError``.
    rmsstd_scores : list
        RMSSTD of each cluster, in order of first appearance of its label.
    """
    def emit(fh, ln):
        # Echo a log line to stdout and append it to the log file.
        print(ln.strip("\n"))
        fh.write(ln)

    with open("%s_scores.log" % output, "w+") as fh:
        # Filter out singleton "cluster" (labeled as -1)
        filtered_x, filtered_cluster_labels = [], []
        n_singletons = 0
        cluster_groups = defaultdict(list)
        for vec, lab in zip(x, cluster_labels):
            if lab != -1:
                filtered_x.append(vec)
                filtered_cluster_labels.append(lab)
                cluster_groups[lab].append(vec)
            else:
                n_singletons += 1

        n_clustered = len(filtered_x)
        n_total = n_clustered + n_singletons
        # Guard against empty input and force float division so the
        # percentage is also correct under Python 2.
        percentage = (float(n_clustered) / n_total) * 100 if n_total else 0.0
        emit(fh, "Number of clustered events: %d/%d (%f%%)\n"
             % (n_clustered, n_total, percentage))

        for group, members in cluster_groups.items():
            emit(fh, "Cluster %d contains %d events\n" % (group, len(members)))

        rmsstd_scores = []
        for group, members in cluster_groups.items():
            rmsstd = calculate_rmsstd(np.array(members))
            emit(fh, "The RMSSTD score for cluster %d is %f\n" % (group, rmsstd))
            rmsstd_scores.append(rmsstd)

        try:
            silhouette_avg = silhouette_score(np.array(filtered_x), np.array(filtered_cluster_labels))
        except ValueError:
            # Only ValueError is handled (e.g. a single cluster); anything
            # else is a genuine error and propagates.
            silhouette_avg = float("nan")
            emit(fh, "Impossible to calculate silhouette score. Only 1 cluster group identified.\n")
        else:
            emit(fh, "The average silhouette score is : %f\n" % silhouette_avg)

    return silhouette_avg, rmsstd_scores


问题


面经


文章

微信
公众号

扫码关注公众号