python类silhouette_samples()的实例源码-面圈网

Silhouette.py 文件源码项目：SecuML 作者: ANSSI-FR 项目源码文件源码阅读 30 收藏 0 点赞 0 评论 0

def generateEvaluation(self, output_dir, assigned_clusters, quick = False):
        """Compute the per-sample and average silhouette values of a clustering.

        When ``quick`` is true the computation is skipped entirely:
        ``self.silhouette_avg`` is set to 0 and nothing else happens.

        Otherwise the per-sample values are stored in
        ``self.sample_silhouette_values``, their mean in
        ``self.silhouette_avg``, and ``self.printSilhouette`` is called
        with ``output_dir`` and ``assigned_clusters``.

        Args:
            output_dir: forwarded unchanged to ``self.printSilhouette``.
            assigned_clusters: cluster label of each instance.
            quick: skip the silhouette computation when true.
        """
        if quick:
            self.silhouette_avg = 0
            return

        if self.distances is None:
            # No distance matrix available: score on the raw features.
            features = self.instances.getFeatures()
            per_sample = silhouette_samples(features, assigned_clusters)
        else:
            # self.distances is handed over as a precomputed distance matrix.
            per_sample = silhouette_samples(self.distances,
                                            assigned_clusters,
                                            metric = 'precomputed')

        self.sample_silhouette_values = per_sample
        self.silhouette_avg = np.mean(per_sample)
        self.printSilhouette(output_dir, assigned_clusters)

    # Code from a scikit-learn example:
    # Selecting the number of clusters with silhouette analysis on KMeans clustering

silhouette.py 文件源码项目：yellowbrick 作者: DistrictDataLabs 项目源码文件源码阅读 31 收藏 0 点赞 0 评论 0

def fit(self, X, y=None, **kwargs):
        """
        Fit the wrapped estimator on X and draw the silhouette visualization.

        After fitting, the following attributes are set on the visualizer:
        ``n_samples`` (from ``X.shape[0]``), ``n_clusters`` (read from the
        estimator), ``silhouette_score_`` and ``silhouette_samples_``.

        TODO: decide whether drawing belongs here or in the score method.
        NOTE: score is probably the better home, but the standard score
        differs slightly and it is unclear how it is used.

        Returns the visualizer itself (``self``).
        """
        # Train the underlying clustering estimator first; everything
        # below reads state produced by this call.
        self.estimator.fit(X, y, **kwargs)

        # Record the dataset size and the number of clusters
        self.n_samples = X.shape[0]
        self.n_clusters = self.estimator.n_clusters

        # Score the clustering: one overall value plus one value per sample
        cluster_labels = self.estimator.predict(X)
        self.silhouette_score_ = silhouette_score(X, cluster_labels)
        self.silhouette_samples_ = silhouette_samples(X, cluster_labels)

        # Render the figure from the predicted labels
        self.draw(cluster_labels)

        return self

tp2_solutions.py 文件源码项目：TPs 作者: DataMiningP7 项目源码文件源码阅读 32 收藏 0 点赞 0 评论 0

def ex3_kmeans(X, y):
    """ Tries to find the best value for K when applying the KMeans algorithm
    on X, y. There are multiple ways to score a model but here we compute the
    ratio of samples with a negative Silhouette score and minimize it, for K
    from 2 to 20 (inclusive).

    On ties the smallest K wins, because a later K only replaces the current
    best when its ratio is strictly lower.

    Args:
        X: the samples to cluster.
        y: passed through to ``KMeans.fit`` alongside X.

    Returns:
        best_k: the value of K that gives the lowest ratio.
        best_score: the ratio associated with best_k (a float in [0, 1]).
    """
    best_k = 1
    # Lower is better, so start above any reachable ratio. The previous
    # version started at -1 and kept the *largest* ratio, i.e. it selected
    # the worst K instead of the best one.
    best_score = float('inf')

    for k in range(2, 20+1):
        model = KMeans(k).fit(X, y)

        scores = metrics.silhouette_samples(X, model.labels_)
        negative_scores_count = sum(1 for score in scores if score < 0)
        # float() keeps this a true division under Python 2 as well.
        model_score = negative_scores_count / float(len(scores))

        # Parenthesized single-argument form works on Python 2 and 3.
        print("K=%d, score=%f" % (k, model_score))

        if model_score < best_score:
            best_score = model_score
            best_k = k

    # Unsurprisingly the best K is usually 2 because we have two classes of
    # messages: spams and hams.
    return best_k, best_score


# Ex 4

chapter_11.py 文件源码项目：python-machine-learning-book 作者: jeremyn 项目源码文件源码阅读 32 收藏 0 点赞 0 评论 0

def plot_silhouettes(X, y):
    """Draw a silhouette plot for the clustering ``y`` of the samples ``X``.

    Each distinct label in ``y`` gets one band of horizontal bars holding
    that cluster's silhouette values in ascending order, with the bands
    stacked along the y axis. A dashed red vertical line marks the mean
    silhouette value over all samples. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    labels = np.unique(y)
    label_count = labels.shape[0]
    all_scores = silhouette_samples(X, y, metric='euclidean')

    tick_positions = []
    band_start = 0
    for index, label in enumerate(labels):
        # Boolean indexing yields a copy, so the in-place sort below leaves
        # all_scores untouched.
        band_scores = all_scores[y == label]
        band_scores.sort()
        band_end = band_start + len(band_scores)

        plt.barh(
            range(band_start, band_end),
            band_scores,
            height=1.0,
            edgecolor='none',
            color=cm.jet(index / label_count),
        )

        # Put the tick at the middle of this cluster's band.
        tick_positions.append((band_start + band_end) / 2)
        band_start = band_end

    mean_score = np.mean(all_scores)
    plt.axvline(mean_score, color='red', linestyle='--')

    # Tick labels are the cluster labels shifted by one
    # (presumably to show 0-based labels as 1-based -- confirm).
    plt.yticks(tick_positions, labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')

    plt.show()