python类silhouette_score()的实例源码

cluster.py 文件源码 项目:rca-evaluation 作者: sieve-microservices 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def silhouette_score(series, clusters):
    """Score a clustering of ``series`` with the silhouette coefficient.

    A square distance matrix is filled with the first element returned by
    ``_sbd`` for every ordered pair of rows, each row is labelled with the
    position of its cluster inside ``clusters``, and the silhouette is then
    evaluated on the precomputed distances.

    Args:
        series: 2-D array-like exposing ``shape``; iterated row by row.
        clusters: iterable of ``(cluster, indices)`` pairs, where ``indices``
            are row positions in ``series``.

    Returns:
        ``(labels, score)``. ``score`` is ``-1`` when there is exactly one
        distinct label or at least as many distinct labels as rows.
    """
    n_series = series.shape[0]

    pairwise = np.zeros((n_series, n_series))
    for row, left in enumerate(series):
        for col, right in enumerate(series):
            pairwise[row, col] = _sbd(left, right)[0]

    # Rows never mentioned in `clusters` keep the initial label 0.
    labels = np.zeros(n_series)
    for cluster_no, (_, members) in enumerate(clusters):
        for member in members:
            labels[member] = cluster_no

    # The silhouette needs at least two clusters and fewer clusters than rows.
    n_labels = len(np.unique(labels))
    if n_labels != 1 and n_labels < pairwise.shape[0]:
        return labels, _silhouette_score(pairwise, labels, metric='precomputed')
    return labels, -1
spectral.py 文件源码 项目:lol-category 作者: vonum 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def spectral(data):
  """Run spectral clustering on ``data``, print a summary and plot it.

  Prints the literal header, the number of samples per label and the
  silhouette score of the labelling, then plots a 2-component PCA
  reduction coloured by label.

  Args:
      data: sample matrix handed to ``SpectralClustering.fit`` and to
          ``metrics.silhouette_score``.
  """
  model = SpectralClustering(
      eigen_solver='arpack',
      affinity='rbf',
      assign_labels='discretize'
  ).fit(data)
  labels = model.labels_

  # Single-argument print() calls behave the same on Python 2 and 3.
  print('Spectral')
  print(collections.Counter(labels))
  print(metrics.silhouette_score(data, labels))

  reduced_data = reduce_with_pca(data, 2)
  plot_2d_data(reduced_data, labels)
components.py 文件源码 项目:sptgraph 作者: epfl-lts2 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def best_shape_clustering(mols, nb_layers, k_range=range(3, 20), train_ratio=0.8, cluster_key='shape_cid'):
    """Pick the cluster count with the best silhouette and label ``mols``.

    Every entry of ``mols['dynamic']`` is converted to a shape vector with
    ``temporal_shape``. ``cluster_shapes`` is then run once per ``k`` with a
    random training subset and the full matrix, and the run with the highest
    silhouette score on the full matrix is kept.

    Args:
        mols: table-like object with a ``'dynamic'`` column supporting
            ``.apply`` (presumably a pandas DataFrame -- confirm with callers).
        nb_layers: forwarded to ``temporal_shape``.
        k_range: candidate cluster counts.
        train_ratio: ``train_size`` forwarded to ``train_test_split``.
        cluster_key: column of ``mols`` that receives the winning labels.

    Returns:
        ``(mols, centroids)``. The new column and ``centroids`` are ``None``
        only when no candidate produced a comparable score (for example an
        empty ``k_range``).
    """
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        # Older scikit-learn releases only ship the legacy module.
        from sklearn.cross_validation import train_test_split
    from sklearn.metrics import silhouette_score

    shape_df = mols['dynamic'].apply(lambda x: temporal_shape(x, nb_layers))
    # Only the training indices are used; the held-out part is not needed.
    train_idx, _ = train_test_split(shape_df.index.values, train_size=train_ratio)

    train_mat = np.array(list(shape_df[shape_df.index.isin(train_idx)].values))
    full_mat = np.array(list(shape_df.values))

    centroids = None
    labels = None
    # Silhouette scores can be negative; starting below every possible value
    # guarantees the best candidate is kept even when all scores are <= 0.
    best_score = float('-inf')
    for k in k_range:
        candidate_centroids, candidate_labels = cluster_shapes(train_mat, full_mat, k)[:2]
        score = silhouette_score(full_mat, candidate_labels)
        if score > best_score:
            centroids = candidate_centroids
            labels = candidate_labels
            best_score = score

    mols[cluster_key] = labels
    return mols, centroids
precluster.py 文件源码 项目:texta 作者: texta-tk 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def _find_optimal_clustering(self,clusterings):
        """Return the second components of the best-scoring clustering.

        Each clustering is a sequence of pairs; the second element of every
        pair is expanded with ``_get_cluster_nodes`` and the ``vector`` of each
        node is labelled with the position of its pair. Clusterings are ranked
        by cosine silhouette score.

        Args:
            clusterings: sequence of clusterings as described above.

        Returns:
            Tuple holding the second element of every pair of the winning
            clustering, or of ``clusterings[0]`` when none could be scored.
        """
        max_score = float('-inf')
        max_clustering = None

        for clustering in clusterings:
            labeled_vectors = [
                (node.vector, cluster_idx)
                for cluster_idx, cluster in enumerate(clustering)
                for node in _get_cluster_nodes(cluster[1])
            ]
            vectors, labels = [np.array(x) for x in zip(*labeled_vectors)]
            # Label 1 only exists when at least two clusters have members;
            # silhouette doesn't work with just one cluster.
            if not np.in1d([1], labels)[0]:
                continue
            score = silhouette_score(vectors, labels, metric='cosine')
            if score > max_score:
                max_score = score
                max_clustering = clustering

        chosen = max_clustering if max_clustering else clusterings[0]
        # zip() is lazy on Python 3, so materialise it before indexing.
        return list(zip(*chosen))[1]
tp3_solutions.py 文件源码 项目:TPs 作者: DataMiningP7 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def evaluate_kmeans(X, model):
    """ Compute the Silhouette score of a K-Means model already fitted on X.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
        model: the KMeans model trained on X; its ``labels_`` attribute is
           used as the cluster assignment.
    Returns:
        The value returned by ``silhouette_score`` for X and those labels.
    """
    cluster_labels = model.labels_
    return silhouette_score(X, cluster_labels)


# Ex2
silhouette.py 文件源码 项目:yellowbrick 作者: DistrictDataLabs 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def fit(self, X, y=None, **kwargs):
        """
        Fit the wrapped estimator and produce the silhouette visualization.

        The estimator is fitted on ``X`` (``y`` and ``kwargs`` are forwarded),
        the sample and cluster counts are recorded, the average and
        per-sample silhouette values are stored on the instance, and
        ``draw`` is called with the predicted labels.

        TODO: decide to use this method or the score method to draw.
        NOTE: Probably this would be better in score, but the standard score
        is a little different and I'm not sure how it's used.

        Returns ``self``.
        """
        self.estimator.fit(X, y, **kwargs)

        # Dataset properties needed by the drawing code
        self.n_samples, self.n_clusters = X.shape[0], self.estimator.n_clusters

        # Overall and per-sample silhouette values for the predicted clusters
        cluster_ids = self.estimator.predict(X)
        self.silhouette_score_ = silhouette_score(X, cluster_ids)
        self.silhouette_samples_ = silhouette_samples(X, cluster_ids)

        self.draw(cluster_ids)
        return self
helpers.py 文件源码 项目:VASC 作者: wang-research 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def clustering( points, k=2,name='kmeans'):
    '''
    Cluster ``points`` and report the silhouette score of the result.

    points: N_samples * N_features
    k: number of clusters
    name: 'kmeans' (KMeans with n_init=100) or 'spec' (SpectralClustering
          with cosine affinity)

    Returns (labels, silhouette). The silhouette is reported as 0 when the
    algorithm produced a single cluster, because the score is undefined in
    that case. Any other ``name`` falls through and returns None.
    '''
    if name == 'kmeans':
        labels = KMeans( n_clusters=k,n_init=100 ).fit(points).labels_
        if len( np.unique(labels) ) > 1:
            si = silhouette_score( points,labels )
        else:
            si = 0
            print("Silhouette:"+str(si))
        return labels,si

    if name == 'spec':
        labels = SpectralClustering( n_clusters=k,affinity='cosine' ).fit( points ).labels_
        # Same degenerate-case guard as the kmeans branch: silhouette_score
        # raises when fewer than two distinct labels are present.
        if len( np.unique(labels) ) > 1:
            si = silhouette_score( points,labels )
        else:
            si = 0
        print("Silhouette:"+str(si))
        return labels,si
Clustering.py 文件源码 项目:GRIPy 作者: giruenf 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def k_means(data, nc, req_info=None):
    """Run K-Means on column-standardised ``data``.

    Args:
        data: sample matrix; columns are standardised with their own mean and
            standard deviation before clustering.
        nc: number of clusters.
        req_info: ``'all'``, ``None`` or a collection naming the extras to
            compute: ``'silhouette'``, ``'inertia'``, ``'centers'``.

    Returns:
        ``(labels, info)`` where ``info`` maps each requested name to its
        value. ``'silhouette'`` is evaluated on the unscaled ``data`` and
        ``'centers'`` are mapped back to the original scale.
    """
    means = np.mean(data, axis=0)
    stds = np.std(data, axis=0)
    # A constant column has zero spread; dividing by it would yield NaN/inf
    # and break the clustering. Using 1 leaves such a column at zero after
    # centring and maps its centre back to the column mean.
    stds = np.where(stds == 0, 1.0, stds)

    sdata = (data - means)/stds

    km = KMeans(init='k-means++', n_clusters=nc, n_init=10)
    km.fit(sdata)

    if req_info == 'all':
        req_info = ['silhouette', 'inertia', 'centers']
    elif req_info is None:
        req_info = []

    info = {}

    if 'silhouette' in req_info:
        info['silhouette'] = metrics.silhouette_score(data, km.labels_)
    if 'inertia' in req_info:
        info['inertia'] = km.inertia_
    if 'centers' in req_info:
        info['centers'] = km.cluster_centers_*stds + means

    return km.labels_, info
clusters.py 文件源码 项目:idealoom 作者: conversence 项目源码 文件源码 阅读 31 收藏 0 点赞 0 评论 0
def internal_silhouette(self, idea_id, base_labels=None):
        """Silhouette score of the posts of one idea on precomputed distances.

        Labels come from ``self.labels_for_idea``; ``self.remove_singletons``
        is then applied to them (presumably in place -- confirm). The posts of
        the idea, plus the keys of ``base_labels`` when given, are located in
        ``self.post_ids`` with ``searchsorted`` and used to slice both the
        distance matrix and the labels.

        Args:
            idea_id: identifier forwarded to the helper methods.
            base_labels: optional mapping whose keys are added to the set of
                post ids.

        Returns:
            ``0`` when fewer than two distinct labels remain, otherwise the
            silhouette score of the sliced distance matrix.
        """
        labels = self.labels_for_idea(idea_id, True, False, base_labels)
        self.remove_singletons(labels, idea_id)
        idea_post_ids = self.get_posts_of_idea(idea_id)
        if base_labels:
            merged_ids = set(idea_post_ids)
            merged_ids.update(base_labels.keys())
            idea_post_ids = sorted(merged_ids)
        idea_post_ids = np.array(idea_post_ids)
        idea_post_nums = self.post_ids.searchsorted(idea_post_ids)
        # slicing one axis at a time
        # because a simultaneous slice is interpreted as a diagonal
        distances = self.distance_matrix
        sub_distance = distances[idea_post_nums][:, idea_post_nums]
        sub_labels = labels[idea_post_nums]
        if len(set(sub_labels)) < 2:
            return 0
        # `metric` is keyword-only in current scikit-learn releases.
        return metrics.silhouette_score(sub_distance, sub_labels, metric='precomputed')
Ex06.py 文件源码 项目:ml-deti 作者: mariolpantunes 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def bench_k_means(estimator, name, data):
    """Fit ``estimator`` on ``data`` and print one row of benchmark figures.

    The row holds the name, the fit time, the inertia, five scores comparing
    the module-level ``labels`` with the estimator's labels, and the
    Euclidean silhouette score sampled with the module-level ``sample_size``.
    """
    t0 = time()
    estimator.fit(data)
    elapsed = time() - t0

    predicted = estimator.labels_
    row = (
        name,
        elapsed,
        estimator.inertia_,
        metrics.homogeneity_score(labels, predicted),
        metrics.completeness_score(labels, predicted),
        metrics.v_measure_score(labels, predicted),
        metrics.adjusted_rand_score(labels, predicted),
        metrics.adjusted_mutual_info_score(labels, predicted),
        metrics.silhouette_score(data, predicted,
                                 metric='euclidean',
                                 sample_size=sample_size),
    )
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f' % row)
dbscan.py 文件源码 项目:lol-category 作者: vonum 项目源码 文件源码 阅读 32 收藏 0 点赞 0 评论 0
def db_scan(data, eps, min_samples, metric):
  """Run DBSCAN on ``data``, print a summary and plot the labelling.

  Prints the literal header, the silhouette score and the number of samples
  per label, then plots the PCA-reduced data coloured by label.

  Args:
      data: sample matrix to cluster.
      eps: forwarded to ``DBSCAN``.
      min_samples: forwarded to ``DBSCAN``.
      metric: forwarded to ``DBSCAN``.
  """
  dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(data)
  labels = dbscan.labels_

  # Single-argument print() calls behave the same on Python 2 and 3.
  print('DBSCAN')
  # NOTE(review): the silhouette uses its default metric, not `metric`.
  print(metrics.silhouette_score(data, labels))
  print(collections.Counter(labels))

  reduced_data = reduce_with_pca(data)
  plot_2d_data(reduced_data, labels)
cluster.py 文件源码 项目:lol-category 作者: vonum 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def mean_shift(data):
  """Run MeanShift on ``data`` and print its silhouette and label counts.

  Args:
      data: sample matrix handed to ``MeanShift.fit`` and to
          ``metrics.silhouette_score``.
  """
  model = MeanShift(cluster_all=False, n_jobs=1).fit(data)
  labels = model.labels_

  # Single-argument print() calls behave the same on Python 2 and 3.
  print('Mean Shift')
  print(metrics.silhouette_score(data, labels))
  print(collections.Counter(labels))
cluster.py 文件源码 项目:lol-category 作者: vonum 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def affinity_prop(data):
  """Run AffinityPropagation on ``data`` and print silhouette and counts.

  Args:
      data: sample matrix handed to ``AffinityPropagation.fit`` and to
          ``metrics.silhouette_score``.
  """
  af = AffinityPropagation(damping=0.5, convergence_iter=15, affinity='euclidean').fit(data)
  labels = af.labels_

  # Single-argument print() calls behave the same on Python 2 and 3.
  print('Affinity Propagation')
  print(metrics.silhouette_score(data, labels))
  print(collections.Counter(labels))

# mean_shift(np.array(data))
# affinity_prop(np.array(data))
cluster2d.py 文件源码 项目:lol-category 作者: vonum 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def cluster2d(data, n_clusters):
  """Cluster the PCA reduction of ``data`` with K-Means, report and plot.

  K-Means is fitted on the reduced data; the silhouette score is computed
  against the original ``data`` with the resulting labels.

  Args:
      data: sample matrix.
      n_clusters: number of clusters for ``KMeans``.
  """
  reduced_data = reduce_with_pca(data)

  kmeans = KMeans(n_clusters = n_clusters, random_state=0).fit(reduced_data)
  labels = kmeans.labels_

  # Single-argument print() calls behave the same on Python 2 and 3.
  print('K-Means')
  print(collections.Counter(labels))
  print(metrics.silhouette_score(data, labels))

  plot_2d_data(reduced_data, labels)
em.py 文件源码 项目:lol-category 作者: vonum 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def em(data):
  """Fit a 6-component tied-covariance Gaussian mixture, report and plot.

  Prints the number of samples per predicted component and the silhouette
  score, then plots a 2-component PCA reduction coloured by prediction.

  Args:
      data: sample matrix used for fitting, prediction and scoring.
  """
  gmm = GaussianMixture(
    n_components=6,
    covariance_type="tied"
  ).fit(data)
  predicted_data = gmm.predict(data)

  # Single-argument print() calls behave the same on Python 2 and 3.
  print(collections.Counter(predicted_data))
  print(metrics.silhouette_score(data, predicted_data))

  reduced_data = reduce_with_pca(data, 2)
  plot_2d_data(reduced_data, predicted_data)
cluster.py 文件源码 项目:NBAPlayerValue 作者: TWanish 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def kmeans(reduced_data, n_clusters):
    """Cluster ``reduced_data`` with K-Means (random_state=42).

    Returns a dict with the keys ``"labels"``, ``"centroids"`` and
    ``"silhouette_score"`` (Euclidean silhouette of the labelling).
    """
    model = cluster.KMeans(n_clusters=n_clusters, random_state=42)
    model = model.fit(reduced_data)

    assignments = model.labels_
    quality = metrics.silhouette_score(reduced_data, assignments, metric='euclidean')

    return {
        "labels": assignments,
        "centroids": model.cluster_centers_,
        "silhouette_score": quality,
    }
cluster.py 文件源码 项目:NBAPlayerValue 作者: TWanish 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def agglom(reduced_data, n_clusters):
    """Cluster ``reduced_data`` with agglomerative clustering.

    Returns a dict with the keys ``"labels"`` and ``"silhouette_score"``
    (Euclidean silhouette of the labelling).
    """
    model = cluster.AgglomerativeClustering(n_clusters=n_clusters)
    model = model.fit(reduced_data)

    assignments = model.labels_
    quality = metrics.silhouette_score(reduced_data, assignments, metric='euclidean')

    result = {"labels": assignments, "silhouette_score": quality}
    return result
cluster.py 文件源码 项目:NBAPlayerValue 作者: TWanish 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def find_best_cluster(cluster_type,data,a,b):
    """Print the silhouette scores for every cluster count in ``range(a, b)``.

    Args:
        cluster_type: ``"kmeans"`` or ``"agglom"`` (case-insensitive); selects
            the module-level helper that performs the clustering.
        data: forwarded unchanged to the selected helper.
        a: first cluster count tried (inclusive).
        b: end of the range (exclusive).

    Raises:
        ValueError: if ``cluster_type`` names neither supported algorithm.
    """
    kind = cluster_type.lower()
    # Reject unknown names up front instead of failing later with an
    # UnboundLocalError inside the loop.
    if kind not in ("kmeans", "agglom"):
        raise ValueError(
            "cluster_type must be 'kmeans' or 'agglom', got %r" % (cluster_type,))

    scores = []
    for i in range(a,b):
        i_clusters = kmeans(data, i) if kind == "kmeans" else agglom(data, i)
        scores.append(i_clusters['silhouette_score'])

    print(scores)
kmeans_cluster.py 文件源码 项目:ParseLawDocuments 作者: FanhuaandLuomu 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def clustering(docs,n_clusters):
    """Cluster ``docs`` with K-Means (random_state=1).

    Returns ``(labels, score)`` where ``score`` is the Euclidean silhouette
    of the labelling computed on ``np.array(docs)``.
    """
    model = KMeans(n_clusters=n_clusters, random_state=1)
    model.fit(docs)
    cluster_ids = model.labels_

    doc_matrix = np.array(docs)
    silhouette = metrics.silhouette_score(doc_matrix, cluster_ids, metric='euclidean')
    return cluster_ids, silhouette
classifier.py 文件源码 项目:Clustering 作者: Ram81 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def analyze_k_means(estimator, name, data):
    """Fit ``estimator`` on ``data`` and print one row of quality figures.

    The row holds the name, the fit time, the inertia, five scores comparing
    the module-level ``labels`` with the estimator's labels, and the
    Euclidean silhouette score sampled with the module-level ``samples``.
    """
    t0 = time()
    estimator.fit(data)
    elapsed = time() - t0

    found = estimator.labels_
    row = (
        name,
        elapsed,
        estimator.inertia_,
        metrics.homogeneity_score(labels, found),
        metrics.completeness_score(labels, found),
        metrics.v_measure_score(labels, found),
        metrics.adjusted_rand_score(labels, found),
        metrics.adjusted_mutual_info_score(labels, found),
        metrics.silhouette_score(data, found, metric='euclidean', sample_size=samples),
    )
    print(" %9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f" % row)
cluster.py 文件源码 项目:hyperstar 作者: nlpub 项目源码 文件源码 阅读 33 收藏 0 点赞 0 评论 0
def evaluate(k):
    """Print and return the silhouette score of the stored model for ``k``.

    The model is looked up in the module-level ``kmeans`` container and
    scored against the module-level ``train_offsets``.

    Returns:
        ``(k, score)``.
    """
    model = kmeans[k]
    silhouette = silhouette_score(
        train_offsets,
        model.labels_,
        metric='euclidean',
        random_state=RANDOM_SEED,
    )
    print('Silhouette score for k=%d is %f.' % (k, silhouette))
    return k, silhouette
tp3_solutions.py 文件源码 项目:TPs 作者: DataMiningP7 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def try_kmeans(X):
    """ Fit K-Means on X for every K from 2 to 20 and return the K whose
     silhouette score is highest.

    The score of each model is computed on ``model.transform(X)`` with the
    labels predicted for X, and every (K, score) pair is printed.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
    Returns:
        The best K found (1 if no score exceeded -1).
    """
    best_k, best_score = 1, -1

    for n_clusters in range(2, 20+1):
        model = KMeans(n_clusters=n_clusters)
        model.fit(X)
        assignments = model.predict(X)
        score = silhouette_score(model.transform(X), assignments)

        print(n_clusters, "->", score)
        if score > best_score:
            best_k, best_score = n_clusters, score

    print("The best K is", best_k)
    return best_k


# Ex3
tp2_solutions.py 文件源码 项目:TPs 作者: DataMiningP7 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def ex2_kmeans(X, y):
    """ Applies the KMeans algorithm on X, y using K=10 and prints the
    silhouette score of this model. X and y are returned by transform_text
    above.
    """
    model = KMeans(10).fit(X, y)
    # print() with a single pre-formatted string works on Python 2 and 3.
    print("Silhouette score: %f" % metrics.silhouette_score(X, model.labels_))

# Ex 3
tp2_solutions.py 文件源码 项目:TPs 作者: DataMiningP7 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def ex4_agglomerative_clustering(X, y):
    """ This does the same thing as ex2_kmeans but with an agglomerative
    clustering and K=2: the model is fitted on the dense form of X and its
    silhouette score is printed.
    """
    # AgglomerativeClustering needs a non-sparse matrix
    X = X.toarray()

    k = 2
    model = AgglomerativeClustering(k).fit(X, y)

    # print() with a single pre-formatted string works on Python 2 and 3.
    print("Silhouette score: %f" % metrics.silhouette_score(X, model.labels_))


# Ex 5
solr-similarity.py 文件源码 项目:Solr-ES-Similarity 作者: harsham05 项目源码 文件源码 阅读 31 收藏 0 点赞 0 评论 0
def sk_kmeans(core): #, kval=3
    """Score K-Means clusterings of every document in a local Solr core.

    All documents matching ``*:*`` are wrapped in ``Vector`` objects, their
    ``features`` are assembled into a DataFrame (missing values filled with
    0), and K-Means is run for k = 2..9.

    Args:
        core: name of the Solr core, appended to the localhost Solr URL.

    Returns:
        ``str`` of the dict mapping each k to its silhouette score.
    """
    solr_instance = Solr("http://localhost:8983/solr/" + core)

    points = [
        Vector(doc['id'], doc)
        for doc in solr_instance.query_iterator(query="*:*", start=0)
    ]

    frame = pd.DataFrame(point.features for point in points)
    frame = frame.fillna(0)

    silhouettes = {}
    for k in range(2, 10):
        model = KMeans(
            n_clusters=k,
            init='k-means++',
            max_iter=300,  # k-means convergence
            n_init=10,  # find global minima
            n_jobs=-2,  # parallelize
        )
        assignments = model.fit_predict(frame)
        silhouettes[k] = silhouette_score(frame, assignments)

    return str(silhouettes)
compare.py 文件源码 项目:crime_prediction 作者: livenb 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def nmf_test(df):
    """Plot silhouette scores of NMF-based labellings for 2 to 10 components.

    The ``'Year'`` and ``'zipcode'`` columns are dropped, the remaining
    values are min-max scaled, and for each component count every row is
    labelled with the index of its largest NMF coefficient.

    Args:
        df: table with ``'Year'`` and ``'zipcode'`` columns plus the feature
            columns (presumably a pandas DataFrame -- confirm with callers).
    """
    X = df.drop(['Year', 'zipcode'], axis=1).values
    scaler = MinMaxScaler()
    X_sca = scaler.fit_transform(X)

    # range() instead of xrange() so the function also runs on Python 3.
    component_counts = list(range(2, 11))
    scores = []
    for k in component_counts:
        model = NMF(n_components=k)
        W = model.fit_transform(X_sca)
        labels = W.argmax(axis=1)
        scores.append(silhouette_score(X_sca, labels))

    plt.plot(component_counts, scores, 'b*-')
    plt.show()
cluster_engine.py 文件源码 项目:artorithmia 作者: alichtner 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def silhouette(self):
        """
        Compute the silhouette score of the current cluster assignment.

        Input:  None
        Output: value returned by silhouette_score for self.features and
                self.cluster_labels
        """
        features, assignments = self.features, self.cluster_labels
        return silhouette_score(features, assignments)
hgfc.py 文件源码 项目:cluster_paraphrases 作者: acocos 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def h_cluster(wordlist, sims, distmat, thresh=0.01):
    """Run ``hgfc`` on ``sims`` and score every level it returns.

    For each element of the fifth value returned by ``hgfc``, labels are
    derived with ``labels`` and scored with a precomputed-distance
    silhouette. Levels with two or fewer clusters, or with at least
    ``len(wordlist) - 1`` clusters, get a coefficient of ``0.0``.

    Args:
        wordlist: words being clustered; its length bounds the cluster count.
        sims: input forwarded to ``hgfc``.
        distmat: precomputed distance matrix used for the silhouette.
        thresh: forwarded to ``hgfc``.

    Returns:
        ``(ld, sil_coefs)``: one ``labeldict`` result and one coefficient per
        level, in the order returned by ``hgfc``.
    """
    # Only the last of the five values returned by hgfc is used here.
    _, _, _, _, As = hgfc(sims, thresh=thresh)

    sil_coefs = []
    for a in As:
        # Compute the labelling once and reuse it for the check and the score
        # (assumes labels() returns a re-iterable sequence -- TODO confirm).
        cluster_labels = labels(a)
        n_clusters = len(set(cluster_labels))
        if 2 < n_clusters < len(wordlist) - 1:
            sil_coefs.append(silhouette_score(distmat, cluster_labels, metric='precomputed'))
        else:
            sil_coefs.append(0.0)
    ld = [labeldict(a,wordlist) for a in As]
    return ld, sil_coefs
optmazation.py 文件源码 项目:email-sherlock 作者: jgondin 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def scores(dmat,cluster_labels):
    """Best-effort silhouette score on a precomputed distance matrix.

    Args:
        dmat: precomputed distance matrix.
        cluster_labels: cluster label of every row of ``dmat``.

    Returns:
        The silhouette score computed with ``sample_size=100``, or ``None``
        when scoring fails for any ordinary reason.
    """
    try:
        return silhouette_score(dmat, cluster_labels, metric='precomputed', sample_size=100)
    except Exception:
        # Deliberately best-effort, but unlike a bare `except:` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
clustering-part.py 文件源码 项目:yelp-contest 作者: AndyFou 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def silhcoeff(data,labels):
    """Print the Euclidean silhouette coefficient of ``labels`` over ``data``.

    ``data`` is converted with ``array`` before scoring.
    """
    coefficient = metrics.silhouette_score(array(data), labels, metric='euclidean')
    print("Silhouette coefficient: ", coefficient)

###################################  PHOTOS  ###########################################

# LOAD PHOTOS FROM FOLDER & SAVE IN A LIST [FILENAME,PHOTO,GRAYSCALE_PHOTO]


问题


面经


文章

微信
公众号

扫码关注公众号