Example source code for the Python class MiniBatchKMeans()

File: test_silhouette.py · Project: yellowbrick · Author: DistrictDataLabs
def test_integrated_mini_batch_kmeans_silhouette(self):
        """
        Test no exceptions for mini-batch kmeans silhouette visualizer

        See #182: cannot use occupancy dataset because of memory usage
        """

        # Generate a blobs data set
        X, y = make_blobs(
            n_samples=1000, n_features=12, centers=8, shuffle=True,
        )

        try:
            visualizer = SilhouetteVisualizer(MiniBatchKMeans())
            visualizer.fit(X)
            visualizer.poof()
        except Exception as e:
            self.fail("error during silhouette: {}".format(e))
File: data_utils.py · Project: Shoe-Shape-Classifier · Author: jrzaurin
def perc_var_explained(X,k):
    """
    Compute the percentage of variance explained, defined as the between-cluster
    sum of squares divided by the total sum of squares.
    WARNING: It will take a while.
    The code here can be found "almost" anywhere online.

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    pve: percentage of variance explained
    """

    model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=50,
                          n_init=3, max_no_improvement=10, verbose=0)
    model.fit(X)

    centroids = model.cluster_centers_
    dist_c = cdist(X, centroids, 'euclidean')
    dist   = np.min(dist_c, axis=1)
    tot_withinss = sum(dist**2)
    totss = sum(pdist(X)**2)/X.shape[0]
    betweenss = totss - tot_withinss
    pve = (betweenss / totss) * 100

    return pve
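perc_var_explained is normally evaluated over a range of k, and the "elbow" of the resulting curve is used to pick the number of clusters. A minimal sketch using the function above; X and the candidate range are placeholders, and each call is slow because of the pairwise-distance term in totss:

ks = range(2, 15)
pves = [perc_var_explained(X, k) for k in ks]
for k, pve in zip(ks, pves):
    # look for the k after which the gain in explained variance flattens out
    print("k=%d  PVE=%.1f%%" % (k, pve))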
File: data_utils.py · Project: Shoe-Shape-Classifier · Author: jrzaurin
def bic(X, k):
    """
    Compute the BIC score.
    Implementation from:
    http://www.aladdin.cs.cmu.edu/papers/pdfs/y2000/xmeans.pdf
    with corrections from here:
    https://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    BIC: bic score
    """

    model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=50,
                          n_init=3, max_no_improvement=10, verbose=0)
    model.fit(X)

    centers = model.cluster_centers_
    centers = np.expand_dims(centers, axis=1)
    labels  = model.labels_
    N_C = np.bincount(labels)
    R, M = X.shape

    wcss = sum([sum(cdist(X[np.where(labels == c)], centers[c], 'euclidean')**2) for c in range(k)])
    var = (1.0/(R-k)/M) * wcss
    const_term = 0.5 * k * np.log(R) * (M+1)

    BIC = np.sum([ ( Rn * np.log(Rn) ) -
                   ( Rn * np.log(R) ) -
                   ( ((Rn * M) / 2) * np.log(2*np.pi*var) )  -
                   ( (Rn - 1) * M/ 2 )
                   for Rn in N_C]) - const_term

    return BIC
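Under this formulation (a log-likelihood term minus a complexity penalty) a larger score is better, so the usual pattern is to evaluate bic for several values of k and keep the maximum. A minimal sketch using the function above; X and the candidate range are placeholders:

ks = range(2, 15)
bics = [bic(X, k) for k in ks]
best_k = max(zip(bics, ks))[1]    # k with the highest BIC score
print("best k by BIC:", best_k)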
File: test.py · Project: kmc2 · Author: obachem
def test_scenarios():
    """Test that everything works"""
    for s in scenarios():
        seeding = kmc2.kmc2(**s)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # disable sklearn warnings
            model = MiniBatchKMeans(s["k"], init=seeding).fit(s["X"])
        new_centers = model.cluster_centers_
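Outside the test, the intended pattern is to run the k-MC2 seeding on the data and pass the returned centers as the init of MiniBatchKMeans. A minimal sketch, assuming the kmc2.kmc2(X, k) call shown above and synthetic data:

import numpy as np
import kmc2
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).randn(5000, 10)
k = 20

seeding = kmc2.kmc2(X, k)    # k-MC2 seeding (candidate centers)
# n_init=1 because an explicit init array makes repeated restarts pointless
model = MiniBatchKMeans(n_clusters=k, init=seeding, n_init=1).fit(X)
centers = model.cluster_centers_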
File: user_clustering_artist_taset.py · Project: aliMusic · Author: wangqingbaidu
def gen_cluster(keys = None, cluster_matrix = None):
    km = MiniBatchKMeans(n_clusters=50, batch_size=1000)
#     km = KMeans(n_jobs=-1, n_clusters=50)
    print "Clustering data..."
    labels = pd.DataFrame(km.fit_predict(cluster_matrix.values))
    res = pd.concat([keys, labels], axis = 1, ignore_index=True)
    return res
File: gen_clustering_labels.py · Project: aliMusic · Author: wangqingbaidu
def gen_cluster(keys = None, cluster_matrix = None):
    assert keys is not None and cluster_matrix is not None  # a DataFrame's truth value is ambiguous, so compare against None
    km = MiniBatchKMeans(n_clusters=50, batch_size=1000)
    labels = pd.DataFrame(km.fit_predict(cluster_matrix.values))

    res = pd.concat([keys, labels], axis = 1, ignore_index=True)
    return res
File: vlad.py · Project: QScode · Author: PierreHao
def fit(self, descs, preprocess=True):
        if preprocess:
            self.stdSlr = StandardScaler()
            self.stdSlr.fit(descs)
            tmp = self.stdSlr.transform(descs)
        else:
            tmp = descs
            self.stdSlr = None
        kmeans = MiniBatchKMeans(init='k-means++', n_clusters=self.num_clusters, batch_size=10000)
        kmeans.fit(tmp)
        self.centers = kmeans.cluster_centers_
        self.clusters = kmeans.labels_
        return self.centers
File: bow.py · Project: QScode · Author: PierreHao
def MiniBatchKMeans(self, X, batch=10000):
        # NOTE: this method shadows the sklearn class name; inside the body the
        # name MiniBatchKMeans still resolves to the imported sklearn class.
        print("in fit method", X.shape, self.k)
        kmeans = MiniBatchKMeans(init='k-means++', n_clusters=self.k, batch_size=batch)
        kmeans.fit(X)
        centers = kmeans.cluster_centers_
        clusters = kmeans.labels_
        print("shape of centers is ", centers.shape)
        return centers
File: net.py · Project: cvpr17-dvsq · Author: caoyue10
def initial_centers(self, img_output):
        C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
        print "#DVSQ train# initializing Centers"
        all_output = img_output
        # width of each subspace slice (output_dim is assumed divisible by subspace_num)
        sub_dim = self.output_dim / self.subspace_num
        for i in xrange(self.subspace_num):
            # cluster the i-th feature slice into subcenter_num centers
            kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(
                all_output[:, i * sub_dim: (i + 1) * sub_dim])
            C_init[i * self.subcenter_num: (i + 1) * self.subcenter_num,
                   i * sub_dim: (i + 1) * sub_dim] = kmeans.cluster_centers_
            print "step: ", i, " finish"
        return C_init
File: net_val.py · Project: cvpr17-dvsq · Author: caoyue10
def initial_centers(self, img_output):
        C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
        print "#ZDQ train# initializing Centers"
        all_output = img_output
        # width of each subspace slice (output_dim is assumed divisible by subspace_num)
        sub_dim = self.output_dim / self.subspace_num
        for i in xrange(self.subspace_num):
            # cluster the i-th feature slice into subcenter_num centers
            kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(
                all_output[:, i * sub_dim: (i + 1) * sub_dim])
            C_init[i * self.subcenter_num: (i + 1) * self.subcenter_num,
                   i * sub_dim: (i + 1) * sub_dim] = kmeans.cluster_centers_
            print "step: ", i, " finish"
        return C_init
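Both initial_centers variants above do the same thing: they split the output features into subspace_num contiguous slices and cluster each slice independently, producing a product-quantization-style codebook. A standalone sketch of the idea; the function name and shapes here are illustrative, not the repo's API:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

def per_subspace_centers(features, n_subspaces, n_centers):
    # features: (n_samples, dim) array; dim is assumed divisible by n_subspaces
    n_samples, dim = features.shape
    sub_dim = dim // n_subspaces
    codebook = np.zeros((n_subspaces * n_centers, dim))
    for i in range(n_subspaces):
        cols = slice(i * sub_dim, (i + 1) * sub_dim)
        km = MiniBatchKMeans(n_clusters=n_centers).fit(features[:, cols])
        codebook[i * n_centers:(i + 1) * n_centers, cols] = km.cluster_centers_
    return codebook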
File: test_k_means.py · Project: Parallel-SGD · Author: angadgill
def test_mb_k_means_plus_plus_init_dense_array():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X)
    _check_fitted_model(mb_k_means)
File: test_k_means.py · Project: Parallel-SGD · Author: angadgill
def test_mb_kmeans_verbose():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42, verbose=1)
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        mb_k_means.fit(X)
    finally:
        sys.stdout = old_stdout
File: test_k_means.py · Project: Parallel-SGD · Author: angadgill
def test_mb_k_means_plus_plus_init_sparse_matrix():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X_csr)
    _check_fitted_model(mb_k_means)
File: test_k_means.py · Project: Parallel-SGD · Author: angadgill
def test_minibatch_init_with_large_k():
    mb_k_means = MiniBatchKMeans(init='k-means++', init_size=10, n_clusters=20)
    # Check that a warning is raised, as the number of clusters is larger
    # than the init_size
    assert_warns(RuntimeWarning, mb_k_means.fit, X)
File: test_k_means.py · Project: Parallel-SGD · Author: angadgill
def test_minibatch_k_means_random_init_sparse_csr():
    # increase n_init to make random init stable enough
    mb_k_means = MiniBatchKMeans(init="random", n_clusters=n_clusters,
                                 random_state=42, n_init=10).fit(X_csr)
    _check_fitted_model(mb_k_means)
File: test_k_means.py · Project: Parallel-SGD · Author: angadgill
def test_minibatch_k_means_perfect_init_dense_array():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=1).fit(X)
    _check_fitted_model(mb_k_means)
File: test_k_means.py · Project: Parallel-SGD · Author: angadgill
def test_minibatch_k_means_init_multiple_runs_with_explicit_centers():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=10)
    assert_warns(RuntimeWarning, mb_k_means.fit, X)
File: test_k_means.py · Project: Parallel-SGD · Author: angadgill
def test_minibatch_k_means_perfect_init_sparse_csr():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=1).fit(X_csr)
    _check_fitted_model(mb_k_means)
File: test_k_means.py · Project: Parallel-SGD · Author: angadgill
def test_minibatch_with_many_reassignments():
    # Test for the case that the number of clusters to reassign is bigger
    # than the batch_size
    n_samples = 550
    rnd = np.random.RandomState(42)
    X = rnd.uniform(size=(n_samples, 10))
    # Check that the fit works if n_clusters is bigger than the batch_size.
    # Run the test with 550 clusters and 550 samples, because it turned out
    # that these values ensure that the number of clusters to reassign
    # is always bigger than the batch_size.
    n_clusters = 550
    MiniBatchKMeans(n_clusters=n_clusters,
                    batch_size=100,
                    init_size=n_samples,
                    random_state=42).fit(X)
File: test_k_means.py · Project: Parallel-SGD · Author: angadgill
def test_minibatch_default_init_size():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 batch_size=10, random_state=42,
                                 n_init=1).fit(X)
    assert_equal(mb_k_means.init_size_, 3 * mb_k_means.batch_size)
    _check_fitted_model(mb_k_means)

