python类MiniBatchKMeans()的实例源码-面圈网

gif.py 文件源码项目：gif-enc 作者: DavidBuchanan314 项目源码文件源码阅读 32 收藏 0 点赞 0 评论 0

def palettise(data, n_entries=256):
    """Quantise an image to a k-means palette and dither it in place.

    A palette is fitted with MiniBatchKMeans over every pixel of ``data``.
    Each pixel is then replaced by the index of the cluster predicted for it,
    and the difference between the pixel and that palette colour is pushed
    onto the neighbours that have not been visited yet (Floyd-Steinberg).

    Parameters:
        data: list of rows; every row is a list of pixels and every pixel a
            sequence of channel values (presumably RGB - confirm with the
            caller). The list is modified in place.
        n_entries: number of clusters, i.e. palette entries.

    Returns:
        ``(data, pallete)`` where ``data`` now holds palette indices and
        ``pallete`` is a list of integer colour lists, one per cluster centre.
    """
    from itertools import chain

    height = len(data)
    width = len(data[0])
    # Flatten in linear time; sum(data, []) copies the growing list once per
    # row, which is quadratic in the number of pixels.
    all_colours = list(chain.from_iterable(data))
    print("Calculating pallete...")
    kmeans = MiniBatchKMeans(n_clusters=n_entries, random_state=0).fit(all_colours)
    pallete = [list(map(int, rgb)) for rgb in kmeans.cluster_centers_]

    print("Dithering...") # Floyd–Steinberg dithering
    for y in range(height):
        print("\r{:.1f}%".format((y/height)*100), end="")
        for x in range(width):
            bucket = kmeans.predict([data[y][x]])[0]
            # per-channel quantisation error of this pixel
            error = [a-b for a, b in zip(data[y][x], pallete[bucket])]
            data[y][x] = bucket
            # spread the error over the right and lower neighbours
            for dx, dy, coef in [(1, 0, 7/16), (-1, 1, 3/16), (0, 1, 5/16), (1, 1, 1/16)]:
                xn = x + dx
                yn = y + dy
                if ( 0 <= xn < width and 0 <= yn < height ):
                    data[yn][xn] = [a+b*coef for a, b in zip(data[yn][xn], error)]

    print("\r100%     ")
    return data, pallete

cluster.py 文件源码项目：FreeDiscovery 作者: FreeDiscovery 项目源码文件源码阅读 30 收藏 0 点赞 0 评论 0

def k_means(self, n_clusters, batch_size=1000):
        """
        Cluster with mini-batch K-means and hand the estimator to
        ``self._cluster_func``.

        Parameters
        ----------
        n_clusters : int
           number of clusters
        batch_size : int
           the batch size for the MiniBatchKMeans algorithm; the same value
           is also passed as ``init_size``
        """
        from sklearn.cluster import MiniBatchKMeans

        # parameters reported alongside the estimator
        params = dict(batch_size=batch_size, is_hierarchical=False,
                      metric=self.metric)
        estimator = MiniBatchKMeans(
            n_clusters=n_clusters,
            init='k-means++',
            n_init=10,
            init_size=batch_size,
            batch_size=batch_size,
        )
        return self._cluster_func(n_clusters, estimator, params)

kmeans.py 文件源码项目：elm 作者: ContinuumIO 项目源码文件源码阅读 34 收藏 0 点赞 0 评论 0

def kmeans_aic(model, X, **kwargs):
    '''AIC (Akaike Information Criterion) for k-means for model selection

    Computed as ``inertia_ + 2 * m * k`` where ``(k, m)`` is the shape of the
    fitted ``cluster_centers_``.

    Parameters:
        :model:  An elm.pipeline.Pipeline with KMeans or MiniBatchKMeans as final step in Pipeline
        :X:      The X data that were just given to "fit", or "partial_fit".
                 Kept for interface compatibility; the score does not use it.
        :kwargs: placeholder - ignored

    Returns:
        :AIC: float

    Side effects:
        Removes ``labels_`` from ``model._estimator`` when it is present.

    '''
    estimator = model._estimator
    k, m = estimator.cluster_centers_.shape
    aic = estimator.inertia_ + 2 * m * k
    # Guarded so that scoring the same model twice, or an estimator that has
    # no labels_, does not raise AttributeError.
    if hasattr(estimator, 'labels_'):
        delattr(estimator, 'labels_')
    return aic

test_base.py 文件源码项目：yellowbrick 作者: DistrictDataLabs 项目源码文件源码阅读 33 收藏 0 点赞 0 评论 0

def test_clusterer_enforcement(self):
        """
        Only clustering estimators may be wrapped by the cluster visualizer
        """
        rejected = (
            SVC, SVR, Ridge, RidgeCV, LinearRegression, RandomForestClassifier,
        )
        # every non-clustering estimator must be refused
        for estimator_cls in rejected:
            with self.assertRaises(YellowbrickTypeError):
                ClusteringScoreVisualizer(estimator_cls())

        accepted = (
            KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, DBSCAN, Birch,
        )
        # every clustering estimator must be accepted
        for estimator_cls in accepted:
            try:
                ClusteringScoreVisualizer(estimator_cls())
            except YellowbrickTypeError:
                self.fail("could not pass clustering estimator to visualizer")

data_utils.py 文件源码项目：Shoe-Shape-Classifier 作者: jrzaurin 项目源码文件源码阅读 31 收藏 0 点赞 0 评论 0

def avg_within_ss(X, k):
    """
    Average within-cluster sum of squares for a MiniBatchKMeans fit.

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    avgwithinss: mean over observations of the squared euclidean distance
                 to the closest cluster centre
    """
    clusterer = MiniBatchKMeans(
        init='k-means++',
        n_clusters=k,
        batch_size=50,
        n_init=3,
        max_no_improvement=10,
        verbose=0,
    )
    clusterer.fit(X)

    # distance of every observation to its nearest centre
    to_all_centres = cdist(X, clusterer.cluster_centers_, 'euclidean')
    nearest = np.min(to_all_centres, axis=1)

    return sum(nearest**2)/X.shape[0]

bow.py 文件源码项目：QScode 作者: PierreHao 项目源码文件源码阅读 26 收藏 0 点赞 0 评论 0

def fit(self, descs, MiniBatchKMeans=True, batch_size=10000, preprocess=False):
        """Compute cluster centers for ``descs`` and return them.

        ``MiniBatchKMeans`` is a flag here: when truthy the centers come from
        ``self.MiniBatchKMeans(descs, batch_size)``, otherwise from
        ``self.Kmeans(descs)``. The result is stored on ``self.centers``.

        When ``preprocess`` is truthy a ``StandardScaler`` is fitted on
        ``descs`` and kept on ``self.stdSlr``; it is fitted after clustering
        and is not applied to ``descs`` here. Otherwise ``self.stdSlr`` is
        set to ``None``.
        """
        use_minibatch = MiniBatchKMeans
        self.centers = (
            self.MiniBatchKMeans(descs, batch_size)
            if use_minibatch
            else self.Kmeans(descs)
        )
        self.stdSlr = StandardScaler().fit(descs) if preprocess else None
        return self.centers

test_k_means.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 34 收藏 0 点赞 0 评论 0

def test_k_means_explicit_init_shape():
    # An explicit init (array or callable) whose shape disagrees with the
    # data or with n_clusters must fail with a clear ValueError.
    rng = np.random.RandomState(0)
    X = rng.normal(size=(40, 3))
    features_msg = "does not match the number of features of the data"
    clusters_msg = "does not match the number of clusters"
    for Estimator in (KMeans, MiniBatchKMeans):
        # array init with too few feature columns
        est = Estimator(n_init=1, init=X[:, :2], n_clusters=len(X))
        assert_raises_regex(ValueError, features_msg, est.fit, X)
        # callable init with too few feature columns
        est = Estimator(n_init=1,
                        init=lambda X_, k, random_state: X_[:, :2],
                        n_clusters=len(X))
        assert_raises_regex(ValueError, features_msg, est.fit, X)
        # array init with the wrong number of rows
        est = Estimator(n_init=1, init=X[:2, :], n_clusters=3)
        assert_raises_regex(ValueError, clusters_msg, est.fit, X)
        # callable init with the wrong number of rows
        est = Estimator(n_init=1,
                        init=lambda X_, k, random_state: X_[:2, :],
                        n_clusters=3)
        assert_raises_regex(ValueError, clusters_msg, est.fit, X)

test_k_means.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 34 收藏 0 点赞 0 评论 0

def test_minibatch_sensible_reassign_fit():
    # Identical initial clusters must get reassigned. Also a regression test
    # for the case of more desired reassignments than samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    # second pass uses a batch size larger than X.shape[0] (regression test)
    for batch_size in (10, 201):
        estimator = MiniBatchKMeans(n_clusters=20, batch_size=batch_size,
                                    random_state=42, init="random")
        estimator.fit(zeroed_X)
        # there should not be too many exact zero cluster centers
        nonzero_centers = estimator.cluster_centers_.any(axis=1).sum()
        assert_greater(nonzero_centers, 10)

test_k_means.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 31 收藏 0 点赞 0 评论 0

def test_sparse_mb_k_means_callable_init():

    def fixed_centers(X, k, random_state):
        # ignores its arguments and returns the module-level centers
        return centers

    # With the default n_clusters the callable returns the wrong number of
    # centers, which must raise a meaningful error.
    mismatched = MiniBatchKMeans(init=fixed_centers, random_state=42)
    assert_raises_regex(ValueError,
                        "does not match the number of clusters",
                        mismatched.fit, X_csr)

    # With n_clusters=3 the fit must succeed.
    fitted = MiniBatchKMeans(n_clusters=3, init=fixed_centers, random_state=42)
    fitted = fitted.fit(X_csr)
    _check_fitted_model(fitted)

test_k_means.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 29 收藏 0 点赞 0 评论 0

def test_mini_batch_k_means_random_init_partial_fit():
    # Online learning: feed X in ten chunks through partial_fit, then check
    # that labels predicted for the whole of X match the true labels exactly.
    estimator = MiniBatchKMeans(n_clusters=n_clusters, init="random",
                                random_state=42)

    for chunk in np.array_split(X, 10):
        estimator.partial_fit(chunk)

    predicted = estimator.predict(X)
    assert_equal(v_measure_score(true_labels, predicted), 1.0)

test_estimators.py 文件源码项目：nlp-playground 作者: jamesmishra 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def test__ClusteringWithSupervision_clusters():
    """
    Check that we change the number of clusters properly.

    The public ``n_clusters`` stays unset and the estimator that was passed
    in keeps its own value (8) before and after fitting; only the private
    ``_cluster_instance`` created by ``fit`` carries the new value (4).
    """
    def check_public_state(estimator):
        assert estimator.n_clusters is None
        assert estimator.get_params()['n_clusters'] is None
        assert estimator.cluster_instance.n_clusters == 8

    train, classes = make_X_y()
    model = ClusteringWithSupervision(cluster_instance=MiniBatchKMeans())

    check_public_state(model)
    assert model._cluster_instance is None

    model.fit(train, classes)

    check_public_state(model)
    assert model._cluster_instance.n_clusters == 4

bow_utils.py 文件源码项目：pybot 作者: spillai 项目源码文件源码阅读 32 收藏 0 点赞 0 评论 0

def bow_codebook(data, K=64):
    """Fit MiniBatchKMeans with ``K`` clusters on ``data`` and return the
    resulting cluster centers."""
    clusterer = MiniBatchKMeans(
        n_clusters=K,
        init='k-means++',
        compute_labels=False,
        batch_size=1000,
        max_iter=150,
        max_no_improvement=30,
        verbose=False,
    )
    clusterer = clusterer.fit(data)
    return clusterer.cluster_centers_

test_minibatch.py 文件源码项目：dask-ml 作者: dask 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def test_basic(self, single_chunk_blobs):
        """``PartialMiniBatchKMeans.fit`` must produce the same estimator as
        ``MiniBatchKMeans.partial_fit`` on the same data, ignoring
        ``random_state_``."""
        X, y = single_chunk_blobs
        settings = dict(n_clusters=3, random_state=0)
        wrapped = cluster.PartialMiniBatchKMeans(**settings)
        reference = cluster_.MiniBatchKMeans(**settings)
        wrapped.fit(X)
        reference.partial_fit(X)
        assert_estimator_equal(wrapped, reference, exclude=['random_state_'])

sklearn_basic.py 文件源码项目：base_function 作者: Rockyzsu 项目源码文件源码阅读 31 收藏 0 点赞 0 评论 0

def mini_batch(fig):
    """Cluster the global ``X_iris`` into three groups with MiniBatchKMeans
    and scatter-plot the first three features on a new 3D subplot of ``fig``,
    coloured by cluster label. Returns the label array."""
    global X_iris, geo
    palette = 'bgrcmyk'
    ax = fig.add_subplot(geo + 2, projection='3d', title='mini-batch')

    clusterer = cluster.MiniBatchKMeans(init='random', n_clusters=3)
    clusterer.fit(X_iris)
    res = clusterer.labels_

    # one scatter call per sample, colour picked from the sample's label
    for sample, label in zip(X_iris, res):
        ax.scatter(*sample[: 3], c=palette[label % 7], marker='o')

    for setter, text in ((ax.set_xlabel, 'X Label'),
                         (ax.set_ylabel, 'Y Label'),
                         (ax.set_zlabel, 'Z Label')):
        setter(text)
    return res

new.py 文件源码项目：PPRE 作者: MaoYuwei 项目源码文件源码阅读 33 收藏 0 点赞 0 评论 0

def train(X, y, true_k=50, minibatch=False, showLable=True):
    """Cluster ``X`` with (mini-batch) k-means and dump the clusters to disk.

    Parameters:
        X: samples to cluster; indexed row-wise.
        y: per-sample terms; must expose ``dtype`` and be indexable, and each
           entry must be a string because it is written out with a newline.
        true_k: number of clusters.
        minibatch: use MiniBatchKMeans instead of KMeans when truthy.
        showLable: when truthy, print the cluster size histogram and write
           the terms of every non-empty cluster to ``pro1_cluster.txt``,
           each cluster followed by a dashed separator line.

    Returns:
        The negated ``km.score(X)``.

    Side effects:
        ``pro1_cluster.txt`` is always opened with mode ``'w+'`` and is
        therefore truncated even when ``showLable`` is falsy.
    """
    from collections import Counter

    # The context manager closes the file on every path; the original
    # close() call sat after the return statement and never ran.
    with open('pro1_cluster.txt', 'w+') as fout:
        if minibatch:
            km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                                 init_size=1000, batch_size=1000, verbose=False)
        else:
            km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                        verbose=False)
        km.fit(X)
        # single parenthesised argument: same output on Python 2 and 3
        print(y.dtype)
        if showLable:
            print("Top terms per cluster:")
            result = list(km.predict(X))
            print('Cluster distribution:')
            # one pass instead of result.count() per element
            print(dict(Counter(result)))
            members = {i: [] for i in range(true_k)}
            for j, label in enumerate(result):
                members[label].append(y[j])
            for i in range(true_k):
                if members[i]:
                    for term in members[i]:
                        fout.write(term + '\n')
                    fout.write('-------------------\n')

        return -km.score(X)

clustering.py 文件源码项目：oss-github-analysis-project 作者: itu-oss-project-team 项目源码文件源码阅读 35 收藏 0 点赞 0 评论 0

def minibatchs_k_means_clustering(self, out_path, pd_data, number_of_clusters):
        """Cluster the fetched features with MiniBatchKMeans, group the repos
        by their assigned label and pass the result to the k-means exporter.

        The output path is ``out_path`` joined with
        ``"mb_kmeans_noOfClusters<number_of_clusters>"`` (no extension).
        """
        headers, repos, features = self.__fetch_data(pd_data)

        model = MiniBatchKMeans(n_clusters=number_of_clusters)
        model.fit(features)

        # clusters[i] holds every repo whose label equals i
        clusters = [
            [repos[j] for j in range(0, len(model.labels_)) if i == model.labels_[j]]
            for i in range(0, number_of_clusters)
        ]

        file_name = "mb_kmeans_noOfClusters" + str(number_of_clusters)
        out_file_path = os.path.join(out_path, file_name)
        self.__export_k_means_results(model, headers, clusters, out_file_path)  # avoid ".csv"

vlad.py 文件源码项目：feature-aggregation 作者: paschalidoud 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def __init__(self, n_codewords, normalization=3, inner_batch=128,
                 dimension_ordering="tf"):
        """Store the settings and create the MiniBatchKMeans clusterer with
        ``n_codewords`` clusters; ``dimension_ordering`` is forwarded to the
        parent initialiser."""
        self.n_codewords = n_codewords
        self.normalization = normalization
        self.inner_batch = inner_batch

        clusterer_options = dict(
            n_clusters=self.n_codewords,
            n_init=1,
            compute_labels=False,
        )
        self._clusterer = cluster.MiniBatchKMeans(**clusterer_options)

        # NOTE(review): super(self.__class__, self) recurses endlessly if this
        # class is ever subclassed; prefer naming the class explicitly.
        super(self.__class__, self).__init__(dimension_ordering)

llc.py 文件源码项目：feature-aggregation 作者: paschalidoud 项目源码文件源码阅读 30 收藏 0 点赞 0 评论 0

def __init__(self, n_codewords, neighbors=5, beta=1e-4, dimension_ordering="tf"):
        """Store the settings and create the MiniBatchKMeans clusterer with
        ``n_codewords`` clusters; ``dimension_ordering`` is forwarded to the
        parent initialiser."""
        self.n_codewords = n_codewords
        self.beta = beta
        self.neighbors = neighbors

        clusterer_options = dict(
            n_clusters=self.n_codewords,
            n_init=1,
            compute_labels=False,
        )
        self._clusterer = cluster.MiniBatchKMeans(**clusterer_options)

        # NOTE(review): super(self.__class__, self) recurses endlessly if this
        # class is ever subclassed; prefer naming the class explicitly.
        super(self.__class__, self).__init__(dimension_ordering)

bow.py 文件源码项目：feature-aggregation 作者: paschalidoud 项目源码文件源码阅读 36 收藏 0 点赞 0 评论 0

def __init__(self, n_codewords, l1_norm=True, dimension_ordering="tf"):
        """Store the settings and create the MiniBatchKMeans clusterer with
        ``n_codewords`` clusters; ``dimension_ordering`` is forwarded to the
        parent initialiser."""
        self.l1_norm = l1_norm
        self.n_codewords = n_codewords

        clusterer_options = dict(
            n_clusters=self.n_codewords,
            n_init=1,
            compute_labels=False,
        )
        self._clusterer = cluster.MiniBatchKMeans(**clusterer_options)

        # NOTE(review): super(self.__class__, self) recurses endlessly if this
        # class is ever subclassed; prefer naming the class explicitly.
        super(self.__class__, self).__init__(dimension_ordering)

dvsq.py 文件源码项目：deephash 作者: caoyue10 项目源码文件源码阅读 31 收藏 0 点赞 0 评论 0

def initial_centers(self, img_output):
        """Build the initial centre matrix by running MiniBatchKMeans once
        per subspace.

        The columns of ``img_output`` are split into ``self.subspace_num``
        consecutive slices. Each slice is clustered into
        ``self.subcenter_num`` centres, which are written into the matching
        row/column block of the result; everything else stays zero.

        Returns:
            Array of shape
            ``(self.subspace_num * self.subcenter_num, self.output_dim)``.
        """
        C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
        # single parenthesised argument: same output on Python 2 and 3
        print("#DVSQ train# initilizing Centers")
        for i in range(self.subspace_num):
            # Floor division keeps the bounds integral under true division;
            # same values as the original Python 2 integer "/".
            col_lo = i * self.output_dim // self.subspace_num
            col_hi = (i + 1) * self.output_dim // self.subspace_num
            row_lo = i * self.subcenter_num
            row_hi = (i + 1) * self.subcenter_num
            kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(img_output[:, col_lo:col_hi])
            C_init[row_lo:row_hi, col_lo:col_hi] = kmeans.cluster_centers_
            # reproduces the spacing of: print "step: ", i, " finish"
            print("step:  %s  finish" % i)
        return C_init

Gap_stats.py 文件源码项目：PySCUBA 作者: GGiecold 项目源码文件源码阅读 27 收藏 0 点赞 0 评论 0

def KMEANS(data, k):
    """Cluster ``data`` into ``k`` groups.

    Uses ``k_means`` when ``data`` has fewer than 20000 rows and
    MiniBatchKMeans (batch size ``rows // k``) otherwise.

    Returns:
        ``(centroids, cluster_IDs)``.
    """

    if data.shape[0] < 20000:
        # NOTE(review): precompute_distances is believed to have been removed
        # from later scikit-learn releases - confirm against the pinned version.
        centroids, cluster_IDs, _ = k_means(data, k, init = 'k-means++', precompute_distances = 'auto', n_init = 20, max_iter = 200)
    else:
        # Floor division: "/" gives a float batch size under true division.
        # Keyword arguments avoid relying on the positional order of init.
        mbkm = MiniBatchKMeans(n_clusters = k, init = 'k-means++', max_iter = 100, batch_size = data.shape[0] // k, n_init = 20)
        mbkm.fit(data)

        centroids = mbkm.cluster_centers_
        cluster_IDs = mbkm.labels_

    return centroids, cluster_IDs

api_example_evo.py 文件源码项目：elm 作者: ContinuumIO 项目源码文件源码阅读 31 收藏 0 点赞 0 评论 0

def make_example_y_data(X, y=None, sample_weight=None, **kwargs):
    """Derive example labels by clustering ``X.flat.values`` into five
    groups. Any ``y`` passed in is replaced by the predicted labels; ``X``
    and ``sample_weight`` are returned unchanged and ``kwargs`` is ignored."""
    clusterer = MiniBatchKMeans(n_clusters=5)
    clusterer = clusterer.fit(X.flat.values)
    labels = clusterer.predict(X.flat.values)
    return (X, labels, sample_weight)

cdq.py 文件源码项目：aaai17-cdq 作者: caoyue10 项目源码文件源码阅读 27 收藏 0 点赞 0 评论 0

def initial_centers(self, img_output, txt_output):
        """Build the initial centre matrix by running MiniBatchKMeans once
        per subspace over the stacked image and text outputs.

        ``img_output`` and ``txt_output`` are stacked row-wise and the
        columns are split into ``self.subspace_num`` consecutive slices.
        Each slice is clustered into ``self.subcenter_num`` centres, which
        are written into the matching row/column block of the result;
        everything else stays zero.

        Returns:
            Array of shape
            ``(self.subspace_num * self.subcenter_num, self.output_dim)``.
        """
        C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
        # single parenthesised argument: same output on Python 2 and 3
        print("#cdq train# initilizing Centers")
        all_output = np.vstack([img_output, txt_output])
        for i in range(self.subspace_num):
            # Floor division keeps the bounds integral under true division;
            # same values as the original Python 2 integer "/".
            col_lo = i * self.output_dim // self.subspace_num
            col_hi = (i + 1) * self.output_dim // self.subspace_num
            row_lo = i * self.subcenter_num
            row_hi = (i + 1) * self.subcenter_num
            kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(all_output[:, col_lo:col_hi])
            C_init[row_lo:row_hi, col_lo:col_hi] = kmeans.cluster_centers_
            # reproduces the spacing of: print "step: ", i, " finish"
            print("step:  %s  finish" % i)
        return C_init

sift_svm.py 文件源码项目：TFFRCNN 作者: InterVideo 项目源码文件源码阅读 32 收藏 0 点赞 0 评论 0

def _kmeans_clustering(self, X, n_clusters, batch_size=128):
        """Fit MiniBatchKMeans on ``X`` and return its cluster centers."""
        options = dict(n_clusters=n_clusters, batch_size=batch_size,
                       n_init=10, max_no_improvement=10)
        clusterer = MiniBatchKMeans(**options)
        clusterer.fit(X)
        return clusterer.cluster_centers_

sift_svm.py 文件源码项目：TFFRCNN 作者: InterVideo 项目源码文件源码阅读 31 收藏 0 点赞 0 评论 0

def generate_codebook(image, detectAndCompute=SIFT_create().detectAndCompute):
    """Cluster the descriptors of ``image`` into a 2048-entry codebook.

    ``detectAndCompute`` is called as ``detectAndCompute(image,
    window_size=None)`` and its result is fed to MiniBatchKMeans. The default
    extractor is created once, when the function is defined.
    """
    clusterer = MiniBatchKMeans(
        n_clusters=2048,
        batch_size=128,
        n_init=10,
        max_no_improvement=10,
    )
    clusterer.fit(detectAndCompute(image, window_size=None))
    # full slice of the fitted centres, as in the original
    return clusterer.cluster_centers_[:]

bow.py 文件源码项目：mmfeat 作者: douwekiela 项目源码文件源码阅读 32 收藏 0 点赞 0 评论 0

def cluster(self):
        """Fit MiniBatchKMeans on the stacked entries of ``self.data`` and
        return the cluster centers.

        When ``self.subsample`` is ``None`` every key of ``self.data`` is
        used; otherwise a random sample of
        ``int(self.subsample * number_of_keys)`` keys is drawn. Entries whose
        value is ``None`` are skipped in both cases.
        """
        mbk = MiniBatchKMeans(n_clusters=self.K, batch_size=self.K*2, verbose=self.verbose, compute_labels=False)
        if self.subsample is None:
            selected = self.data.keys()
        else: # sample number of files
            # random.sample needs a sequence; a dict keys view is not one
            # on Python 3, so materialise it first.
            fnames = list(self.data.keys())
            selected = random.sample(fnames, int(self.subsample * len(fnames)))
        stacked = np.vstack([self.data[k] for k in selected if self.data[k] is not None])
        mbk.fit(stacked)
        return mbk.cluster_centers_

test_utils.py 文件源码项目：histonets-cv 作者: sul-cidr 项目源码文件源码阅读 30 收藏 0 点赞 0 评论 0

def test_kmeans(self):
        """``utils.kmeans`` must return as many labels and centers as a
        plain MiniBatchKMeans fitted on the same blobs."""
        n_clusters = 5
        X, y = make_blobs(n_samples=1000, centers=n_clusters, random_state=0)
        centers, labels = utils.kmeans(X, n_clusters)

        reference = MiniBatchKMeans(n_clusters=n_clusters)
        reference_labels = reference.fit_predict(X)
        assert len(labels) == len(reference_labels)
        assert len(centers) == len(reference.cluster_centers_)

utils.py 文件源码项目：histonets-cv 作者: sul-cidr 项目源码文件源码阅读 31 收藏 0 点赞 0 评论 0

def kmeans(X, n_clusters, **kwargs):
    """Cluster the vectors in X into n_clusters groups with scikit-learn's
    MiniBatchKMeans; extra keyword arguments are forwarded to it.
    Returns a tuple of cluster centers (cast to np.ubyte) and the
    predicted label of every vector."""
    estimator = MiniBatchKMeans(n_clusters=n_clusters, **kwargs)
    predicted = estimator.fit_predict(X)
    return estimator.cluster_centers_.astype(np.ubyte), predicted

clustering_sklearn.py 文件源码项目：BugClustering 作者: w-garcia 项目源码文件源码阅读 31 收藏 0 点赞 0 评论 0

def kmeans_classifier(prediction, ticket_predict_weights, ticket_target_list, tickets_to_weights_matrix):
    """Fit MiniBatchKMeans on ``tickets_to_weights_matrix`` with one cluster
    per entry of ``ticket_target_list`` and report the target for
    ``ticket_predict_weights``.

    The predicted cluster index of the first row of
    ``ticket_predict_weights`` is used to index ``ticket_target_list``; the
    chosen target is printed and, when ``prediction`` is not ``None``,
    appended to it as a one-element list.
    """
    kmeans = MiniBatchKMeans(n_clusters=len(ticket_target_list), init_size=len(tickets_to_weights_matrix) + 1)
    kmeans.fit(tickets_to_weights_matrix)

    # NOTE(review): this assumes cluster index i corresponds to
    # ticket_target_list[i] - confirm, nothing here enforces that mapping.
    predicted_class = kmeans.predict(ticket_predict_weights)[0]
    # single parenthesised argument: same output on Python 2 and 3
    print("kmeans prediction: {}".format(ticket_target_list[predicted_class]))
    if prediction is not None:
        prediction.append([ticket_target_list[predicted_class]])

test_elbow.py 文件源码项目：yellowbrick 作者: DistrictDataLabs 项目源码文件源码阅读 33 收藏 0 点赞 0 评论 0

def test_integrated_mini_batch_kmeans_elbow(self):
        """
        The k-elbow visualizer must run with mini-batch kmeans without raising

        See #182: cannot use occupancy dataset because of memory usage
        """
        # synthetic blobs stand in for the occupancy data
        blob_options = dict(n_samples=1000, n_features=12, centers=6, shuffle=True)
        X, y = make_blobs(**blob_options)

        try:
            elbow = KElbowVisualizer(MiniBatchKMeans(), k=4)
            elbow.fit(X)
            elbow.poof()
        except Exception as err:
            self.fail("error during k-elbow: {}".format(err))