Python KMeans() usage examples from open-source projects

freesuc.py (project: pyhiro, author: wanweiwei07)
def clusterFacetSamplesKNN(self, reduceRatio=3, maxNPnts=5):
        """
        cluster the samples of each facet using k-means
        (scikit-learn KMeans, despite the KNN in the function name);
        the cluster centers and their corresponding normals are saved
        in self.objsamplepnts_refcls and self.objsamplenrmls_refcls

        :param reduceRatio: the ratio by which the points are reduced
        :param maxNPnts: the maximum number of points kept per facet
        :return: None

        author: weiwei
        date: 20161129, tsukuba
        """

        self.objsamplepnts_refcls = np.ndarray(shape=(self.facets.shape[0],), dtype=object)
        self.objsamplenrmls_refcls = np.ndarray(shape=(self.facets.shape[0],), dtype=object)
        for i, facet in enumerate(self.facets):
            self.objsamplepnts_refcls[i] = np.empty(shape=(0, 0))
            self.objsamplenrmls_refcls[i] = np.empty(shape=(0, 0))
            X = self.objsamplepnts_ref[i]
            nX = X.shape[0]
            if nX > reduceRatio:
                # integer division keeps n_clusters an int on Python 3
                kmeans = KMeans(n_clusters=min(maxNPnts, nX // reduceRatio), random_state=0).fit(X)
                self.objsamplepnts_refcls[i] = kmeans.cluster_centers_
                # tile one facet normal per cluster center
                self.objsamplenrmls_refcls[i] = np.tile(self.facetnormals[i], [self.objsamplepnts_refcls[i].shape[0], 1])
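The core move here, thinning a point set down to a few representative points by keeping only the k-means cluster centers, also works standalone. A minimal sketch of that idea on a random point cloud (the reduce_points name and the toy data are illustrative, not from pyhiro):

import numpy as np
from sklearn.cluster import KMeans

def reduce_points(X, reduce_ratio=3, max_n_pnts=5):
    """Return at most max_n_pnts k-means centers representing X."""
    n = X.shape[0]
    if n <= reduce_ratio:
        return X  # too few points to be worth reducing
    k = min(max_n_pnts, n // reduce_ratio)
    return KMeans(n_clusters=k, random_state=0).fit(X).cluster_centers_

pts = np.random.rand(40, 3)      # 40 sample points on one facet
print(reduce_points(pts).shape)  # -> (5, 3)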
kadist-tag-cluster.py (project: wordnet-clusters, author: darenr)
from collections import defaultdict
import codecs
import json
from sklearn import cluster

# use_wordnet, use_wordvectors and histogram() are defined elsewhere in the script
def word_cluster(data, labels, k):
    k_means = cluster.KMeans(n_clusters=k)
    k_means.fit(data)
    for i, label in enumerate(labels):
        print(label, k_means.labels_[i])

    d = defaultdict(list)
    for c, l in zip(k_means.labels_, labels):
        d['cluster' + str(c)].append(l.name())
    fname = 'results/clusters'
    if use_wordnet:
        fname += "_wn"
    if use_wordvectors:
        fname += "_wv"
    fname += '_k' + str(k) + '.json'
    with codecs.open(fname, 'wb', 'utf-8') as outfile:
        outfile.write(json.dumps(d, indent=True))
        print(' * Saved results to', fname)
        # create a histogram of cluster sizes
        histogram(d)
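A quick way to exercise word_cluster without the full pipeline is to stub out the pieces it expects: objects with a .name() method, the two module flags, and a histogram() hook. Everything below (the Tag class, the toy vectors) is illustrative:

import os
import numpy as np

class Tag:
    """Stub standing in for a WordNet synset: word_cluster only calls .name()."""
    def __init__(self, n): self._n = n
    def name(self): return self._n

use_wordnet, use_wordvectors = True, False  # module flags the function reads
histogram = lambda d: print({key: len(v) for key, v in d.items()})

os.makedirs('results', exist_ok=True)
data = np.vstack([np.random.rand(5, 8), np.random.rand(5, 8) + 5])  # two blobs
word_cluster(data, [Tag('w%d' % i) for i in range(10)], k=2)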
baselines.py (project: context_predictive_words, author: Cogitans)
def KMeansAccuracy():
    clusterer = KMeans(n_clusters=2, n_init=30)
    tdm = pickle.load(open(DATASET_PATH + "BOW.p", "rb"))
    predictions = clusterer.fit_predict(tdm)
    true_labels = pickle.load(open(OUTFILE_STANCE, "rb"))[0]
    # cluster ids are arbitrary, so score both label assignments and keep the best
    numerical_mapped_1 = [0 if i == "Israeli" else 1 for i in true_labels]
    numerical_mapped_2 = [1 if i == "Israeli" else 0 for i in true_labels]
    one = f1_score(numerical_mapped_1, predictions)
    two = f1_score(numerical_mapped_2, predictions)
    print("The F1 score of KMeans on BOW is: " + str(max(one, two)))

    clusterer = KMeans(n_clusters=2, n_init=30)
    predictions = clusterer.fit_predict(tdm)
    true_labels = pickle.load(open(OUTFILE_STANCE, "rb"))[0]
    accuracy = predict_accuracy(true_labels, predictions)
    print("The accuracy of KMeans on BOW (w/ Tf-idf) is: " + str(accuracy))
pixel_sampling.py (project: kaggle-yelp-restaurant-photo-classification, author: u1234x1234)
import cv2
import numpy as np
from sklearn import cluster

def learn_color_clusters():
    samples = np.zeros((0, 3))
    cnt = 0
    with open('train_list') as f:
        for line in f:
            line = line[:-1]
            image = cv2.imread(line)
            image = cv2.resize(image, (100, 100))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2Lab)

            points = image.reshape((-1, 3))
            # shuffle before slicing so the 50 kept pixels are a random sample
            points = points[np.random.permutation(points.shape[0])]
            samples = np.vstack([samples, points[:50]])

            print(samples.shape)
            cnt = cnt + 1
            if cnt % 10000 == 0:
                break

    # n_jobs was removed from KMeans in newer scikit-learn; drop it there
    km = cluster.KMeans(n_clusters=50, n_jobs=-1)
    km.fit(samples)
    np.save('lab_clusters.npy', km.cluster_centers_)
    return

#learn_color_clusters()
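Once lab_clusters.npy exists, the 50 centers act as a color codebook: snapping every pixel to its nearest learned Lab color quantizes an image to the learned palette. A sketch of that use (the input file name is illustrative):

import cv2
import numpy as np

centers = np.load('lab_clusters.npy')              # (50, 3) Lab color palette
img = cv2.cvtColor(cv2.imread('photo.jpg'), cv2.COLOR_BGR2Lab)
img = cv2.resize(img, (200, 200))                  # keep the demo small
pixels = img.reshape(-1, 3).astype(np.float64)
# squared distance from every pixel to every palette color
dists = ((pixels[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
codes = dists.argmin(axis=1)                       # nearest-center index per pixel
quantized = centers[codes].reshape(img.shape).astype(np.uint8)
cv2.imwrite('photo_quantized.jpg', cv2.cvtColor(quantized, cv2.COLOR_Lab2BGR))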
test_types.py (project: yellowbrick, author: DistrictDataLabs)
def test_estimator_instance(self):
        """
        Test that isestimator works for instances
        """

        models = (
            LinearRegression(),
            LogisticRegression(),
            KMeans(),
            LSHForest(),
            PCA(),
            RidgeCV(),
            LassoCV(),
            RandomForestClassifier(),
        )

        for model in models:
            self.assertTrue(isestimator(model))
test_types.py (project: yellowbrick, author: DistrictDataLabs)
def test_estimator_class(self):
        """
        Test that isestimator works for classes
        """
        models = (
            LinearRegression,
            LogisticRegression,
            KMeans,
            LSHForest,
            PCA,
            RidgeCV,
            LassoCV,
            RandomForestClassifier,
        )

        for model in models:
            self.assertTrue(inspect.isclass(model))
            self.assertTrue(isestimator(model))
graphcluster.py (project: news-shot-classification, author: gshruti95)
def get_cluster_threshold(weights):

    estimator = KMeans(n_clusters=2)
    data = np.asarray(weights)
    data = data.reshape(-1, 1)
    clusters_idx = estimator.fit_predict(data)
    # the cluster containing the maximum weight is the "high" cluster
    max_idx = data.argmax()
    max_cluster = clusters_idx[max_idx]
    # the threshold is the largest weight in the other ("low") cluster
    low_cluster = data[clusters_idx != max_cluster]
    threshold = float(low_cluster.max())
    return threshold
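Assuming the function above is in scope, a toy run shows the intent: split a handful of small graph weights from a few large ones, and return the largest "small" weight as the cut-off:

import numpy as np
from sklearn.cluster import KMeans

weights = [0.12, 0.08, 0.15, 0.11, 3.2, 2.9, 3.5]
print(get_cluster_threshold(weights))  # -> 0.15; everything above it is the "high" cluster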
clastering.py (project: recommendation_hybrid, author: nsmalimov)
def make_clast_books(dict_books_all, array_books_real):
    dict_books_clasters = {}

    for i in array_books_real:
        try:
            dict_books_clasters[i] = dict_books_all[i]
        except KeyError:
            # books without known features get a placeholder vector
            dict_books_clasters[i] = [1, 1, 1, 1]

    X_array = list(dict_books_clasters.values())

    # roughly one cluster per 50 books, but never fewer than one
    num_clusters = max(1, len(X_array) // 50)

    k_means = cluster.KMeans(n_clusters=num_clusters)
    k_means.fit(X_array)
    # cluster label assigned to each book
    clusterized_array = list(k_means.labels_)

    for index, i in enumerate(dict_books_clasters.keys()):
        dict_books_clasters[i] = clusterized_array[index]

    return dict_books_clasters, num_clusters
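A minimal invocation with made-up book ids and feature vectors (assumes the function above is in scope):

import numpy as np
from sklearn import cluster

dict_books_all = {'book%d' % i: list(np.random.rand(4)) for i in range(60)}
book_ids = ['book%d' % i for i in range(60)] + ['unknown_book']
clusters, k = make_clast_books(dict_books_all, book_ids)
print(k, clusters['unknown_book'])  # 61 books -> 1 cluster, so the label is 0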
represent.py (project: betasqaud, author: AJacobs15)
def __init__(self, league_df):

        stat_matrix = []
        for i in range(len(league_df)):
            stat = make_stat_vector(i, league_df)
            stat_matrix.append(stat)

        kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10)
        kmeans.fit(stat_matrix)

        centroid_array = kmeans.cluster_centers_

        positions = kmeans.predict(stat_matrix)

        league_df['vector'] = pd.Series(stat_matrix, index=league_df.index)
        league_df['position'] = pd.Series(positions, index=league_df.index)

        self.df = league_df
        self.centroids = kmeans.cluster_centers_

        self.map = make_position_map(centroid_array)
ProductQuant.py (project: LearnHash, author: galad-loth)
import numpy as npy
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans

def PQTrain(data, lenSubVec, numSubCenter):
    (dataSize, dataDim) = data.shape
    if 0 != dataDim % lenSubVec:
        print("Cannot partition the feature space with the given segment length")
        return
    numSubVec = dataDim // lenSubVec
    centers = npy.zeros((numSubVec * numSubCenter, lenSubVec), dtype=npy.float32)
    distOfCenters = npy.zeros((numSubCenter, numSubCenter, numSubVec), dtype=npy.float32)
    objKmeans = KMeans(n_clusters=numSubCenter, init='k-means++', n_init=3, max_iter=100, tol=0.001)
    for ii in range(numSubVec):
        print("PQ training. Processing " + str(ii) + "-th sub-vector")
        objKmeans.fit(data[:, ii * lenSubVec:(ii + 1) * lenSubVec])
        centers[ii * numSubCenter:(ii + 1) * numSubCenter, :] = objKmeans.cluster_centers_
        distOfCenters[:, :, ii] = squareform(pdist(objKmeans.cluster_centers_, metric="euclidean"))
    model = {"centers": centers, "distOfCenters": distOfCenters}
    return model
ProductQuant.py (project: LearnHash, author: galad-loth)
def PQEval(data, lenSubVec, numSubCenter, centersPQ):
    (dataSize, dataDim) = data.shape
    if 0 != dataDim % lenSubVec:
        print("Cannot partition the feature space with the given segment length")
        return
    numSubVec = dataDim // lenSubVec
    codePQ = -npy.ones((dataSize, numSubVec), dtype=npy.int32)
    objKmeans = KMeans(n_clusters=numSubCenter)
    if (centersPQ.shape[0] != numSubVec * numSubCenter
            or centersPQ.shape[1] != lenSubVec):
        print("PQ model dimension is not compatible with input data")
        return
    for ii in range(numSubVec):
        # assign the trained centers directly instead of re-fitting; predict()
        # then returns the index of the nearest center (this shortcut works on
        # the older scikit-learn versions this repo targets)
        objKmeans.cluster_centers_ = centersPQ[ii * numSubCenter:(ii + 1) * numSubCenter, :]
        codePQ[:, ii] = objKmeans.predict(data[:, ii * lenSubVec:(ii + 1) * lenSubVec])
    return codePQ
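An end-to-end run on random vectors (sizes are illustrative; as noted above, PQEval's hand-assigned cluster_centers_ only satisfies predict() on older scikit-learn releases):

import numpy as npy

train = npy.random.rand(1000, 32).astype(npy.float32)
queries = npy.random.rand(5, 32).astype(npy.float32)

model = PQTrain(train, lenSubVec=8, numSubCenter=16)  # 4 sub-vectors, 16 centers each
codes = PQEval(queries, 8, 16, model["centers"])
print(codes.shape)  # -> (5, 4): each query compresses to 4 small integers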
kmeans.py (project: crankshaft, author: CartoDB)
def spatial(self, query, no_clusters, no_init=20):
        """
            find centers based on clusters of latitude/longitude pairs
            query: SQL query that has a WGS84 geometry (the_geom)
        """
        params = {"subquery": query,
                  "geom_col": "the_geom",
                  "id_col": "cartodb_id"}

        data = self.data_provider.get_spatial_kmeans(params)

        # Unpack query response
        xs = data[0]['xs']
        ys = data[0]['ys']
        ids = data[0]['ids']

        km = KMeans(n_clusters=no_clusters, n_init=no_init)
        # list() is needed on Python 3, where zip() returns an iterator
        labels = km.fit_predict(list(zip(xs, ys)))
        return list(zip(ids, labels))
stats.py (project: cellranger, author: 10XGenomics)
def compute_readpairs_per_umi_threshold(reads, subsample_rate):
    ''' Compute a threshold above which the UMIs are unlikely to be PCR off-products.
        reads (np.array(int)) - Read pairs for each UMI
        subsample_rate (float) - Subsample reads to this fraction.
        Returns threshold (int) - The RPPU threshold in the subsampled space '''

    if len(np.unique(reads)) < 2:
        print('Skipping RPPU threshold calculation.')
        return 1

    print('RPPU subsample rate: %0.4f' % subsample_rate)

    reads = np.random.binomial(reads, subsample_rate)
    reads = reads[reads > 0]

    if len(np.unique(reads)) < 2:
        print('Subsampling gave a degenerate distribution of RPPU. Skipping RPPU threshold calculation.')
        return 1

    new_n50 = tk_stats.NX(reads, 0.5)

    print('New N50: %d' % new_n50)

    # Log-transform counts
    log_reads = np.log(reads)

    # Run K-Means. Reshape necessary because KMeans expects a 2-D matrix.
    kmeans = sk_cluster.KMeans(2).fit(log_reads.reshape((-1, 1)))

    # Take the cluster with the smallest mean
    min_cluster = np.argsort(np.ravel(kmeans.cluster_centers_))[0]

    print('RPPU component means: ' + str(list(np.exp(kmeans.cluster_centers_))))
    print('RPPU component members: ' + str(np.bincount(kmeans.labels_)))

    # Take the max element in the min-cluster
    threshold = np.max(reads[kmeans.labels_ == min_cluster])

    return threshold
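To watch the threshold land between two populations, feed the function a synthetic bimodal distribution. tk_stats is cellranger-internal, so the sketch below stubs out the one statistic used (the stub is a rough approximation, not 10XGenomics code):

import numpy as np
import sklearn.cluster as sk_cluster

class _TkStatsStub:
    # crude stand-in for cellranger's tk_stats.NX (weighted N50-style statistic)
    @staticmethod
    def NX(arr, fraction):
        s = np.sort(arr)[::-1]
        csum = np.cumsum(s)
        return s[np.searchsorted(csum, fraction * csum[-1])]

tk_stats = _TkStatsStub()

counts = np.concatenate([np.random.poisson(2, 5000) + 1,   # background UMIs
                         np.random.poisson(100, 500)])     # real UMIs
print(compute_readpairs_per_umi_threshold(counts, subsample_rate=1.0))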
meal_price_outlier_classifier.py (project: rosie, author: datasciencebr)
def fit(self, X):
        _X = X[self.__applicable_rows(X)]
        companies = _X.groupby('recipient_id').apply(self.__company_stats) \
            .reset_index()
        companies = companies[self.__applicable_company_rows(companies)]

        self.cluster_model = KMeans(n_clusters=3)
        self.cluster_model.fit(companies[self.CLUSTER_KEYS])
        companies['cluster'] = self.cluster_model.predict(companies[self.CLUSTER_KEYS])
        self.clusters = companies.groupby('cluster') \
            .apply(self.__cluster_stats) \
            .reset_index()
        self.clusters['threshold'] = \
            self.clusters['mean'] + 4 * self.clusters['std']
        return self
clustering.py (project: SnapStitch, author: avikj)
def get_clusters_from_frames(frame_dir=None):

  # TODO: allow multiple frame directories to be processed at once
  if frame_dir is None:
    # TODO: call get_inception_embeddings on the frame dir; for now just use the pickle
    filename_to_embedding = pickle.load(open('temp/temp_vid1_290717183249/filename_to_emb.pkl', 'rb'))
    embs = []
    filenames = []
    for filename, embedding in filename_to_embedding.items():
      embs.append(embedding)
      filenames.append(filename)
    filenames = [filename[filename.rindex('/')+1:] for filename in filenames]
    embs = np.array(embs)
    # leftover (eps, min_pts) grids from DBSCAN experiments; unused by the KMeans call below
    candidates = [(11, 6)]
    candidates = [(eps, min_pts) for eps in range(7, 15) for min_pts in range(2, 10)]
    labels = cluster(embs, filenames, algorithm='KMeans', n_clusters=6)
utils.py (project: cg, author: michaelhabeck)
def kmeans(X, K):
    km = KMeans(K).fit(X)
    return km.cluster_centers_
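Usage, with toy data:

import numpy as np
from sklearn.cluster import KMeans

centers = kmeans(np.random.rand(200, 2), K=3)
print(centers.shape)  # -> (3, 2)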
clustering.py (project: onionstack, author: ntddk)
def main():
    features = []

    # `list` is a module-level list of image paths
    for i in list:
        im = cv2.imread(i)
        hist, bins = np.histogram(im.ravel(), 256, [0, 256])
        features.append(hist)

    # reduce the 256-bin histograms to 10 dimensions, then L2-normalize
    lsa = TruncatedSVD(10)
    features = lsa.fit_transform(features)
    features = Normalizer(copy=False).fit_transform(features)

    km = KMeans(
        init='k-means++',
        n_clusters=n_clusters,
    )
    km.fit(features)

    for i in range(n_clusters):
        if not os.path.exists('./result/' + str(i)):
            os.makedirs('./result/' + str(i))

    cnt = 0

    # copy each image into the folder of its assigned cluster
    for i in list:
        filename = i.split('/')[-1]
        print(filename, km.labels_[cnt])
        shutil.copyfile(i, './result/' + str(km.labels_[cnt]) + '/' + filename)
        cnt += 1
discretizer.py (project: dsbox-cleaning, author: usc-isi-i2)
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

def _discretize_by_kmeans(col, num_bins, random_state):
    nan_idx = col[col.isnull()].index
    kmeans = KMeans(n_clusters=num_bins, random_state=random_state)
    kmeans = kmeans.fit(col.dropna().values.reshape(-1, 1))
    group = kmeans.labels_
    # re-insert NaNs at their original positions so the output aligns with col
    if col.isnull().sum() > 0:
        group = group.astype(float)
        for idx in nan_idx:
            group = np.insert(group, idx, np.nan)
    return pd.Series(group)
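Binning a numeric column that contains missing values, for illustration:

s = pd.Series([1.0, 1.2, np.nan, 9.8, 10.1, 5.0])
print(_discretize_by_kmeans(s, num_bins=3, random_state=0).tolist())
# e.g. [0.0, 0.0, nan, 1.0, 1.0, 2.0] (k-means bin ids are arbitrary)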

