def clusterFacetSamplesKNN(self, reduceRatio=3, maxNPnts=5):
"""
    cluster the samples of each facet with k-means (despite the function name,
    k-means rather than k nearest neighbors is used); the cluster centers and their
    corresponding normals are saved in self.objsamplepnts_refcls and self.objsamplenrmls_refcls
    :param reduceRatio: keep roughly one cluster center per reduceRatio sample points
    :param maxNPnts: the maximum number of cluster centers kept per facet
:return: None
author: weiwei
date: 20161129, tsukuba
"""
    self.objsamplepnts_refcls = np.ndarray(shape=(self.facets.shape[0],), dtype=object)
    self.objsamplenrmls_refcls = np.ndarray(shape=(self.facets.shape[0],), dtype=object)
for i, facet in enumerate(self.facets):
self.objsamplepnts_refcls[i] = np.empty(shape=(0,0))
self.objsamplenrmls_refcls[i] = np.empty(shape=(0,0))
X = self.objsamplepnts_ref[i]
nX = X.shape[0]
if nX > reduceRatio:
            kmeans = KMeans(n_clusters=min(maxNPnts, nX // reduceRatio), random_state=0).fit(X)
self.objsamplepnts_refcls[i] = kmeans.cluster_centers_
            self.objsamplenrmls_refcls[i] = np.tile(self.facetnormals[i], [self.objsamplepnts_refcls[i].shape[0], 1])
Example source code using the Python class KMeans()
def word_cluster(data, labels, k):
k_means = cluster.KMeans(n_clusters=k)
k_means.fit(data)
for i, label in enumerate(labels):
        print(label, k_means.labels_[i])
d = defaultdict(list)
for c, l in zip(k_means.labels_, labels):
d['cluster' + str(c)].append(l.name())
fname = 'results/clusters'
if use_wordnet:
fname += "_wn"
if use_wordvectors:
fname += "_wv"
fname += '_k' + str(k) + '.json'
with codecs.open(fname, 'wb', 'utf-8') as outfile:
outfile.write(json.dumps(d, indent=True))
    print(' * Saved results to', fname)
# create histogram of cluster sizes
histogram(d)
def KMeansAccuracy():
clusterer = KMeans(n_clusters=2, n_init=30)
tdm = pickle.load(open(DATASET_PATH + "BOW.p", "rb"))
predictions = clusterer.fit_predict(tdm)
true_labels = pickle.load(open(OUTFILE_STANCE, "rb"))[0]
numerical_mapped_1 = [0 if i == "Israeli" else 1 for i in true_labels]
numerical_mapped_2 = [1 if i == "Israeli" else 0 for i in true_labels]
one = f1_score(numerical_mapped_1, predictions)
two = f1_score(numerical_mapped_2, predictions)
print("The F1 score of KMeans on BOW is: " + str(max(one, two)))
clusterer = KMeans(n_clusters=2, n_init=30)
predictions = clusterer.fit_predict(tdm)
true_labels = pickle.load(open(OUTFILE_STANCE, "rb"))[0]
accuracy = predict_accuracy(true_labels, predictions)
print("The F1 score of KMeans on BOW (w/Tdidf) is: " + accuracy)
pixel_sampling.py (source file)
Project: kaggle-yelp-restaurant-photo-classification
Author: u1234x1234
def learn_color_clusters():
samples = np.zeros((0, 3))
cnt = 0
with open('train_list') as f:
for line in f:
line = line[:-1]
image = cv2.imread(line)
image = cv2.resize(image, (100, 100))
image = cv2.cvtColor(image, cv2.COLOR_BGR2Lab)
points = image.reshape((-1, 3))
            points = points[np.random.permutation(points.shape[0])]  # shuffle so the 50 kept pixels are random
samples = np.vstack([samples, points[:50]])
print(samples.shape)
cnt = cnt + 1
if cnt % 10000 == 0:
break
km = cluster.KMeans(n_clusters=50, n_jobs=-1)
km.fit(samples)
np.save('lab_clusters.npy', km.cluster_centers_)
return
#learn_color_clusters()
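A possible follow-up use of the saved centers (a minimal sketch; lab_clusters.npy comes from learn_color_clusters() above, while 'some_image.jpg' is only a placeholder path):

import cv2
import numpy as np

centers = np.load('lab_clusters.npy')            # (50, 3) Lab cluster centers
image = cv2.imread('some_image.jpg')
image = cv2.cvtColor(cv2.resize(image, (100, 100)), cv2.COLOR_BGR2Lab)
pixels = image.reshape((-1, 3)).astype(np.float64)
# index of the nearest center for every pixel
dists = np.linalg.norm(pixels[:, None, :] - centers[None, :, :], axis=2)
labels = dists.argmin(axis=1)
# 50-bin color histogram usable as a simple image feature
hist = np.bincount(labels, minlength=centers.shape[0])
print(hist)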
def test_estimator_instance(self):
"""
Test that isestimator works for instances
"""
models = (
LinearRegression(),
LogisticRegression(),
KMeans(),
LSHForest(),
PCA(),
RidgeCV(),
LassoCV(),
RandomForestClassifier(),
)
for model in models:
self.assertTrue(isestimator(model))
def test_estimator_class(self):
"""
Test that isestimator works for classes
"""
models = (
LinearRegression,
LogisticRegression,
KMeans,
LSHForest,
PCA,
RidgeCV,
LassoCV,
RandomForestClassifier,
)
for model in models:
self.assertTrue(inspect.isclass(model))
self.assertTrue(isestimator(model))
def get_cluster_threshold(weights):
estimator = KMeans(n_clusters = 2)
data = np.asarray(weights)
data = data.reshape(-1,1)
# print data
clusters_idx = estimator.fit_predict(data)
max_idx = data.argmax()
max_cluster = clusters_idx[max_idx]
#print max_cluster
    # the threshold is the largest weight in the cluster that does not contain the maximum
    low_cluster_label = 0 if max_cluster == 1 else 1
    low_cluster = data[clusters_idx == low_cluster_label]
    threshold = float(np.max(low_cluster))
    # print threshold
return threshold
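A minimal usage sketch (the weight values are made up for illustration; assumes numpy and sklearn's KMeans are imported as in the function above):

weights = [0.11, 0.15, 0.12, 0.90, 0.85, 0.10]
# splits the weights into a low and a high cluster and returns the largest
# value of the low cluster, i.e. 0.15 for this input
print(get_cluster_threshold(weights))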
def make_clast_books(dict_books_all, array_books_real):
dict_books_clasters = {}
for i in array_books_real:
try:
dict_books_clasters[i] = dict_books_all[i]
        except KeyError:
dict_books_clasters[i] = [1, 1, 1, 1]
    X_array = list(dict_books_clasters.values())
    num_clusters = len(X_array) // 50
k_means = cluster.KMeans(n_clusters=num_clusters)
k_means.fit(X_array)
    # cluster label assigned to each book
clusterized_array = list(k_means.labels_)
for index, i in enumerate(dict_books_clasters.keys()):
dict_books_clasters[i] = clusterized_array[index]
return dict_books_clasters, num_clusters
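A small usage sketch (synthetic book data; assumes sklearn.cluster is imported as in the function above):

import numpy as np

# 120 candidate books with 4 numeric features each; 100 of them are "real"
dict_books_all = {book_id: list(np.random.rand(4)) for book_id in range(120)}
array_books_real = list(range(100))
dict_books_clasters, num_clusters = make_clast_books(dict_books_all, array_books_real)
print(num_clusters, dict_books_clasters[0])   # 2 clusters for 100 books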
def __init__(self, league_df):
stat_matrix = []
for i in range(len(league_df)):
stat = make_stat_vector(i, league_df)
stat_matrix.append(stat)
kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10)
kmeans.fit(stat_matrix)
centroid_array = kmeans.cluster_centers_
positions = kmeans.predict(stat_matrix)
league_df['vector'] = pd.Series(stat_matrix, index = league_df.index)
league_df['position'] = pd.Series(positions, index = league_df.index)
self.df = league_df
self.centroids = kmeans.cluster_centers_
self.map = make_position_map(centroid_array)
def PQTrain(data, lenSubVec,numSubCenter):
(dataSize, dataDim)=data.shape
    if 0 != dataDim % lenSubVec:
        print("Cannot partition the feature space with the given segment number")
        return
    numSubVec = dataDim // lenSubVec
centers=npy.zeros((numSubVec*numSubCenter,lenSubVec),dtype=npy.float32)
distOfCenters=npy.zeros((numSubCenter,numSubCenter,numSubVec),dtype=npy.float32)
    objKmeans = KMeans(n_clusters=numSubCenter, init='k-means++', n_init=3, max_iter=100, tol=0.001)
for ii in range(numSubVec):
print("PQ training. Processing "+str(ii)+"-th sub-vector")
objKmeans.fit(data[:,ii*lenSubVec:(ii+1)*lenSubVec])
centers[ii*numSubCenter:(ii+1)*numSubCenter,:]= objKmeans.cluster_centers_
distOfCenters[:,:,ii]=squareform(pdist(objKmeans.cluster_centers_,metric="euclidean"))
model={"centers":centers,"distOfCenters":distOfCenters}
return model
def PQEval(data,lenSubVec,numSubCenter,centersPQ):
(dataSize, dataDim)=data.shape
    if 0 != dataDim % lenSubVec:
        print("Cannot partition the feature space with the given segment number")
        return
    numSubVec = dataDim // lenSubVec
codePQ=-npy.ones((dataSize, numSubVec),dtype=npy.int32)
objKmeans=KMeans(numSubCenter)
if (centersPQ.shape[0]!=numSubVec*numSubCenter
or centersPQ.shape[1]!=lenSubVec):
print "PQ model dimension is not compatible with input data"
return
for ii in range(numSubVec):
objKmeans.cluster_centers_=centersPQ[ii*numSubCenter:(ii+1)*numSubCenter,:]
codePQ[:,ii]=objKmeans.predict(data[:,ii*lenSubVec:(ii+1)*lenSubVec])
return codePQ
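An end-to-end sketch of the two functions above (random data and illustrative parameter values; assumes sklearn's KMeans and scipy's pdist/squareform are imported as in the functions; note that PQEval assigns cluster_centers_ directly, which may not be accepted by newer scikit-learn versions):

import numpy as npy

# 1000 vectors of dimension 32, split into 4 sub-vectors of length 8,
# each quantized against 16 sub-centers
data = npy.random.rand(1000, 32).astype(npy.float32)
model = PQTrain(data, lenSubVec=8, numSubCenter=16)
codes = PQEval(data, lenSubVec=8, numSubCenter=16, centersPQ=model["centers"])
print(codes.shape)   # (1000, 4)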
def spatial(self, query, no_clusters, no_init=20):
"""
find centers based on clusters of latitude/longitude pairs
query: SQL query that has a WGS84 geometry (the_geom)
"""
params = {"subquery": query,
"geom_col": "the_geom",
"id_col": "cartodb_id"}
data = self.data_provider.get_spatial_kmeans(params)
# Unpack query response
xs = data[0]['xs']
ys = data[0]['ys']
ids = data[0]['ids']
km = KMeans(n_clusters=no_clusters, n_init=no_init)
        labels = km.fit_predict(list(zip(xs, ys)))
        return list(zip(ids, labels))
def compute_readpairs_per_umi_threshold(reads, subsample_rate):
''' Compute a threshold above which the UMIs are unlikely to be PCR off-products.
reads (np.array(int)) - Read pairs for each UMI
subsample_rate (float) - Subsample reads to this fraction.
Returns threshold (int) - The RPPU threshold in the subsampled space '''
    if len(np.unique(reads)) < 2:
        print('Skipping RPPU threshold calculation.')
        return 1
    print('RPPU subsample rate: %0.4f' % subsample_rate)
reads = np.random.binomial(reads, subsample_rate)
reads = reads[reads > 0]
    if len(np.unique(reads)) < 2:
        print('Subsampling gave a degenerate distribution of RPPU. Skipping RPPU threshold calculation.')
        return 1
    new_n50 = tk_stats.NX(reads, 0.5)
    print('New N50: %d' % new_n50)
# Log-transform counts
log_reads = np.log(reads)
# Run K-Means. Reshape necessary because kmeans takes a matrix.
kmeans = sk_cluster.KMeans(2).fit(log_reads.reshape((-1,1)))
kmeans.predict(log_reads.reshape((-1,1)))
# Take the cluster with the smallest mean
min_cluster = np.argsort(np.ravel(kmeans.cluster_centers_))[0]
    print('RPPU component means: ' + str(list(np.exp(kmeans.cluster_centers_))))
    print('RPPU component members: ' + str(np.bincount(kmeans.labels_)))
# Take the max element in the min-cluster
threshold = np.max(reads[kmeans.labels_ == min_cluster])
return threshold
def fit(self, X):
_X = X[self.__applicable_rows(X)]
companies = _X.groupby('recipient_id').apply(self.__company_stats) \
.reset_index()
companies = companies[self.__applicable_company_rows(companies)]
self.cluster_model = KMeans(n_clusters=3)
self.cluster_model.fit(companies[self.CLUSTER_KEYS])
companies['cluster'] = self.cluster_model.predict(companies[self.CLUSTER_KEYS])
self.clusters = companies.groupby('cluster') \
.apply(self.__cluster_stats) \
.reset_index()
self.clusters['threshold'] = \
self.clusters['mean'] + 4 * self.clusters['std']
return self
def get_clusters_from_frames(frame_dir=None):
# TODO: allow multiple frame directories to be processed at once
if frame_dir is None:
        filename_to_embedding = pickle.load(open('temp/temp_vid1_290717183249/filename_to_emb.pkl', 'rb'))  # TODO: call get_inception_embeddings on frame dir, but for now just use the pickle
embs = []
filenames = []
    for filename, embedding in filename_to_embedding.items():
embs.append(embedding)
filenames.append(filename)
filenames = [filename[filename.rindex('/')+1:] for filename in filenames]
embs = np.array(embs)
candidates = [(11, 6)]
candidates = [(eps, min_pts) for eps in range(7, 15) for min_pts in range(2, 10)]
labels = cluster(embs, filenames, algorithm='KMeans', n_clusters=6)
def kmeans(X, K):
km = KMeans(K).fit(X)
return km.cluster_centers_
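A one-line usage sketch of the helper above (random 2-D data and an illustrative K; assumes sklearn's KMeans is imported as in the function):

import numpy as np

X = np.random.rand(200, 2)
centers = kmeans(X, 4)   # four 2-D cluster centers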
def main():
features = []
for i in list:
im = cv2.imread(i)
hist, bins = np.histogram(im.ravel(), 256, [0, 256])
features.append(hist)
lsa = TruncatedSVD(10)
features = lsa.fit_transform(features)
features = Normalizer(copy = False).fit_transform(features)
km = KMeans(
init='k-means++',
n_clusters=n_clusters,
)
km.fit(features)
for i in range(n_clusters):
if not os.path.exists('./result/' + str(i)):
os.makedirs('./result/' + str(i))
cnt = 0
for i in list:
filename = i.split('/')[-1]
        print(filename, km.labels_[cnt])
shutil.copyfile(i, './result/' + str(km.labels_[cnt]) + '/' + filename)
cnt += 1
def _discretize_by_kmeans(col, num_bins, random_state):
nan_idx = col[col.isnull()].index
kmeans = KMeans(n_clusters=num_bins, random_state=random_state)
kmeans = kmeans.fit(col.dropna().values.T.reshape(-1, 1))
group = kmeans.labels_
if col.isnull().sum() > 0:
group = group.astype(float)
for idx in nan_idx:
group = np.insert(group,idx,np.nan)
return pd.Series(group)
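A short usage sketch (column values and bin count are illustrative; assumes sklearn's KMeans is imported as in the function above):

import numpy as np
import pandas as pd

col = pd.Series([1.0, 1.2, 5.0, 5.1, np.nan, 9.8, 10.0])
# discretize into 3 k-means bins; the NaN entry stays NaN in the result
bins = _discretize_by_kmeans(col, num_bins=3, random_state=0)
print(bins)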