def palettise(data, n_entries=256):
    """Quantise an image to a k-means palette with Floyd–Steinberg dithering.

    Parameters
    ----------
    data : sequence of rows
        Image indexed as ``data[y][x]``, each pixel being a sequence of
        colour channels. It is modified in place: every visited pixel is
        replaced by its palette index, and pixels not yet visited receive
        the diffused quantisation error.
    n_entries : int
        Number of palette colours (k-means clusters).

    Returns
    -------
    tuple
        ``(data, pallete)``: the mutated input, now holding palette indices,
        and the palette as a list of integer channel lists.
    """
    from itertools import chain

    height = len(data)
    width = len(data[0])
    # Flatten the rows in linear time. ``sum(data, [])`` re-copies the growing
    # accumulator for every row, which is quadratic in the image height.
    all_colours = list(chain.from_iterable(data))
    print("Calculating pallete...")
    kmeans = MiniBatchKMeans(n_clusters=n_entries, random_state=0).fit(all_colours)
    pallete = [list(map(int, rgb)) for rgb in kmeans.cluster_centers_]
    # Floyd–Steinberg kernel as (dx, dy, weight); hoisted out of the pixel loop.
    diffusion = ((1, 0, 7/16), (-1, 1, 3/16), (0, 1, 5/16), (1, 1, 1/16))
    print("Dithering...")
    for y in range(height):
        print("\r{:.1f}%".format((y/height)*100), end="")
        for x in range(width):
            bucket = kmeans.predict([data[y][x]])[0]
            # Difference between the (possibly error-adjusted) pixel and the
            # palette colour it was mapped to.
            error = [a-b for a, b in zip(data[y][x], pallete[bucket])]
            data[y][x] = bucket
            for dx, dy, coef in diffusion:
                xn = x + dx
                yn = y + dy
                # Only push error onto neighbours inside the image.
                if 0 <= xn < width and 0 <= yn < height:
                    data[yn][xn] = [a+b*coef for a, b in zip(data[yn][xn], error)]
    print("\r100% ")
    return data, pallete
python类MiniBatchKMeans()的实例源码
def k_means(self, n_clusters, batch_size=1000):
    """
    Cluster with scikit-learn's MiniBatchKMeans.

    Parameters
    ----------
    n_clusters : int
        number of clusters
    batch_size : int
        batch size for MiniBatchKMeans; also passed as its ``init_size``

    Returns
    -------
    The result of ``self._cluster_func`` for the configured estimator.
    """
    from sklearn.cluster import MiniBatchKMeans

    # Extra parameters handed to the cluster function with the estimator.
    pars = dict(batch_size=batch_size, is_hierarchical=False, metric=self.metric)
    estimator = MiniBatchKMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=10,
        init_size=batch_size,
        batch_size=batch_size,
    )
    return self._cluster_func(n_clusters, estimator, pars)
def kmeans_aic(model, X, **kwargs):
    '''AIC (Akaike Information Criterion) for k-means, for model selection.

    Parameters:
        :model: An elm.pipeline.Pipeline with KMeans or MiniBatchKMeans as
                final step; its ``_estimator`` must expose
                ``cluster_centers_`` and ``inertia_``
        :X: The X data that were just given to "fit" or "partial_fit";
            kept for interface compatibility, not used in the computation
        :kwargs: placeholder - ignored

    Returns:
        :AIC: ``inertia_ + 2 * m * k`` where ``(k, m)`` is the shape of
              ``cluster_centers_``

    Side effect: the estimator's ``labels_`` attribute is removed when present.
    '''
    estimator = model._estimator
    k, m = estimator.cluster_centers_.shape
    aic = estimator.inertia_ + 2 * m * k
    # Tolerate estimators without labels_ (for example when this is called
    # a second time) instead of raising AttributeError.
    if hasattr(estimator, 'labels_'):
        delattr(estimator, 'labels_')
    return aic
def test_clusterer_enforcement(self):
    """
    Only clustering estimators may be handed to the cluster visualizer
    """
    rejected = (SVC, SVR, Ridge, RidgeCV, LinearRegression, RandomForestClassifier)
    for estimator_cls in rejected:
        # Non-clustering estimators must be refused at construction time.
        with self.assertRaises(YellowbrickTypeError):
            ClusteringScoreVisualizer(estimator_cls())

    accepted = (KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, DBSCAN, Birch)
    for estimator_cls in accepted:
        try:
            ClusteringScoreVisualizer(estimator_cls())
        except YellowbrickTypeError:
            self.fail("could not pass clustering estimator to visualizer")
def avg_within_ss(X, k):
    """
    Compute the average within-cluster sum of squares.

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    avgwithinss: mean, over all observations, of the squared Euclidean
                 distance to the nearest fitted centroid
    """
    model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=50,
                            n_init=3, max_no_improvement=10, verbose=0)
    model.fit(X)
    # Distance from each observation to its closest centroid.
    nearest = cdist(X, model.cluster_centers_, 'euclidean').min(axis=1)
    # Vectorised mean instead of the builtin sum(), which iterates the array
    # element by element in Python.
    avgwithinss = np.mean(nearest ** 2)
    return avgwithinss
def fit(self, descs, MiniBatchKMeans=True, batch_size=10000, preprocess=False):
    """Train the cluster centres on ``descs`` and return them.

    ``MiniBatchKMeans`` selects ``self.MiniBatchKMeans(descs, batch_size)``
    over ``self.Kmeans(descs)``. With ``preprocess`` a StandardScaler is
    fitted on ``descs`` and stored in ``self.stdSlr`` (otherwise ``None``).
    The descriptors themselves are not transformed here.
    """
    if not MiniBatchKMeans:
        self.centers = self.Kmeans(descs)
    else:
        self.centers = self.MiniBatchKMeans(descs, batch_size)

    # The scaler is set up only after clustering, exactly as before.
    self.stdSlr = StandardScaler().fit(descs) if preprocess else None
    return self.centers
def test_k_means_explicit_init_shape():
    # An explicit init with the wrong number of features or clusters must
    # produce a sensible ValueError, for array and callable inits alike.
    rng = np.random.RandomState(0)
    X = rng.normal(size=(40, 3))
    feature_msg = "does not match the number of features of the data"
    cluster_msg = "does not match the number of clusters"
    for Estimator in (KMeans, MiniBatchKMeans):
        # (expected message, init, n_clusters), checked in this order.
        cases = [
            (feature_msg, X[:, :2], len(X)),
            (feature_msg, lambda X_, k, random_state: X_[:, :2], len(X)),
            (cluster_msg, X[:2, :], 3),
            (cluster_msg, lambda X_, k, random_state: X_[:2, :], 3),
        ]
        for expected, init, n_clusters in cases:
            km = Estimator(n_init=1, init=init, n_clusters=n_clusters)
            assert_raises_regex(ValueError, expected, km.fit, X)
def test_minibatch_sensible_reassign_fit():
    # Identical initial clusters should be reassigned. Also a regression
    # test for more desired reassignments than samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    # Second run uses batch-size > X.shape[0] (regression test).
    for batch_size in (10, 201):
        mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=batch_size,
                                     random_state=42, init="random")
        mb_k_means.fit(zeroed_X)
        # there should not be too many exact zero cluster centers
        non_zero_centers = mb_k_means.cluster_centers_.any(axis=1).sum()
        assert_greater(non_zero_centers, 10)
def test_sparse_mb_k_means_callable_init():
    def fixed_init(X, k, random_state):
        # Callable init that ignores its arguments and returns the
        # module-level centers.
        return centers

    # Giving the wrong number of centers must raise a meaningful error.
    expected = "does not match the number of clusters"
    mismatched = MiniBatchKMeans(init=fixed_init, random_state=42)
    assert_raises_regex(ValueError, expected, mismatched.fit, X_csr)

    # With a matching n_clusters the fit actually works.
    fitted = MiniBatchKMeans(n_clusters=3, init=fixed_init,
                             random_state=42).fit(X_csr)
    _check_fitted_model(fitted)
def test_mini_batch_k_means_random_init_partial_fit():
    # Online learning through partial_fit, one chunk at a time.
    estimator = MiniBatchKMeans(n_clusters=n_clusters, init="random",
                                random_state=42)
    for chunk in np.array_split(X, 10):
        estimator.partial_fit(chunk)

    # Label the complete dataset and compare with the ground truth.
    predicted = estimator.predict(X)
    assert_equal(v_measure_score(true_labels, predicted), 1.0)
def test__ClusteringWithSupervision_clusters():
    """
    Check that the number of clusters is changed properly.

    The interface overloads `n_clusters` while trying to hide it, so the
    public view must stay untouched before and after fitting.
    """
    def check_public_view(m):
        # The same three public-facing checks run before and after fit.
        assert m.n_clusters is None
        assert m.get_params()['n_clusters'] is None
        assert m.cluster_instance.n_clusters == 8

    train, classes = make_X_y()
    model = ClusteringWithSupervision(cluster_instance=MiniBatchKMeans())

    check_public_view(model)
    assert model._cluster_instance is None

    model.fit(train, classes)

    check_public_view(model)
    assert model._cluster_instance.n_clusters == 4
def bow_codebook(data, K=64):
    """Fit MiniBatchKMeans with K clusters on data and return its centres."""
    options = dict(
        n_clusters=K,
        init='k-means++',
        compute_labels=False,
        batch_size=1000,
        max_iter=150,
        max_no_improvement=30,
        verbose=False,
    )
    clusterer = MiniBatchKMeans(**options)
    clusterer.fit(data)
    return clusterer.cluster_centers_
def test_basic(self, single_chunk_blobs):
    """Fitting PartialMiniBatchKMeans must match one partial_fit of MiniBatchKMeans."""
    X, y = single_chunk_blobs
    shared = dict(n_clusters=3, random_state=0)
    a = cluster.PartialMiniBatchKMeans(**shared)
    b = cluster_.MiniBatchKMeans(**shared)
    a.fit(X)
    b.partial_fit(X)
    assert_estimator_equal(a, b, exclude=['random_state_'])
def mini_batch(fig):
    """Cluster X_iris into three groups and scatter-plot them in 3D.

    Reads the module globals ``X_iris`` and ``geo``. A 3D subplot titled
    'mini-batch' is added to ``fig`` at position ``geo + 2``.

    Returns the label assigned to each sample.
    """
    global X_iris, geo
    axes = fig.add_subplot(geo + 2, projection='3d', title='mini-batch')
    estimator = cluster.MiniBatchKMeans(init='random', n_clusters=3)
    estimator.fit(X_iris)
    res = estimator.labels_
    colours = 'bgrcmyk'
    for sample, label in zip(X_iris, res):
        # Plot the first three features; the colour cycles through 7 codes.
        axes.scatter(*sample[: 3], c=colours[label % 7], marker='o')
    axes.set_xlabel('X Label')
    axes.set_ylabel('Y Label')
    axes.set_zlabel('Z Label')
    return res
def train(X, y, true_k=50, minibatch=False, showLable=True):
    """Cluster X with k-means and write the clustered terms to a file.

    Parameters
    ----------
    X : feature matrix passed to the estimator's ``fit``/``predict``/``score``
    y : indexable of strings with a ``dtype`` attribute; ``y[j]`` is the
        term written out for sample ``j``
    true_k : int
        number of clusters
    minibatch : bool
        use MiniBatchKMeans instead of KMeans
    showLable : bool
        print the "Top terms per cluster:" heading

    Returns
    -------
    The negated ``km.score(X)``.

    Side effects: prints diagnostics and overwrites ``pro1_cluster.txt`` with
    the terms of every non-empty cluster, each cluster followed by a
    separator line.

    NOTE(review): the original indentation was lost; it is assumed that only
    the heading print is guarded by ``showLable`` and that the separator is
    written once per non-empty cluster. Confirm against the upstream file.
    """
    from collections import Counter

    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(y.dtype)
    if showLable:
        print("Top terms per cluster:")
    terms = y
    result = list(km.predict(X))
    print('Cluster distribution:')
    # One counting pass instead of result.count() per element (quadratic).
    print(dict(Counter(result)))

    # Group the terms by predicted cluster, keeping sample order.
    cluster_list = {i: [] for i in range(true_k)}
    for j, label in enumerate(result):
        cluster_list[label].append(terms[j])

    # The context manager guarantees the file is closed. The original
    # close() call sat after the return statement and never ran.
    with open('pro1_cluster.txt', 'w+') as fout:
        for i in range(true_k):
            members = cluster_list[i]
            if members:
                for term in members:
                    fout.write(term + '\n')
                fout.write('-------------------\n')
    return -km.score(X)
clustering.py 文件源码
项目:oss-github-analysis-project
作者: itu-oss-project-team
项目源码
文件源码
阅读 28
收藏 0
点赞 0
评论 0
def minibatchs_k_means_clustering(self, out_path, pd_data, number_of_clusters):
    """Cluster repositories with MiniBatchKMeans and export the result.

    ``pd_data`` is split by ``self.__fetch_data`` into headers, repos and
    features. The features are clustered into ``number_of_clusters`` groups
    and the repos are grouped by their assigned label. Everything is handed
    to ``self.__export_k_means_results`` together with an output path under
    ``out_path`` (built without a ".csv" suffix).
    """
    headers, repos, features = self.__fetch_data(pd_data)

    mb_kmeans = MiniBatchKMeans(n_clusters=number_of_clusters)
    mb_kmeans.fit(features)

    # One bucket per cluster, filled in a single pass over the labels. The
    # original rescanned every label once per cluster.
    clusters = [[] for _ in range(number_of_clusters)]
    for j, label in enumerate(mb_kmeans.labels_):
        clusters[label].append(repos[j])

    out_file_path = os.path.join(out_path, "mb_kmeans_noOfClusters" + str(number_of_clusters))
    self.__export_k_means_results(mb_kmeans, headers, clusters, out_file_path)  # avoid ".csv"
def __init__(self, n_codewords, normalization=3, inner_batch=128,
             dimension_ordering="tf"):
    """Store the settings and create the unfitted MiniBatchKMeans clusterer."""
    self.n_codewords = n_codewords
    self.normalization = normalization
    self.inner_batch = inner_batch

    # One cluster per codeword, a single initialisation, no label computation.
    clusterer_options = dict(
        n_clusters=self.n_codewords,
        n_init=1,
        compute_labels=False,
    )
    self._clusterer = cluster.MiniBatchKMeans(**clusterer_options)

    # NOTE(review): super(self.__class__, self) resolves against the runtime
    # type, so it would recurse if this class were ever subclassed. Consider
    # naming the class explicitly.
    super(self.__class__, self).__init__(dimension_ordering)
def __init__(self, n_codewords, neighbors=5, beta=1e-4, dimension_ordering="tf"):
    """Store the settings and create the unfitted MiniBatchKMeans clusterer."""
    self.n_codewords = n_codewords
    self.beta = beta
    self.neighbors = neighbors

    # One cluster per codeword, a single initialisation, no label computation.
    clusterer_options = dict(
        n_clusters=self.n_codewords,
        n_init=1,
        compute_labels=False,
    )
    self._clusterer = cluster.MiniBatchKMeans(**clusterer_options)

    # NOTE(review): super(self.__class__, self) resolves against the runtime
    # type, so it would recurse if this class were ever subclassed. Consider
    # naming the class explicitly.
    super(self.__class__, self).__init__(dimension_ordering)
def __init__(self, n_codewords, l1_norm=True, dimension_ordering="tf"):
    """Store the settings and create the unfitted MiniBatchKMeans clusterer."""
    self.n_codewords = n_codewords
    self.l1_norm = l1_norm

    # One cluster per codeword, a single initialisation, no label computation.
    clusterer_options = dict(
        n_clusters=self.n_codewords,
        n_init=1,
        compute_labels=False,
    )
    self._clusterer = cluster.MiniBatchKMeans(**clusterer_options)

    # NOTE(review): super(self.__class__, self) resolves against the runtime
    # type, so it would recurse if this class were ever subclassed. Consider
    # naming the class explicitly.
    super(self.__class__, self).__init__(dimension_ordering)
def initial_centers(self, img_output):
    """Initialise the centre matrix with one k-means run per subspace.

    The columns of ``img_output`` are split into ``self.subspace_num``
    consecutive slices. Each slice is clustered into ``self.subcenter_num``
    centres, and these are written into the matching row/column block of a
    zero matrix of shape
    ``(subspace_num * subcenter_num, output_dim)``, which is returned.
    """
    C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
    # Parenthesised single-argument prints give the same output on
    # Python 2 and Python 3.
    print("#DVSQ train# initilizing Centers")
    all_output = img_output
    for i in range(self.subspace_num):
        # Floor division keeps the slice bounds integral on Python 3 too.
        col_start = i * self.output_dim // self.subspace_num
        col_stop = (i + 1) * self.output_dim // self.subspace_num
        row_start = i * self.subcenter_num
        row_stop = (i + 1) * self.subcenter_num
        kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(
            all_output[:, col_start:col_stop])
        C_init[row_start:row_stop, col_start:col_stop] = kmeans.cluster_centers_
        print("step:  %s  finish" % i)
    return C_init
def KMEANS(data, k):
    """Cluster ``data`` into ``k`` groups, choosing the algorithm by size.

    Fewer than 20000 rows: the full ``k_means`` function is used.
    Otherwise: MiniBatchKMeans with a batch size of ``rows // k``.

    Returns
    -------
    tuple
        ``(centroids, cluster_IDs)``: the cluster centres and the label of
        every row.
    """
    n_samples = data.shape[0]
    if n_samples < 20000:
        # precompute_distances='auto' was the default and has since been
        # removed from scikit-learn, so it is no longer passed.
        centroids, cluster_IDs, _ = k_means(data, k, init='k-means++',
                                            n_init=20, max_iter=200)
    else:
        # Keyword arguments (init is keyword-only in current scikit-learn)
        # and floor division so batch_size stays an int on Python 3.
        mbkm = MiniBatchKMeans(n_clusters=k, init='k-means++', max_iter=100,
                               batch_size=n_samples // k, n_init=20)
        mbkm.fit(data)
        centroids = mbkm.cluster_centers_
        cluster_IDs = mbkm.labels_
    return centroids, cluster_IDs
def make_example_y_data(X, y=None, sample_weight=None, **kwargs):
    """Build example labels by clustering ``X.flat.values`` into 5 groups.

    The incoming ``y`` is discarded. Returns ``(X, y, sample_weight)`` with
    ``y`` replaced by the predicted cluster labels.
    """
    flat_values = X.flat.values
    clusterer = MiniBatchKMeans(n_clusters=5).fit(flat_values)
    y = clusterer.predict(flat_values)
    return (X, y, sample_weight)
def initial_centers(self, img_output, txt_output):
    """Initialise the centre matrix with one k-means run per subspace.

    ``img_output`` and ``txt_output`` are stacked row-wise. The columns of
    the stack are split into ``self.subspace_num`` consecutive slices. Each
    slice is clustered into ``self.subcenter_num`` centres, and these are
    written into the matching row/column block of a zero matrix of shape
    ``(subspace_num * subcenter_num, output_dim)``, which is returned.
    """
    C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
    # Parenthesised single-argument prints give the same output on
    # Python 2 and Python 3.
    print("#cdq train# initilizing Centers")
    all_output = np.vstack([img_output, txt_output])
    for i in range(self.subspace_num):
        # Floor division keeps the slice bounds integral on Python 3 too.
        col_start = i * self.output_dim // self.subspace_num
        col_stop = (i + 1) * self.output_dim // self.subspace_num
        row_start = i * self.subcenter_num
        row_stop = (i + 1) * self.subcenter_num
        kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(
            all_output[:, col_start:col_stop])
        C_init[row_start:row_stop, col_start:col_stop] = kmeans.cluster_centers_
        print("step:  %s  finish" % i)
    return C_init
def _kmeans_clustering(self, X, n_clusters, batch_size=128):
    """Fit MiniBatchKMeans on X and return the cluster centres."""
    options = dict(
        n_clusters=n_clusters,
        batch_size=batch_size,
        n_init=10,
        max_no_improvement=10,
    )
    clusterer = MiniBatchKMeans(**options)
    clusterer.fit(X)
    return clusterer.cluster_centers_
def generate_codebook(image, detectAndCompute=SIFT_create().detectAndCompute):
    """Build a 2048-entry codebook from the descriptors of ``image``.

    ``detectAndCompute`` is called as ``detectAndCompute(image,
    window_size=None)`` and its result is clustered with MiniBatchKMeans.
    Returns a full slice of the fitted cluster centres.
    """
    descriptors = detectAndCompute(image, window_size=None)
    options = dict(
        n_clusters=2048,
        batch_size=128,
        n_init=10,
        max_no_improvement=10,
    )
    clusterer = MiniBatchKMeans(**options)
    clusterer.fit(descriptors)
    return clusterer.cluster_centers_[:]
def cluster(self):
    """Fit MiniBatchKMeans on the stored data and return the cluster centres.

    ``self.data`` maps file names to arrays; entries that are ``None`` are
    skipped. When ``self.subsample`` is ``None`` every entry is used.
    Otherwise a random fraction ``self.subsample`` of the file names is
    drawn and only those entries are stacked.
    """
    mbk = MiniBatchKMeans(n_clusters=self.K, batch_size=self.K*2,
                          verbose=self.verbose, compute_labels=False)
    # Materialise the keys: random.sample() needs a sequence and rejects a
    # dict keys view on Python 3.
    fnames = list(self.data.keys())
    if self.subsample is None:
        selected = fnames
    else:  # sample number of files
        selected = random.sample(fnames, int(self.subsample * len(fnames)))
    stacked = np.vstack([self.data[k] for k in selected if self.data[k] is not None])
    mbk.fit(stacked)
    return mbk.cluster_centers_
def test_kmeans(self):
    """utils.kmeans must return as many labels and centres as MiniBatchKMeans."""
    n_clusters = 5
    X, y = make_blobs(n_samples=1000, centers=n_clusters, random_state=0)
    centers, labels = utils.kmeans(X, n_clusters)

    reference = MiniBatchKMeans(n_clusters=n_clusters)
    reference_labels = reference.fit_predict(X)
    assert len(labels) == len(reference_labels)
    assert len(centers) == len(reference.cluster_centers_)
def kmeans(X, n_clusters, **kwargs):
    """Cluster the vectors in X into n_clusters groups with MiniBatchKMeans.

    Extra keyword arguments are forwarded to scikit-learn's MiniBatchKMeans.

    Returns a tuple ``(centers, labels)``; the centres are cast to unsigned
    bytes.
    """
    model = MiniBatchKMeans(n_clusters=n_clusters, **kwargs)
    labels = model.fit_predict(X)
    return model.cluster_centers_.astype(np.ubyte), labels
def kmeans_classifier(prediction, ticket_predict_weights, ticket_target_list, tickets_to_weights_matrix):
    """Predict a target for a ticket by k-means clustering.

    A MiniBatchKMeans model with one cluster per entry of
    ``ticket_target_list`` is fitted on ``tickets_to_weights_matrix``. The
    cluster index predicted for the first row of ``ticket_predict_weights``
    is used to look up the target in ``ticket_target_list``.

    The target is printed and, when ``prediction`` is not ``None``, appended
    to it as a one-element list. Nothing is returned.
    """
    kmeans = MiniBatchKMeans(n_clusters=len(ticket_target_list),
                             init_size=len(tickets_to_weights_matrix) + 1)
    kmeans.fit(tickets_to_weights_matrix)
    predicted_class = kmeans.predict(ticket_predict_weights)[0]
    predicted_target = ticket_target_list[predicted_class]
    # Parenthesised single-argument print: same output on Python 2 and 3.
    # The original print statement is a SyntaxError on Python 3.
    print("kmeans prediction: {}".format(predicted_target))
    if prediction is not None:
        prediction.append([predicted_target])
def test_integrated_mini_batch_kmeans_elbow(self):
    """
    The k-elbow visualizer must run with mini-batch kmeans without raising

    See #182: cannot use occupancy dataset because of memory usage
    """
    # Synthetic blobs stand in for a real dataset.
    blob_options = dict(n_samples=1000, n_features=12, centers=6, shuffle=True)
    X, y = make_blobs(**blob_options)

    try:
        elbow = KElbowVisualizer(MiniBatchKMeans(), k=4)
        elbow.fit(X)
        elbow.poof()
    except Exception as e:
        self.fail("error during k-elbow: {}".format(e))