def test_integrated_mini_batch_kmeans_silhouette(self):
"""
Test no exceptions for mini-batch kmeans silhouette visualizer
See #182: cannot use occupancy dataset because of memory usage
"""
# Generate a blobs data set
X, y = make_blobs(
n_samples=1000, n_features=12, centers=8, shuffle=True,
)
try:
visualizer = SilhouetteVisualizer(MiniBatchKMeans())
visualizer.fit(X)
visualizer.poof()
except Exception as e:
self.fail("error during silhouette: {}".format(e))
Python MiniBatchKMeans() usage examples
def perc_var_explained(X,k):
"""
    Compute the percentage of variance explained, defined as the between-cluster
    sum of squares divided by the total sum of squares.
WARNING: It will take a while.
The code here can be found "almost" anywhere online.
Params:
--------
X: numpy array with observations and features to be clustered
k: number of clusters
Returns:
--------
pve: percentage of variance explained
"""
model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=50,
n_init=3, max_no_improvement=10, verbose=0)
model.fit(X)
centroids = model.cluster_centers_
    # distance from each observation to its nearest centroid
    dist_c = cdist(X, centroids, 'euclidean')
    dist = np.min(dist_c, axis=1)
    tot_withinss = sum(dist**2)
    # total sum of squares via the pairwise identity: sum(pdist(X)**2) / n
    totss = sum(pdist(X)**2)/X.shape[0]
betweenss = totss - tot_withinss
pve = (betweenss/totss *100)
return pve
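A hedged usage sketch for the function above, sweeping k to locate an elbow; the dataset and k range are illustrative assumptions, and the call requires the imports perc_var_explained relies on (numpy as np, scipy's cdist/pdist, MiniBatchKMeans):

from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=2000, n_features=10, centers=6)
for k in range(2, 11):
    # slow for large n: totss uses pdist, which is O(n^2) in time and memory
    print(k, round(perc_var_explained(X, k), 2))
# choose the k where extra clusters stop adding explained variance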
def bic(X, k):
"""
Compute the BIC score.
    Implementation from here:
http://www.aladdin.cs.cmu.edu/papers/pdfs/y2000/xmeans.pdf
with corrections from here:
https://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans
Params:
--------
X: numpy array with observations and features to be clustered
k: number of clusters
Returns:
--------
BIC: bic score
"""
model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=50,
n_init=3, max_no_improvement=10, verbose=0)
model.fit(X)
centers = model.cluster_centers_
centers = np.expand_dims(centers, axis=1)
labels = model.labels_
    N_C = np.bincount(labels)    # observations per cluster
    R, M = X.shape               # number of observations, number of features
    # within-cluster sum of squared distances to each cluster's own center
    wcss = sum([sum(cdist(X[np.where(labels == c)], centers[c], 'euclidean')**2)
                for c in range(k)])
    var = (1.0/(R-k)/M) * wcss   # pooled within-cluster variance estimate
const_term = 0.5 * k * np.log(R) * (M+1)
BIC = np.sum([ ( Rn * np.log(Rn) ) -
( Rn * np.log(R) ) -
( ((Rn * M) / 2) * np.log(2*np.pi*var) ) -
( (Rn - 1) * M/ 2 )
for Rn in N_C]) - const_term
return BIC
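A matching sketch for model selection with this score: under this formulation larger BIC is better, so pick the k that maximizes it. The data and candidate range are assumptions:

import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=2000, n_features=10, centers=6)
ks = list(range(2, 11))
scores = [bic(X, k) for k in ks]
best_k = ks[int(np.argmax(scores))]   # highest BIC wins
print("BIC-selected k:", best_k)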
def test_scenarios():
"""Test that everything works"""
for s in scenarios():
seeding = kmc2.kmc2(**s)
with warnings.catch_warnings():
warnings.simplefilter("ignore") # disable sklearn warnings
model = MiniBatchKMeans(s["k"], init=seeding).fit(s["X"])
new_centers = model.cluster_centers_
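For context, a standalone sketch of the pattern this test exercises, assuming the kmc2 package (AFK-MC² seeding): the seeding array is passed as explicit initial centers, and n_init=1 avoids the multiple-runs warning sklearn raises for explicit init arrays:

import numpy as np
import kmc2
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).randn(5000, 16)
seeding = kmc2.kmc2(X, 20)   # 20 initial centers chosen by AFK-MC^2
model = MiniBatchKMeans(n_clusters=20, init=seeding, n_init=1).fit(X)
centers = model.cluster_centers_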
def gen_cluster(keys = None, cluster_matrix = None):
km = MiniBatchKMeans(n_clusters=50, batch_size=1000)
# km = KMeans(n_jobs=-1, n_clusters=50)
print "Clustering data..."
labels = pd.DataFrame(km.fit_predict(cluster_matrix.values))
res = pd.concat([keys, labels], axis = 1, ignore_index=True)
return res
def gen_cluster(keys = None, cluster_matrix = None):
    # DataFrame truth values are ambiguous, so compare against None explicitly
    assert cluster_matrix is not None and keys is not None
km = MiniBatchKMeans(n_clusters=50, batch_size=1000)
labels = pd.DataFrame(km.fit_predict(cluster_matrix.values))
res = pd.concat([keys, labels], axis = 1, ignore_index=True)
return res
def fit(self, descs, preprocess=True):
if preprocess:
self.stdSlr = StandardScaler()
self.stdSlr.fit(descs)
tmp = self.stdSlr.transform(descs)
else:
tmp = descs
self.stdSlr = None
kmeans = MiniBatchKMeans(init='k-means++', n_clusters=self.num_clusters, batch_size=10000)
kmeans.fit(tmp)
self.centers = kmeans.cluster_centers_
self.clusters = kmeans.labels_
return self.centers
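A hedged follow-on sketch of how a codebook like self.centers is typically consumed: quantize new descriptors to their nearest centers and build a bag-of-visual-words histogram. The helper name is hypothetical:

import numpy as np
from scipy.spatial.distance import cdist

def bow_histogram(descs, centers):
    # assign each descriptor to its nearest cluster center
    assignments = np.argmin(cdist(descs, centers, 'euclidean'), axis=1)
    # count assignments per center to get a fixed-length representation
    hist = np.bincount(assignments, minlength=len(centers)).astype(float)
    return hist / max(hist.sum(), 1.0)   # L1-normalize; guard empty input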
def MiniBatchKMeans(self, X, batch=10000):
print("in fit method", X.shape, self.k)
kmeans = MiniBatchKMeans(init='k-means++', n_clusters=self.k, batch_size=batch)
kmeans.fit(X)
centers = kmeans.cluster_centers_
clusters = kmeans.labels_
print("shape of centers is ", centers.shape)
return centers
def initial_centers(self, img_output):
C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
print "#DVSQ train# initilizing Centers"
all_output = img_output
for i in xrange(self.subspace_num):
kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(all_output[:, i * self.output_dim / self.subspace_num: (i + 1) * self.output_dim / self.subspace_num])
C_init[i * self.subcenter_num: (i + 1) * self.subcenter_num, i * self.output_dim / self.subspace_num: (i + 1) * self.output_dim / self.subspace_num] = kmeans.cluster_centers_
print "step: ", i, " finish"
return C_init
def initial_centers(self, img_output):
C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
print "#ZDQ train# initilizing Centers"
all_output = img_output
for i in xrange(self.subspace_num):
kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(all_output[:, i * self.output_dim / self.subspace_num: (i + 1) * self.output_dim / self.subspace_num])
C_init[i * self.subcenter_num: (i + 1) * self.subcenter_num, i * self.output_dim / self.subspace_num: (i + 1) * self.output_dim / self.subspace_num] = kmeans.cluster_centers_
print "step: ", i, " finish"
return C_init
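The two methods above implement the same per-subspace codebook initialization (a product-quantization-style, block-diagonal layout). A standalone sketch under the assumption the slicing already makes, that output_dim divides evenly by subspace_num; the function name is hypothetical:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

def init_subspace_centers(output, subspace_num, subcenter_num):
    n, output_dim = output.shape
    sub_dim = output_dim // subspace_num
    C = np.zeros([subspace_num * subcenter_num, output_dim])
    for i in range(subspace_num):
        # one small codebook per feature slice
        km = MiniBatchKMeans(n_clusters=subcenter_num).fit(
            output[:, i * sub_dim:(i + 1) * sub_dim])
        # each subspace's centers occupy their own block-diagonal slice
        C[i * subcenter_num:(i + 1) * subcenter_num,
          i * sub_dim:(i + 1) * sub_dim] = km.cluster_centers_
    return C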
def test_mb_k_means_plus_plus_init_dense_array():
mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
random_state=42)
mb_k_means.fit(X)
_check_fitted_model(mb_k_means)
def test_mb_kmeans_verbose():
mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
random_state=42, verbose=1)
old_stdout = sys.stdout
sys.stdout = StringIO()
try:
mb_k_means.fit(X)
finally:
sys.stdout = old_stdout
def test_mb_k_means_plus_plus_init_sparse_matrix():
mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
random_state=42)
mb_k_means.fit(X_csr)
_check_fitted_model(mb_k_means)
def test_minibatch_init_with_large_k():
mb_k_means = MiniBatchKMeans(init='k-means++', init_size=10, n_clusters=20)
    # Check that a warning is raised, as the number of clusters is larger
    # than the init_size
assert_warns(RuntimeWarning, mb_k_means.fit, X)
def test_minibatch_k_means_random_init_sparse_csr():
# increase n_init to make random init stable enough
mb_k_means = MiniBatchKMeans(init="random", n_clusters=n_clusters,
random_state=42, n_init=10).fit(X_csr)
_check_fitted_model(mb_k_means)
def test_minibatch_k_means_perfect_init_dense_array():
mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
random_state=42, n_init=1).fit(X)
_check_fitted_model(mb_k_means)
def test_minibatch_k_means_init_multiple_runs_with_explicit_centers():
mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
random_state=42, n_init=10)
assert_warns(RuntimeWarning, mb_k_means.fit, X)
def test_minibatch_k_means_perfect_init_sparse_csr():
mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
random_state=42, n_init=1).fit(X_csr)
_check_fitted_model(mb_k_means)
def test_minibatch_with_many_reassignments():
# Test for the case that the number of clusters to reassign is bigger
# than the batch_size
n_samples = 550
rnd = np.random.RandomState(42)
X = rnd.uniform(size=(n_samples, 10))
# Check that the fit works if n_clusters is bigger than the batch_size.
    # Run the test with 550 clusters and 550 samples, because it turned
    # out that these values ensure that the number of clusters to
    # reassign is always bigger than the batch_size.
n_clusters = 550
MiniBatchKMeans(n_clusters=n_clusters,
batch_size=100,
init_size=n_samples,
random_state=42).fit(X)
def test_minibatch_default_init_size():
mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
batch_size=10, random_state=42,
n_init=1).fit(X)
assert_equal(mb_k_means.init_size_, 3 * mb_k_means.batch_size)
_check_fitted_model(mb_k_means)