def compress_image(img, num_clusters):
    # Convert the input image into a (num_samples, num_features)
    # array to run the k-means clustering algorithm
    X = img.reshape((-1, 1))

    # Run k-means on the input data
    kmeans = cluster.KMeans(n_clusters=num_clusters, n_init=4, random_state=5)
    kmeans.fit(X)
    centroids = kmeans.cluster_centers_.squeeze()
    labels = kmeans.labels_

    # Assign each value to the nearest centroid and
    # reshape the result to the original image shape
    input_image_compressed = np.choose(labels, centroids).reshape(img.shape)
    return input_image_compressed
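A minimal usage sketch for compress_image() above, assuming `cluster` is sklearn.cluster and the input is a 2-D grayscale array; the file names and the 8-level quantization are illustrative, not from the original project.

import numpy as np
from PIL import Image
from sklearn import cluster

# Hypothetical driver: load a grayscale image and quantize it to 8 gray levels.
img = np.array(Image.open("input.jpg").convert("L"), dtype=np.uint8)  # illustrative file name
compressed = compress_image(img, num_clusters=8)
Image.fromarray(compressed.astype(np.uint8)).save("compressed.jpg")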
Python KMeans() usage examples
Source file: vector_quantization.py (project: Python-Machine-Learning-Cookbook, author: PacktPublishing)
def k_means_cluster_Predict(data_list, info):
    array_diagnal = np.array([[data_list[0][x], data_list[1][x]] for x in range(len(data_list[0]))])
    ks = list(range(1, len(info)))
    KMeans = [cluster.KMeans(n_clusters=i, init="k-means++").fit(array_diagnal) for i in ks]
    BIC = [compute_bic(kmeansi, array_diagnal) for kmeansi in KMeans]
    ks_picked = ks[BIC.index(max(BIC))]
    if ks_picked == 1:
        return [data_list]
    else:
        out = []
        std_rec = [scipy.std(data_list[0]), scipy.std(data_list[1])]
        whitened = whiten(array_diagnal)
        centroids, distortion = kmeans(whitened, ks_picked)
        idx, _ = vq(whitened, centroids)
        for x in range(ks_picked):
            group1 = [[int(i) for i in array_diagnal[idx == x, 0]], [int(i) for i in array_diagnal[idx == x, 1]]]
            out.append(group1)
        return out
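compute_bic() is not included in this snippet. A commonly used sketch of a BIC score for a fitted k-means model, under a spherical-Gaussian approximation, is shown below; the helper shipped by the original project may differ.

import numpy as np

def compute_bic(kmeans_model, X):
    # Sketch only: BIC of a fitted k-means model assuming spherical Gaussian clusters.
    centers = kmeans_model.cluster_centers_
    labels = kmeans_model.labels_
    k = centers.shape[0]
    n, d = X.shape
    sizes = np.bincount(labels, minlength=k)
    # pooled within-cluster variance estimate
    ssw = sum(np.sum((X[labels == i] - centers[i]) ** 2) for i in range(k))
    cl_var = max(ssw / max(n - k, 1) / d, 1e-12)
    log_likelihood = sum(
        sizes[i] * np.log(max(sizes[i], 1)) - sizes[i] * np.log(n)
        - 0.5 * sizes[i] * d * np.log(2 * np.pi * cl_var)
        - 0.5 * (sizes[i] - 1) * d
        for i in range(k)
    )
    # penalize the log-likelihood by the number of free parameters
    return log_likelihood - 0.5 * k * (d + 1) * np.log(n)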
def run_kmeans(transformed_pca_matrix, n_clusters, random_state=None):
    if random_state is None:
        random_state = cr_constants.RANDOM_STATE
    kmeans = sk_cluster.KMeans(n_clusters=n_clusters, random_state=random_state)
    clusters = kmeans.fit_predict(transformed_pca_matrix) + 1
    cluster_score = compute_db_index(transformed_pca_matrix, kmeans)
    clusters = cr_clustering.relabel_by_size(clusters)
    clustering_key = cr_clustering.format_clustering_key(cr_clustering.CLUSTER_TYPE_KMEANS, n_clusters)
    return cr_clustering.create_clustering(clusters=clusters,
                                           num_clusters=n_clusters,
                                           cluster_score=cluster_score,
                                           clustering_type=cr_clustering.CLUSTER_TYPE_KMEANS,
                                           global_sort_key=n_clusters,
                                           description=cr_clustering.humanify_clustering_key(clustering_key))
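compute_db_index() is not shown here; it scores the clustering with a Davies-Bouldin style index (lower is better). For a self-contained equivalent outside this codebase, scikit-learn's metric can be used; note that sklearn's function takes the labels rather than the fitted estimator, and the random matrix below is only a stand-in for a PCA-transformed matrix.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score  # scikit-learn >= 0.20

X = np.random.RandomState(0).rand(200, 10)   # stand-in for transformed_pca_matrix
km = KMeans(n_clusters=5, random_state=0).fit(X)
print(davies_bouldin_score(X, km.labels_))   # lower values indicate better separation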
def step4():
    key_vec = pickle.loads(open("key_vec.pkl", "rb").read())
    vecs = []
    for ev, vec in enumerate(key_vec.values()):
        x = np.array(vec)
        if np.isnan(x).any():
            # print(vec)
            continue
        vecs.append(x)
    vecs = np.array(vecs)
    kmeans = KMeans(n_clusters=128, init='k-means++', n_init=10, max_iter=300,
                    tol=0.0001, precompute_distances='auto', verbose=0,
                    random_state=None, copy_x=True, n_jobs=1)
    print("now fitting...")
    kmeans.fit(vecs)
    open("kmeans.model", "wb").write(pickle.dumps(kmeans))
    for p in kmeans.predict(vecs):
        print(p)
def cluster(data, true_labels, n_clusters=3):
    km = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    km.fit(data)
    km_means_labels = km.labels_
    km_means_cluster_centers = km.cluster_centers_
    km_means_labels_unique = np.unique(km_means_labels)
    colors_ = cycle(colors.cnames.keys())

    initial_dim = np.shape(data)[1]
    data_2 = tsne(data, 2, initial_dim, 30)

    plt.figure(figsize=(12, 6))
    plt.scatter(data_2[:, 0], data_2[:, 1], c=true_labels)
    plt.title('True Labels')

    return km_means_labels
def init_centers_widths(self, R):
    """Initialize prior of centers and widths

    Returns
    -------
    centers : 2D array, with shape [K, n_dim]
        Prior of factors' centers.
    widths : 1D array, with shape [K, 1]
        Prior of factors' widths.
    """
    kmeans = KMeans(
        init='k-means++',
        n_clusters=self.K,
        n_init=10,
        random_state=100)
    kmeans.fit(R)
    centers = kmeans.cluster_centers_
    widths = self._get_max_sigma(R) * np.ones((self.K, 1))
    return centers, widths
def all_cluster():
    # One-off preprocessing (commented out): parse the 'between' vectors from
    # sort_between.txt and dump them to dataframe.csv.
    # bet_dic = {}
    # fin = open('sort_between.txt', 'r')
    # while True:
    #     line = fin.readline()
    #     if line:
    #         line = line.strip()
    #         between, vec = line.split('^')
    #         vec = vec.strip('[')
    #         vec = vec.strip(']')
    #         vec = vec.split(',')
    #         bet_dic[between] = vec
    #     else:
    #         break
    # bet_dic = pd.DataFrame(bet_dic)
    # bet_dic = bet_dic.T
    # bet_dic.to_csv('dataframe.csv')
    # fin.close()
    df = pd.read_csv('dataframe.csv')
    clf = KMeans(n_clusters=50)
    s = clf.fit(df.iloc[1:, 1:])  # positional slicing; df[1:, 1:] is not valid on a DataFrame
    print(s)
Source file: clustering.py (project: oss-github-analysis-project, author: itu-oss-project-team)
def k_means_clustering(self, out_path, pd_data, number_of_clusters):
    headers, repos, features = self.__fetch_data(pd_data)
    kmeans = KMeans(n_clusters=number_of_clusters, random_state=0, n_init=200).fit(features)  # apply k-means algorithm

    # form clusters
    clusters = []
    for i in range(0, number_of_clusters):  # k clusters
        repo_list = []
        for j in range(0, len(kmeans.labels_)):  # a label for each repo
            if i == kmeans.labels_[j]:  # if the repo's label is equal to the cluster number
                repo_list.append(repos[j])  # add the repo to cluster i's list
        clusters.append(repo_list)

    out_file_path = os.path.join(out_path, "kmeans_noOfClusters" + str(number_of_clusters))
    self.__export_k_means_results(kmeans, headers, clusters, out_file_path)  # avoid ".csv"
def cluster(X, seed=0, n_clusters=20, alg='kmeans'):
    """
    Perform k-means on the given X data. For alg, use one of:
    'kmeans' (sklearn KMeans) or 'spherical' (SphericalKMeans).
    Returns (X's predicted clusters, cluster centers).
    NOTE: euclidean tends to perform very poorly
    """
    # log("Clustering k-means with {} clusters".format(n_clusters))
    if alg == 'kmeans':
        Model = KMeans
    elif alg == 'spherical':
        # in-place l2 normalization (spherical k-means assumes this)
        normalize(X, 'l2', copy=False)
        Model = SphericalKMeans
    kmeans = Model(
        n_clusters=int(n_clusters), random_state=seed
    )
    pred_clusters = kmeans.fit_predict(X)
    return pred_clusters, kmeans.cluster_centers_
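A brief usage sketch for the cluster() helper above; it assumes SphericalKMeans comes from the third-party spherecluster package (only needed for alg='spherical'), and the random matrix is purely illustrative.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
# from spherecluster import SphericalKMeans   # only required when alg='spherical'

X = np.random.RandomState(0).rand(500, 64)
labels, centers = cluster(X, seed=0, n_clusters=10, alg='kmeans')
print(labels.shape, centers.shape)   # (500,), (10, 64)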
def __init__(self, edges, branching_factor=50, threshold=0.1):
    # Make the features list.
    features = []
    for i in range(len(edges)):
        edge = edges[i]
        features.append([edge['perimeter'], edge['area'],
                         edge['shape_factor'], edge['radius_deviation']])
    features = np.array(features)

    # Normalize features
    normed_features = features.copy()
    for i in range(features.shape[1]):
        avg = np.median(features[::, i])
        std = np.std(features[::, i])
        normed_features[::, i] -= avg
        normed_features[::, i] /= avg

    self.features = features
    self.normed_features = normed_features
    self.branching_factor = branching_factor
    self.threshold = threshold

    #self.run(Birch, branching_factor=50, threshold=0.1, n_clusters=2)
    self.run(KMeans, n_clusters=2)
    #self.run(AgglomerativeClustering, n_clusters=2)
def color_differenciate(img: Image, k: int):
    imgarr = img2array(img)
    imgarr_r = imgarr.reshape((imgarr.shape[0] * imgarr.shape[1], 3))
    clt = KMeans(n_clusters=k)
    clt.fit(imgarr_r)
    numLabels = np.arange(0, len(np.unique(clt.labels_)) + 1)
    images = []
    for i in range(len(numLabels)):
        images.append(np.ones(imgarr_r.shape, dtype=np.int32) * 255)
    for idx in range(len(clt.labels_)):
        label = clt.labels_[idx]
        images[label][idx][0] = imgarr_r[idx][0]
        images[label][idx][1] = imgarr_r[idx][1]
        images[label][idx][2] = imgarr_r[idx][2]
    new_images = []
    for i in range(len(numLabels)):
        new_img = array2img(images[i].reshape(imgarr.shape))
        new_img.save('test_' + str(i) + '.jpg')
        new_images.append(new_img)
    return new_images
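img2array() and array2img() are not defined in this snippet; a plausible pair of PIL-based helpers is sketched below (the names are taken from the call sites above, the implementations are assumed).

import numpy as np
from PIL import Image

def img2array(img: Image.Image) -> np.ndarray:
    # Convert a PIL image to an (H, W, 3) uint8 array.
    return np.asarray(img.convert('RGB'), dtype=np.uint8)

def array2img(arr: np.ndarray) -> Image.Image:
    # Convert an (H, W, 3) array back to a PIL image.
    return Image.fromarray(arr.astype(np.uint8))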
def get_plot(x, y, k, iris=iris):
    k_means = KMeans(n_clusters=k)
    k_means.fit(iris.data)
    colormap = rainbow(np.linspace(0, 1, k))

    fig = plt.figure()
    splt = fig.add_subplot(1, 1, 1)
    splt.scatter(iris.data[:, x], iris.data[:, y], c=colormap[k_means.labels_], s=40)
    splt.scatter(k_means.cluster_centers_[:, x], k_means.cluster_centers_[:, y], c='black', marker='x')
    splt.set_xlabel(iris.feature_names[x])
    splt.set_ylabel(iris.feature_names[y])

    figfile = BytesIO()
    plt.savefig(figfile, format='png')
    figfile.seek(0)
    figdata_png = base64.b64encode(figfile.getvalue()).decode()
    return figdata_png
def update():
    # Get the current slider values
    N = clusters.value
    x_var = axis_map[x_axis.value]
    y_var = axis_map[y_axis.value]

    k_means = KMeans(n_clusters=N)
    k_means.fit(iris.data)
    centroids = k_means.cluster_centers_

    palette = sns.palettes.color_palette('hls', N)
    colormap = np.array(palette.as_hex())[k_means.labels_]  # as_hex() is necessary for Bokeh to render the colors properly

    plot.xaxis.axis_label = x_axis.value
    plot.yaxis.axis_label = y_axis.value
    source.data = dict(
        x=iris.data[:, x_var],
        y=iris.data[:, y_var],
        colors=colormap)
    centers.data = dict(
        cx=centroids[:, x_var],
        cy=centroids[:, y_var])
def KmeansWrapper(true_k, data, load=False):
    from sklearn.externals import joblib

    modelName = 'doc_cluster.%s.plk' % true_k
    if load:
        km = joblib.load(modelName)
        labels = km.labels_
    else:
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    # max_iter=1000,
                    n_init=10,
                    n_jobs=-1,
                    random_state=0,
                    verbose=0)
        km.fit_predict(data)
        labels = km.labels_
        joblib.dump(km, modelName)
    return labels, km.cluster_centers_
def elbowMethod(X, k=21):
    distortions = []
    for i in range(1, k):
        km2 = KMeans(n_clusters=i,
                     init='k-means++',
                     n_init=10,
                     random_state=0,
                     n_jobs=-1,
                     verbose=0)
        km2.fit(X)
        distortions.append(km2.inertia_)
        print('k=%s, Distortion: %.2f' % (i, km2.inertia_))

    plt.plot(range(1, k), distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.show()
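A quick self-contained run of elbowMethod() on synthetic blobs; the dataset and the range of k are illustrative, and note that n_jobs was removed from KMeans in recent scikit-learn releases, so that argument may need to be dropped.

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=600, centers=4, cluster_std=1.2, random_state=0)
elbowMethod(X, k=11)   # look for the "elbow" where the distortion curve flattens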
def gridSearch(data, params, true_k):
    tfidf = TfidfVectorizer(strip_accents=None,
                            lowercase=True,
                            sublinear_tf=True,
                            analyzer='word')

    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf', KMeans(init='k-means++',
                                        n_jobs=-1,
                                        random_state=0,
                                        verbose=0))])

    gsTfIdf = GridSearchCV(
        lr_tfidf, params, n_jobs=1, verbose=1)

    gsTfIdf.fit(data)
    print()
    print("Best score: %0.3f" % gsTfIdf.best_score_)
    print("Best parameters set:")
    best_parameters = gsTfIdf.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
def k_means_cluster(data_list):
    if max(data_list[0]) - min(data_list[0]) > 10 and max(data_list[1]) - min(data_list[1]) > 10:
        array_diagnal = np.array([[data_list[0][x], data_list[1][x]] for x in range(len(data_list[0]))])
        ks = list(range(1, min([5, len(data_list[0]) + 1])))
        KMeans = [cluster.KMeans(n_clusters=i, init="k-means++").fit(array_diagnal) for i in ks]
        KMeans_predict = [cluster.KMeans(n_clusters=i, init="k-means++").fit_predict(array_diagnal) for i in ks]
        BIC = []
        BIC_rec = []
        for x in ks:
            if KMeans_predict[x - 1].max() < x - 1:
                continue
            else:
                BIC_i = compute_bic(KMeans[x - 1], array_diagnal)
                if abs(BIC_i) < 10 ** 8:
                    BIC.append(BIC_i)
                    BIC_rec.append(x)
        # BIC = [compute_bic(kmeansi, array_diagnal) for kmeansi in KMeans]
        # ks_picked = ks[BIC.index(max(BIC))]
        ks_picked = BIC_rec[BIC.index(max(BIC))]
        if ks_picked == 1:
            return [data_list]
        else:
            out = []
            std_rec = [scipy.std(data_list[0]), scipy.std(data_list[1])]
            whitened = whiten(array_diagnal)
            centroids, distortion = kmeans(whitened, ks_picked)
            idx, _ = vq(whitened, centroids)
            for x in range(ks_picked):
                group1 = [[int(i) for i in array_diagnal[idx == x, 0]], [int(i) for i in array_diagnal[idx == x, 1]]]
                out.append(group1)
            return out
    else:
        return [data_list]
def kmeans_aic(model, X, **kwargs):
    '''AIC (Akaike Information Criterion) for k-means, used for model selection

    Parameters:
        :model: An elm.pipeline.Pipeline with KMeans or MiniBatchKMeans as the final step in the Pipeline
        :X: The X data that were just given to "fit" or "partial_fit"
        :kwargs: placeholder - ignored

    Returns:
        :AIC: float
    '''
    k, m = model._estimator.cluster_centers_.shape
    if isinstance(X, xr.DataArray):
        n = X.flat.values.shape[0]
    else:
        n = X.shape[0]
    d = model._estimator.inertia_
    aic = d + 2 * m * k
    delattr(model._estimator, 'labels_')
    return aic
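The same score can be checked without the elm Pipeline wrapper; a minimal sketch with a plain scikit-learn KMeans on synthetic data (illustrative only, with inertia_ serving as the deviance term used by kmeans_aic above).

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=4, n_features=3, random_state=0)
km = KMeans(n_clusters=4, random_state=0).fit(X)
k, m = km.cluster_centers_.shape        # number of clusters, number of features
aic = km.inertia_ + 2 * m * k           # same penalty form as kmeans_aic above
print(aic)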
def _init(self, X, lengths=None):
    super(GaussianHMM, self)._init(X, lengths=lengths)

    _, n_features = X.shape
    if hasattr(self, 'n_features') and self.n_features != n_features:
        raise ValueError('Unexpected number of dimensions, got %s but '
                         'expected %s' % (n_features, self.n_features))

    self.n_features = n_features
    if 'm' in self.init_params or not hasattr(self, "means_"):
        kmeans = cluster.KMeans(n_clusters=self.n_components,
                                random_state=self.random_state)
        kmeans.fit(X)
        self.means_ = kmeans.cluster_centers_
    if 'c' in self.init_params or not hasattr(self, "covars_"):
        cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1])
        if not cv.shape:
            cv.shape = (1, 1)
        self._covars_ = distribute_covar_matrix_to_match_covariance_type(
            cv, self.covariance_type, self.n_components).copy()
def ConsensusCluster(self, data, subsamples, subsample_fraction, norm_var, kvalues):
    """
    Performs the consensus clustering algorithm here!!!
    """
    return  # early return: everything below is never executed

    partition = dict()
    stuff = []
    nb_clusters = 0  # the number of clusters the dataset is supposed to be partitioned into
    distances = nx.to_numpy_matrix(data)
    for i in kvalues:
        # nclusters/npass is a Pycluster-style k-means call, not sklearn's KMeans
        clusterid, error, nfound = KMeans(distances, nclusters=i, npass=300)
        uniq_ids = list(set(clusterid))
        new_ids = [uniq_ids.index(val) for val in clusterid]
        for i, value in enumerate(new_ids):
            partition[i] = value
        stuff.append(partition)
def fit(self, data):
    """ fit model on data """
    self.data = data
    kmeans = KMeans(n_clusters=self.n_clusters)
    kmeans.fit(data)
    self.clusterer = kmeans
    logging.info('Fit has been completed')
    self.data_clusters = self.clusterer.predict(data)
    self.cluster_centers = self.clusterer.cluster_centers_
    logging.info('Cluster calculation has been completed')
    self.__clusters_separation()
    logging.info('Cluster separation has been completed')
    self.__cluster_avg_distances()
    logging.info('Cluster avg distances has been calculated')
def stratify_by_features(features, n_strata, **kwargs):
    """Stratify by clustering the items in feature space

    Parameters
    ----------
    features : array-like, shape=(n_items, n_features)
        feature matrix for the pool, where rows correspond to items and
        columns correspond to features.

    n_strata : int
        number of strata to create.

    **kwargs :
        passed to sklearn.cluster.KMeans

    Returns
    -------
    Strata instance
    """
    n_items = features.shape[0]
    km = KMeans(n_clusters=n_strata, **kwargs)
    allocations = km.fit_predict(X=features)
    return Strata(allocations)
def cluster(centers):
    n_class = int(len(centers) * 0.18)
    est = KMeans(n_clusters=n_class, max_iter=1000)
    est.fit(centers)
    new_list = []
    for x, y in est.cluster_centers_:
        min_num = 10000
        min_x = -1
        min_y = -1
        for x_, y_ in centers:
            dist = distance(x, y, x_, y_)
            if (dist < min_num) or (min_x == -1):
                min_num = dist
                min_x = x_
                min_y = y_
        new_list.append([min_x, min_y])
    return new_list
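The nested loop above snaps each k-means centroid to the closest original point. A vectorized alternative sketch with scipy's KD-tree, under the assumption that centers is an (N, 2) array-like of points (the 0.18 ratio is kept from the original).

import numpy as np
from scipy.spatial import cKDTree
from sklearn.cluster import KMeans

def cluster_snap(centers, ratio=0.18):
    # Fit k-means, then replace every centroid with its nearest original point.
    pts = np.asarray(centers, dtype=float)
    n_class = max(1, int(len(pts) * ratio))
    est = KMeans(n_clusters=n_class, max_iter=1000).fit(pts)
    _, idx = cKDTree(pts).query(est.cluster_centers_)
    return pts[idx].tolist()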
def noise_removal(aud_sample):
    if min(abs(aud_sample)) == 0:
        return aud_sample

    data = abs(np.copy(aud_sample))
    clf = KMeans(n_clusters=2, n_init=5)
    data = data.reshape(-1, 1)
    clf.fit(data)

    if clf.cluster_centers_[0] < clf.cluster_centers_[1]:
        noise = 0
    else:
        noise = 1

    aud = np.copy(aud_sample)
    window = 500
    windowStride = 50
    for i in range(0, len(clf.labels_), windowStride):
        if sum(clf.labels_[i:i + window] == noise) == window:
            aud[i:i + window] = 0
    return aud
def calculate():
    from sklearn.metrics import mean_squared_error
    import os

    if not os.path.exists('plots'):
        os.makedirs('plots')

    for k in range(2, 22):
        cluster = KMeans(k, init='k-means++', random_state=241)
        cluster.fit(X)
        reduced_image = recreate_image(cluster.cluster_centers_, cluster.labels_, h, w, d)
        mse = np.mean((image - reduced_image) ** 2)
        psnr = 10 * np.log10(1.0 / mse)
        plot(reduced_image, "plots/plot%d.png" % k)
        print("k: %d, mse: %.2f psnr: %.2f" % (k, mse, psnr))
        if psnr > 20:
            return k
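recreate_image() is not shown in this snippet; a sketch consistent with the call above, rebuilding an (h, w, d) image by looking up each pixel's cluster center (the argument order follows the call site, the implementation itself is assumed).

import numpy as np

def recreate_image(codebook, labels, h, w, d):
    # Map each pixel's cluster label back to its centroid color.
    return np.asarray(codebook)[labels].reshape(h, w, d)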
def evaluate_kmeans(X, model):
    """ Evaluate a K-Means model that has been trained on X using the
    Silhouette score.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
        model: the KMeans model trained on X.

    Returns:
        A double that corresponds to the Silhouette score of the model.
    """
    return silhouette_score(X, model.labels_)
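A small self-contained check of evaluate_kmeans(), using synthetic blobs instead of the TP2 TF-IDF matrix (purely illustrative).

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
model = KMeans(n_clusters=3, random_state=0).fit(X)
print(evaluate_kmeans(X, model))   # values near 1.0 indicate well-separated clusters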
# Ex2
def agglomerative_clustering(X, k=10):
    """ Run an agglomerative clustering on X.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
        k: the number of clusters we want (default: 10).

    Returns:
        An AgglomerativeClustering model trained on X.
    """
    model = AgglomerativeClustering(n_clusters=k)
    model.fit(X)
    # Note: all the other functions are the same, except that we use
    # 'AgglomerativeClustering' instead of 'KMeans'.
    return model
# Ex4.1
def cluster_kmeans(X_train, model_args=None, gridsearch=True):
    from sklearn.cluster import KMeans
    print('KMeans')

    if gridsearch is True:
        param_grid = {
            'n_clusters': np.arange(1, 20, 2),
            'max_iter': [50, 100, 300],
            'tol': [1e-5, 1e-4, 1e-3]
        }
        prune(param_grid, model_args)
    else:
        if 'n_clusters' not in model_args:
            raise KeyError('Need to define n_clusters for KMeans')
        param_grid = None

    return ModelWrapper(KMeans, X=X_train, model_args=model_args, param_grid=param_grid, unsupervised=True)
def getFlatVolume(series_volumes):
    """Return the typical (flat) volume level of a volume series. return: float"""
    results = np.array(series_volumes)
    results_n = np.zeros((len(results), 2))
    results_n[:, 0] = 1
    results_n[:, 1] = np.array(results)
    # Cluster the volumes into 3 groups and return the center of the most populated group
    k = KMeans(3)
    k.fit(results_n)
    df = pd.DataFrame(k.labels_)
    df_c = pd.DataFrame(k.cluster_centers_)
    v = []
    for i in range(3):
        v.append(df[df[0] == i].count()[0])
    df_c[2] = v
    return df_c.iloc[df_c[2].argmax()][1]
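An illustrative call to getFlatVolume() with a synthetic volume series; reading the result as "the center of the most common volume level" is inferred from the code rather than from the original docstring.

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

volumes = list(np.random.RandomState(0).normal(1000, 50, 200)) + [5000, 5200, 10]
print(getFlatVolume(volumes))   # roughly the dominant volume level around 1000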
#
#----------------------------------------------------------------------
def clusterFacetSamplesKNN(self, reduceRatio=3, maxNPnts=5):
    """
    cluster the samples of each facet using k nearest neighbors
    the cluster centers and their corresponding normals will be saved
    in self.objsamplepnts_refcls and self.objsamplenrmals_refcls

    :param reduceRatio: the ratio of points to reduce
    :param maxNPnts: the maximum number of points on a facet
    :return: None

    author: weiwei
    date: 20161129, tsukuba
    """
    self.objsamplepnts_refcls = np.ndarray(shape=(self.facets.shape[0],), dtype=np.object)
    self.objsamplenrmls_refcls = np.ndarray(shape=(self.facets.shape[0],), dtype=np.object)
    for i, facet in enumerate(self.facets):
        self.objsamplepnts_refcls[i] = np.empty(shape=(0, 0))
        self.objsamplenrmls_refcls[i] = np.empty(shape=(0, 0))
        X = self.objsamplepnts_ref[i]
        nX = X.shape[0]
        if nX > reduceRatio:
            # integer division keeps n_clusters an int on Python 3
            kmeans = KMeans(n_clusters=maxNPnts if nX // reduceRatio > maxNPnts else nX // reduceRatio,
                            random_state=0).fit(X)
            self.objsamplepnts_refcls[i] = kmeans.cluster_centers_
            # one copy of the facet normal per cluster center
            self.objsamplenrmls_refcls[i] = np.tile(self.facetnormals[i],
                                                    [self.objsamplepnts_refcls[i].shape[0], 1])