def kmeans_numpy(d, headers, K, whiten=True):
    # assign to A the result of getting the data from your Data object
    A = d.get_data(headers)
    # assign to W the result of calling vq.whiten on A
    W = vq.whiten(A)
    # assign to codebook, bookerror the result of calling vq.kmeans with W and K
    codebook, bookerror = vq.kmeans(W, K)
    # assign to codes, error the result of calling vq.vq with W and the codebook
    codes, error = vq.vq(W, codebook)
    # return codebook, codes, and error
    return codebook, codes, error
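# A minimal standalone sketch of the same whiten -> kmeans -> vq pipeline on
# synthetic data (the imports and the random data are illustrative only, not
# part of the original project):
import numpy as np
import scipy.cluster.vq as vq

A = np.random.rand(100, 3)               # 100 observations, 3 features
W = vq.whiten(A)                         # rescale each column to unit variance
codebook, bookerror = vq.kmeans(W, 4)    # 4 cluster centers, mean distortion
codes, error = vq.vq(W, codebook)        # index of the nearest center per row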
# prep the k-means clustering algorithm by getting initial cluster means
def vector_quantize(data_dict, vs, bins):
    codebooks = {}
    vq_data = {}
    for size in vs.keys():
        all_size_data = []
        for disease in vs[size]:
            all_size_data.extend(data_dict[disease])
        #whitened = sp.whiten(all_size_data)
        #codebooks[size] = sp.kmeans(whitened, bins)[0]
        codebooks[size] = sp.kmeans(np.asarray(all_size_data), bins)[0]
    pickle.dump(codebooks, open("all_codebooks.pkl", "wb"))
    for dis in data_dict.keys():
        n = len(data_dict[dis])
        m = len(data_dict[dis][0])
        vq_data[dis] = map(str, sp.vq(np.reshape(data_dict[dis], (n, m)),
                                      codebooks[len(data_dict[dis][0])])[0])
    return vq_data
def build(self, A, categories, K=None):
    '''Builds the classifier given the data points in A and the categories.'''
    # figure out how many categories there are and get the mapping (np.unique)
    unique, mapping = np.unique(np.array(categories.T), return_inverse=True)
    self.num_classes = len(unique)
    self.num_features = A.shape[0]
    self.categories = categories
    # for each category i, build the set of exemplars
    for i in range(self.num_classes):
        if K is None:
            self.exemplars.append(A[(mapping == i), :])
        else:
            # vq.kmeans returns (codebook, distortion); only the codebook is kept
            codebook, distortion = vq.kmeans(A[(mapping == i), :], K)
            self.exemplars.append(codebook)
    return
def seperate_via_kmeans(state_space, p, K, tau=0.1):
    from scipy.cluster.vq import kmeans
    #centres = np.floor(kmeans2(state_space, K)[0])  # these are the original lines
    # the following are being added as hacks
    #_all_cores_filled_ = False
    #while(_all_cores_filled_ == False):
    #    centres, distributed = kmeans(state_space, K)
    #    print("going into k means" + "we only have " + str(np.max(distributed)))
    #    if np.max(distributed) == K-1:
    #        _all_cores_filled_ = True
    # hack to make just the k-means call work
    centres, stuff = kmeans(state_space, K)
    # hack ends here
    #proportions = partition_algo_distances(state_space, centres, tau)
    proportions = partition_algo_distances_tight(state_space, centres, tau)
    sub_state_space, sub_prob = seperate_via_proportions(state_space, proportions, p)
    return sub_state_space, sub_prob, centres
# test_clustering.py — project: pymatgen-diffusion, author: materialsvirtuallab
def test_cluster(self):
    data = np.random.uniform(size=(10, 5))
    data = list(data)
    d2 = np.random.uniform(size=(10, 5)) + ([5] * 5)
    data.extend(list(d2))
    d2 = np.random.uniform(size=(10, 5)) + ([-5] * 5)
    data.extend(list(d2))
    data = np.array(data)
    k = Kmeans()
    clusters = []
    for i in range(10):
        clusters.append(k.cluster(data, 3))
    c1, l1, ss = min(clusters, key=lambda d: d[2])
    c2, d = kmeans(data, 3)
    same = False
    for a in itertools.permutations(c2):
        if np.allclose(c1, a):
            same = True
            break
    self.assertTrue(same)
def kmeans(X, K):
    km = KMeans(K).fit(X)
    return km.cluster_centers_

def kmeans(X, K):
    return vq.kmeans(X, K)[0]
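# The two wrappers above are interchangeable: both return an array of K cluster
# centers, one via scikit-learn's KMeans and one via scipy.cluster.vq.kmeans.
# A small usage sketch on synthetic data (illustrative only):
import numpy as np
from sklearn.cluster import KMeans
from scipy.cluster import vq

X = np.random.rand(50, 2)
centers_sklearn = KMeans(3).fit(X).cluster_centers_   # shape (3, 2)
centers_scipy = vq.kmeans(X, 3)[0]                     # typically shape (3, 2)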
def kmeans_net(net, layers, num_c=16, initials=None):
    # net: the (Caffe) network whose weights are clustered
    # layers: names of the layers to quantize
    # num_c: number of codebook entries per layer (int or list)
    # initials: initial cluster centers (None, ndarray, or 'random')
    codebook = {}  # per-layer codebooks
    if type(num_c) == type(1):
        num_c = [num_c] * len(layers)
    else:
        assert len(num_c) == len(layers)
    # run k-means on the weights of each layer
    print "==============Perform K-means============="
    for idx, layer in enumerate(layers):
        print "Eval layer:", layer
        W = net.params[layer][0].data.flatten()
        W = W[np.where(W != 0)]  # drop zero-valued weights
        # choose how the initial cluster centers are obtained
        if initials is None:  # Default: uniform sample
            min_W = np.min(W)
            max_W = np.max(W)
            initial_uni = np.linspace(min_W, max_W, num_c[idx] - 1)
            codebook[layer], _ = scv.kmeans(W, initial_uni)
        elif type(initials) == type(np.array([])):
            codebook[layer], _ = scv.kmeans(W, initials)
        elif initials == 'random':
            codebook[layer], _ = scv.kmeans(W, num_c[idx] - 1)
        else:
            raise Exception
        # re-insert 0.0 as the first codebook entry
        codebook[layer] = np.append(0.0, codebook[layer])
        print "codebook size:", len(codebook[layer])
    return codebook
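# kmeans_net only builds the per-layer codebooks. Actually quantizing a weight
# array with one of those codebooks could look like the following sketch; the
# helper name quantize_layer is made up for illustration:
import numpy as np
import scipy.cluster.vq as scv

def quantize_layer(weights, layer_codebook):
    # hypothetical helper: snap every weight to its nearest codebook entry
    flat = weights.flatten().reshape(-1, 1)
    labels, _ = scv.vq(flat, layer_codebook.reshape(-1, 1))
    return layer_codebook[labels].reshape(weights.shape)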
def get_palette(samples, options, return_mask=False, kmeans_iter=40):
    '''Extract the palette for the set of sampled RGB values. The first
    palette entry is always the background color; the rest are determined
    from foreground pixels by running K-means clustering. Returns the
    palette, as well as a mask corresponding to the foreground pixels.
    '''
    if not options.quiet:
        print('  getting palette...')
    bg_color = get_bg_color(samples, 6)
    fg_mask = get_fg_mask(bg_color, samples, options)
    centers, _ = kmeans(samples[fg_mask].astype(np.float32),
                        options.num_colors - 1,
                        iter=kmeans_iter)
    palette = np.vstack((bg_color, centers)).astype(np.uint8)
    if not return_mask:
        return palette
    else:
        return palette, fg_mask
######################################################################
def relevant_moods(song):
    """
    :param song: single song document, taken from a previously filled queue
    :return moods: list of relevant moods
    """
    all_moods = song['moods']
    if len(all_moods) == 0:
        return None
    ordered_score_moods = sorted(all_moods.items(), key=lambda x: x[1], reverse=True)
    # pprint(ordered_score_moods)
    ordered_scores = sorted(all_moods.values(), reverse=True)
    # print(ordered_scores)
    features = np.asarray(ordered_scores)
    codebook, distortion = kmeans(features, 2)
    codebook = sorted(codebook.tolist(), reverse=True)
    moods = []
    for m in ordered_score_moods:
        mood = m[0]
        score = m[1]
        if len(codebook) > 1:
            if abs(score - codebook[0]) < abs(score - codebook[1]):
                moods.append(mood)
            else:
                continue
        else:
            moods.append(mood)
    # print(moods)
    return moods
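# relevant_moods uses a 2-cluster k-means on the 1-D score vector as an adaptive
# threshold: moods whose score is closer to the higher cluster center are kept.
# A standalone illustration with made-up scores (reshaped to a column so the
# scipy call is unambiguous):
import numpy as np
from scipy.cluster.vq import kmeans

scores = np.array([0.9, 0.85, 0.8, 0.2, 0.15, 0.1])
centers = sorted(kmeans(scores.reshape(-1, 1), 2)[0].ravel().tolist(), reverse=True)
kept = [s for s in scores if abs(s - centers[0]) < abs(s - centers[1])]
# kept -> the three high scores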
# ### Querying the DB to retrieve the songs' moods
# create a shared queue for all songs
def kmeans(x, k):
    centroids, dist = _kmeans(x, k)
    idx, _ = vq(x, centroids)
    return idx, centroids, dist
def kmeans(d, headers, K, metric, whiten=True, categories=None):
    '''Takes in a Data object, a set of headers, and the number of clusters to create.
    Computes and returns the codebook, codes and representation errors.
    If given an Nx1 matrix of categories, it uses the category labels
    to calculate the initial cluster means.
    '''
    # assign to A the result of getting the data given the headers
    try:
        A = d.get_data(headers)
    except AttributeError:
        A = d
    if whiten:
        W = vq.whiten(A)
    else:
        W = A
    codebook = kmeans_init(W, K, categories)
    # assign to codebook, codes, errors the result of calling kmeans_algorithm with W and codebook
    codebook, codes, errors = kmeans_algorithm(W, codebook, metric)
    # return the codebook, codes, and representation error
    return codebook, codes, errors
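# The helper kmeans_init referenced above is not shown. A minimal sketch of what
# it might look like, assuming categories (when given) is an Nx1 matrix of
# integer labels and the initial means are the per-category means, otherwise
# K randomly chosen observations (hypothetical, for illustration only):
import numpy as np

def kmeans_init(W, K, categories=None):
    if categories is None:
        # pick K distinct observations at random as the initial means
        idx = np.random.choice(W.shape[0], K, replace=False)
        return W[idx, :]
    labels = np.asarray(categories).flatten()
    # one initial mean per category label: the mean of that category's rows
    return np.vstack([W[labels == c].mean(axis=0) for c in np.unique(labels)])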
# test function
def cluster(matrix):
    whitened = whiten(matrix.todense())
    # for x in range(25, 40):
    #     means, distortion = kmeans(whitened, x)
    #     print distortion
    means, distortion = kmeans(whitened, 30)
    # pickle.dump(means, open('30means-' + sys.argv[1] + '.pkl', 'wb'))
    return means, distortion
def k_means(points, k, **kwargs):
    '''
    Find k centroids that attempt to minimize the k-means problem:
    https://en.wikipedia.org/wiki/Metric_k-center

    Arguments
    ----------
    points:   (n, d) set of points
    k:        int, number of centroids to compute
    **kwargs: passed directly to scipy.cluster.vq.kmeans

    Returns
    ----------
    centroids: (k, d) set of points
    labels:    (n) set of indexes for which points belong to which centroid
    '''
    from scipy.cluster.vq import kmeans
    from scipy.spatial import cKDTree

    points = np.asanyarray(points)
    points_std = points.std(axis=0)
    whitened = points / points_std
    centroids_whitened, distortion = kmeans(whitened, k, **kwargs)
    centroids = centroids_whitened * points_std

    tree = cKDTree(centroids)
    labels = tree.query(points, k=1)[1]
    return centroids, labels
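# A quick usage sketch for k_means; the point data here is illustrative only:
import numpy as np

# two well-separated blobs of 2-D points
pts = np.vstack([np.random.randn(50, 2), np.random.randn(50, 2) + 10.0])
centroids, labels = k_means(pts, 2)
print(centroids.shape)   # (2, 2)
print(labels.shape)      # (100,)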
def find_dominant_colors(image):
    """Cluster the colors of the image in CLUSTER_NUMBER of clusters. Returns
    an array of dominant colors reverse sorted by cluster size.
    """
    array = img_as_float(fromimage(image))
    # Reshape from MxNx4 to Mx4 array
    array = array.reshape(scipy.product(array.shape[:2]), array.shape[2])
    # Remove transparent pixels if any (channel 4 is alpha)
    if array.shape[-1] > 3:
        array = array[array[:, 3] == 1]
    # Finding centroids (centroids are colors)
    centroids, _ = kmeans(array, CLUSTER_NUMBER)
    # Allocate pixel to a centroid cluster
    observations, _ = vq(array, centroids)
    # Calculate the number of pixels in a cluster
    histogram, _ = scipy.histogram(observations, len(centroids))
    # Sort centroids by number of pixels in their cluster
    sorted_centroids = sorted(zip(centroids, histogram),
                              key=lambda x: x[1],
                              reverse=True)
    sorted_colors = tuple((couple[0] for couple in sorted_centroids))
    return sorted_colors
def run_kmeans(img):
    """Run kmeans and plot result."""
    features, shape = get_features(img)
    classified = kmeans_classify(features, shape)
    indices, num_objs = label(classified, shape)
    plot_classes(indices, num_objs)
    globpos = find_point_objects(img.lat, img.lon, indices, num_objs)
    return globpos
def kmeans_classify(features, shape, label=True, fill=False):
    """Run the k-means algorithm."""
    print("Starting kmeans")
    whitened = whiten(features)
    init = np.array((whitened.min(0), whitened.mean(0), whitened.max(0)))
    codebook, _ = kmeans(whitened, init)
    classified, _ = vq(whitened, codebook)
    print("Finished kmeans")
    return classified
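# Passing an array instead of an integer as the second argument of
# scipy.cluster.vq.kmeans uses its rows as the initial centroids, which is what
# kmeans_classify does with the per-feature min/mean/max. A minimal standalone
# illustration on random data:
import numpy as np
from scipy.cluster.vq import kmeans, vq, whiten

data = whiten(np.random.rand(200, 3))
# three deterministic starting centroids instead of an integer k
init = np.array((data.min(0), data.mean(0), data.max(0)))
codebook, distortion = kmeans(data, init)
labels, _ = vq(data, codebook)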
def k_means_clustering(instance_array, n_clusters=9, sin_cos=1, number_of_starts=30, seed=None, use_scikit=1, **kwargs):
    '''
    This runs the k-means clustering algorithm as implemented in scipy - change to scikit-learn?
    SH: 7May2013
    '''
    from sklearn.cluster import KMeans
    print 'starting kmeans algorithm, k=%d, retries : %d, sin_cos = %d'%(n_clusters, number_of_starts, sin_cos)
    if sin_cos == 1:
        print '  using sine and cosine of the phases'
        sin_cos_instances = np.zeros((instance_array.shape[0], instance_array.shape[1]*2), dtype=float)
        sin_cos_instances[:, ::2] = np.cos(instance_array)
        sin_cos_instances[:, 1::2] = np.sin(instance_array)
        input_array = sin_cos_instances
        #code_book, distortion = vq.kmeans(sin_cos_instances, n_clusters, iter=number_of_starts)
        #cluster_assignments, point_distances = vq.vq(sin_cos_instances, code_book)
    else:
        print '  using raw phases'
        input_array = instance_array
        #code_book, distortion = vq.kmeans(instance_array, n_clusters, iter=number_of_starts)
        #cluster_assignments, point_distances = vq.vq(instance_array, code_book)
    #pickle.dump(multiple_run_results, file(k_means_output_filename, 'w'))
    if use_scikit:
        print 'using scikit learn'
        tmp = KMeans(init='k-means++', n_clusters=n_clusters, n_init=number_of_starts, n_jobs=1, random_state=seed)
        cluster_assignments = tmp.fit_predict(input_array)
        code_book = tmp.cluster_centers_
    else:
        print 'using vq from scipy'
        code_book, distortion = vq.kmeans(input_array, n_clusters, iter=number_of_starts)
        cluster_assignments, point_distances = vq.vq(input_array, code_book)
    if sin_cos:
        cluster_details = {'k_means_centroids_sc': code_book}
    else:
        cluster_details = {'k_means_centroids': code_book}
    return cluster_assignments, cluster_details
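# The sin_cos option embeds each angular feature as its (cos, sin) pair so that
# Euclidean k-means respects the 2*pi periodicity of phase data. A tiny
# standalone illustration of that embedding (not part of the original code):
import numpy as np

phases = np.array([[0.1, 3.0], [6.2, -3.1]])   # radians; each column nearly the same angle mod 2*pi
embedded = np.zeros((phases.shape[0], phases.shape[1]*2))
embedded[:, ::2] = np.cos(phases)
embedded[:, 1::2] = np.sin(phases)
# the two rows are now close in Euclidean distance, even though the raw phases look far apart
print(np.linalg.norm(embedded[0] - embedded[1]))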
##################################################################################
#############################k-means periodic algorithm##############################
def fetch_fruitspeech_softmax():
    fs, d, wav_names = fetch_sample_speech_fruit()

    def matcher(name):
        return name.split("/")[1]

    classes = [matcher(wav_name) for wav_name in wav_names]
    all_chars = [c for c in sorted(list(set("".join(classes))))]
    char2code = {v: k for k, v in enumerate(all_chars)}
    vocabulary_size = len(char2code.keys())
    y = []
    for n, cl in enumerate(classes):
        y.append(tokenize_ind(cl, char2code))
    # Is it kosher to kmeans on all the data?
    X, _apply, _re = apply_lpc_softmax_preproc(d)
    """
    for n, Xi in enumerate(X[::8]):
        di = _re(Xi)
        wavfile.write("t_%i.wav" % n, fs, soundsc(di))
    raise ValueError()
    """
    speech = {}
    speech["vocabulary_size"] = vocabulary_size
    speech["vocabulary"] = char2code
    speech["sample_rate"] = fs
    speech["data"] = X
    speech["target"] = y
    speech["reconstruct"] = _re
    return speech
def kmeans_clustering(vectorLayer, attributesList, normalize, clusterNumber, outputFieldName):
    from scipy.cluster.vq import kmeans, vq
    from numpy import array
    fullObjectsList = []
    features = vectorLayer.getFeatures()
    for feature in features:
        fullObjectsList.append([])
        for attribute in attributesList:
            if feature[attribute[0]]:
                fullObjectsList[len(fullObjectsList)-1].append(feature[attribute[0]])
            else:
                fullObjectsList[len(fullObjectsList)-1].append(0)
    # NORMALIZING
    if normalize:
        i = 0
        maxValues = []
        while i < len(attributesList):
            maxValues.append(max(abs(item[i]) for item in fullObjectsList))
            i += 1
        j = 0
        while j < len(fullObjectsList):
            i = 0
            while i < len(fullObjectsList[j]):
                fullObjectsList[j][i] = (fullObjectsList[j][i] * 1.0) / (maxValues[i] * 1.0)
                i += 1
            j += 1
    data = array(fullObjectsList)
    centroids, _ = kmeans(data, clusterNumber, 25)
    idx, _ = vq(data, centroids)
    idx = idx.tolist()
    vectorLayerDataProvider = vectorLayer.dataProvider()
    # Create the field if it does not exist
    if vectorLayer.fieldNameIndex(outputFieldName) == -1:
        vectorLayerDataProvider.addAttributes([QgsField(outputFieldName, QVariant.Int)])
        vectorLayer.updateFields()
    vectorLayer.startEditing()
    attrIdx = vectorLayer.fieldNameIndex(outputFieldName)
    features = vectorLayer.getFeatures()
    i = 0
    for feature in features:
        vectorLayer.changeAttributeValue(feature.id(), attrIdx, int(idx[i]))
        i += 1
    vectorLayer.updateFields()
    vectorLayer.commitChanges()