def k_means_cluster_Predict(data_list,info):
array_diagnal=np.array([[data_list[0][x],data_list[1][x]] for x in range(len(data_list[0]))])
ks = list(range(1,len(info)))
KMeans = [cluster.KMeans(n_clusters = i, init="k-means++").fit(array_diagnal) for i in ks]
BIC = [compute_bic(kmeansi,array_diagnal) for kmeansi in KMeans]
ks_picked=ks[BIC.index(max(BIC))]
if ks_picked==1:
return [data_list]
else:
out=[]
std_rec=[scipy.std(data_list[0]),scipy.std(data_list[1])]
whitened = whiten(array_diagnal)
centroids, distortion=kmeans(whitened,ks_picked)
idx,_= vq(whitened,centroids)
for x in range(ks_picked):
group1=[[int(i) for i in array_diagnal[idx==x,0]],[int(i) for i in array_diagnal[idx==x,1]]]
out.append(group1)
return out
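# compute_bic() is called above but is not included in this listing. The following is a
# hedged sketch (illustrative name, not the original implementation) of a BIC score for a
# fitted scikit-learn KMeans model, using the common spherical-Gaussian approximation.
import numpy as np
def compute_bic_sketch(kmeans_model, X):
    centers = kmeans_model.cluster_centers_
    labels = kmeans_model.labels_
    k, (n, d) = centers.shape[0], X.shape
    sizes = np.bincount(labels, minlength=k).astype(float)
    # pooled within-cluster variance estimate (guarded against n == k and empty clusters)
    sse = sum(np.sum((X[labels == i] - centers[i]) ** 2) for i in range(k))
    variance = max(sse / float(n - k) if n > k else 1e-12, 1e-12)
    log_likelihood = sum(
        sizes[i] * np.log(max(sizes[i], 1.0))
        - sizes[i] * np.log(n)
        - 0.5 * sizes[i] * d * np.log(2 * np.pi * variance)
        - 0.5 * (sizes[i] - 1) * d
        for i in range(k)
    )
    # penalise model complexity: k centroids of dimension d plus the shared variance term
    return log_likelihood - 0.5 * k * (d + 1) * np.log(n)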
def kmeans_numpy(d, headers, K, whiten=True):
# assign to A the result of getting the data from your Data object
A = d.get_data(headers)
# assign to W the result of calling vq.whiten on A
W = vq.whiten(A)
# assign to codebook, bookerror the result of calling vq.kmeans with W and K
codebook, bookerror = vq.kmeans(W, K)
# assign to codes, error the result of calling vq.vq with W and the codebook
codes, error = vq.vq(W, codebook)
# return codebook, codes, and error
return codebook, codes, error
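# A minimal, hedged sketch of the same whiten -> kmeans -> vq pipeline on random data
# (illustrative only; the function above expects a Data object rather than a raw array):
# import numpy as np
# from scipy.cluster import vq
# A = np.random.rand(200, 2)
# W = vq.whiten(A)                    # rescale each column to unit variance
# book, distortion = vq.kmeans(W, 3)  # codebook of 3 centroids
# codes, dists = vq.vq(W, book)       # nearest-centroid index and distance per row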
# prep the k-means clustering algorithm by getting initial cluster means
def vector_quantize(data_dict, vs, bins):
codebooks = {}
vq_data = {}
for size in vs.keys():
all_size_data = []
for disease in vs[size]:
all_size_data.extend(data_dict[disease])
#whitened = sp.whiten(all_size_data)
#codebooks[size] = sp.kmeans(whitened, bins)[0]
codebooks[size] = sp.kmeans(np.asarray(all_size_data), bins)[0]
pickle.dump(codebooks,open("all_codebooks.pkl","wb"))
for dis in data_dict.keys():
n = len(data_dict[dis])
m = len(data_dict[dis][0])
vq_data[dis] = map(str,sp.vq(np.reshape(data_dict[dis],(n,m)), codebooks[len(data_dict[dis][0])])[0])
return vq_data
def get_histogram(self, data):
"""
Project the descriptors onto the codebook/vocabulary,
returning the histogram of words
[N x 1] => [1 x K] histogram
"""
if self.method == 'vq' or self.method == 'bow':
code = self.get_code(data)
code_hist = self.bow(data, code, self.K)
elif self.method == 'vlad':
code = self.get_code(data)
code_hist = self.vlad(data, code)
elif self.method == 'fisher':
code = self.get_code(data)
code_hist = self.fisher(data, code)
else:
raise NotImplementedError('''Histogram method %s not implemented. '''
'''Use vq/bow or vlad or fisher!''' % self.method)
return code_hist
def k_means_cluster(data_list):
if max(data_list[0])-min(data_list[0])>10 and max(data_list[1])-min(data_list[1])>10:
array_diagnal=np.array([[data_list[0][x],data_list[1][x]] for x in range(len(data_list[0]))])
ks = list(range(1,min([5,len(data_list[0])+1])))
KMeans = [cluster.KMeans(n_clusters = i, init="k-means++").fit(array_diagnal) for i in ks]
KMeans_predict=[cluster.KMeans(n_clusters = i, init="k-means++").fit_predict(array_diagnal) for i in ks]
BIC=[]
BIC_rec=[]
for x in ks:
if KMeans_predict[x-1].max()<x-1: continue
else:
BIC_i=compute_bic(KMeans[x-1],array_diagnal)
if abs(BIC_i)<10**8:
BIC.append(BIC_i)
BIC_rec.append(x)
#BIC = [compute_bic(kmeansi,array_diagnal) for kmeansi in KMeans]
#ks_picked=ks[BIC.index(max(BIC))]
ks_picked=BIC_rec[BIC.index(max(BIC))]
if ks_picked==1:
return [data_list]
else:
out=[]
std_rec=[scipy.std(data_list[0]),scipy.std(data_list[1])]
whitened = whiten(array_diagnal)
centroids, distortion=kmeans(whitened,ks_picked)
idx,_= vq(whitened,centroids)
for x in range(ks_picked):
group1=[[int(i) for i in array_diagnal[idx==x,0]],[int(i) for i in array_diagnal[idx==x,1]]]
out.append(group1)
return out
else:
return [data_list]
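# Hedged usage sketch for k_means_cluster() on two well-separated groups of points
# (illustrative integer data only; requires compute_bic and the imports used above):
# xs = [1, 2, 3, 50, 51, 52]
# ys = [2, 3, 4, 51, 52, 53]
# groups = k_means_cluster([xs, ys])
# each element of `groups` is a [x_list, y_list] pair holding one detected cluster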
def quantize_net(net, codebook):
layers = codebook.keys()
codes_W = {}
print "================Perform quantization=============="
for layer in layers:
print "Quantize layer:", layer
W = net.params[layer][0].data
codes, _ = scv.vq(W.flatten(), codebook[layer])  # map each weight to the index of its nearest codeword
# codes = stochasitc_quantize2(W.flatten(), codebook[layer])  # alternative: stochastic quantization
codes = np.reshape(codes, W.shape)
codes_W[layer] = np.array(codes, dtype=np.uint32)
# rebuild the quantized weights from the codebook and copy them back into the layer
W_q = np.reshape(codebook[layer][codes], W.shape)
np.copyto(net.params[layer][0].data, W_q)
return codes_W
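# stochasitc_quantize2() is referenced above but not included in this listing. Below is a
# hedged sketch of one common form of stochastic weight quantization: each weight is rounded
# to one of its two neighbouring codewords with probability proportional to proximity. The
# name and exact behaviour are assumptions, not the original implementation.
import numpy as np
def stochastic_quantize_sketch(weights, codebook):
    book = np.sort(np.asarray(codebook, dtype=np.float64))
    w = np.asarray(weights, dtype=np.float64).ravel()
    hi = np.clip(np.searchsorted(book, w), 1, len(book) - 1)  # upper neighbouring codeword
    lo = hi - 1                                               # lower neighbouring codeword
    span = book[hi] - book[lo]
    # probability of rounding up grows linearly as the weight approaches the upper codeword
    p_up = np.clip((w - book[lo]) / np.where(span > 0, span, 1.0), 0.0, 1.0)
    codes = np.where(np.random.rand(len(w)) < p_up, hi, lo)
    return codes  # indices into the *sorted* codebook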
def recover_all(net, dir_t, idx=0):
layers = net.params.keys()
net.copy_from(dir_t + 'caffemodel%d' % idx)
codebook = pickle.load(open(dir_t + 'codebook%d' % idx))
maskCode = {}
codeDict = {}
for layer in layers:
W = net.params[layer][0].data
# quantize the layer weights against its stored codebook
codes, _ = scv.vq(W.flatten(), codebook[layer])
# keep the per-weight code indices, reshaped to the weight tensor's shape
maskCode[layer] = np.reshape(codes, W.shape)
codeBookSize = len(codebook[layer])
a = maskCode[layer].flatten()
b = xrange(len(a))
codeDict[layer] = {}
for i in xrange(len(a)):
# codeDict maps each code index to the flat positions in maskCode where it occurs
codeDict[layer].setdefault(a[i], []).append(b[i])
return codebook, maskCode, codeDict
def voxel_cell_vertex_extraction(img,**kwargs):
shape = np.array(img.shape)
neighborhood_img = []
for x in np.arange(-1,2):
for y in np.arange(-1,2):
for z in np.arange(-1,2):
neighborhood_img.append(img[1+x:shape[0]-1+x,1+y:shape[1]-1+y,1+z:shape[2]-1+z])
neighborhood_img = np.sort(np.transpose(neighborhood_img,(1,2,3,0))).reshape((shape-2).prod(),27)
neighborhoods = np.array(map(np.unique,neighborhood_img))
neighborhood_size = np.array(map(len,neighborhoods)).reshape(shape[0]-2,shape[1]-2,shape[2]-2)
neighborhoods = np.array(neighborhoods).reshape(shape[0]-2,shape[1]-2,shape[2]-2)
vertex_coords = np.where(neighborhood_size==4)
vertex_points = np.transpose(vertex_coords)+1
vertex_cells = np.array([p for p in neighborhoods[vertex_coords]],int)
unique_cell_vertices = array_unique(vertex_cells)
vertices_matching = vq(vertex_cells,unique_cell_vertices)[0]
unique_cell_vertex_points = np.array([np.mean(vertex_points[vertices_matching == v],axis=0) for v in xrange(len(unique_cell_vertices))])
cell_vertex_dict = dict(zip([tuple(c) for c in unique_cell_vertices],list(unique_cell_vertex_points)))
return cell_vertex_dict
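# array_unique() is used above but not defined in this listing; a hedged sketch that returns
# the unique rows of a 2-D array (the original helper may preserve first-occurrence order,
# whereas np.unique sorts the rows lexicographically):
import numpy as np
def array_unique_rows(a):
    return np.unique(np.asarray(a), axis=0)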
def bow_histogram(data, codebook, pts=None, shape=None):
code, dist = vq(data, codebook)
code_hist = bow(data, code, codebook.shape[0])
return code_hist
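# bow() is called above (and via self.bow in get_histogram) but not defined in this listing;
# a hedged sketch of a bag-of-words histogram over K codebook indices. The `data` argument is
# unused here, and whether the original normalizes the counts is an assumption.
import numpy as np
def bow_sketch(data, code, K):
    code_hist, _ = np.histogram(code, bins=np.arange(K + 1))
    return code_hist.astype(np.float32) / max(len(code), 1)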
def get_code(self, data):
"""
Transform the [N x D] data to [N x 1] where n_i \in {1, ... , K}
returns the cluster indices
"""
if self.quantizer == 'vq':
code, dist = vq(data, self.codebook)
elif self.quantizer == 'kdtree':
dist, code = self.index.query(data, k=1)
else:
raise NotImplementedError('Quantizer %s not implemented. Use vq or kdtree!' % self.quantizer)
return code
def quantize_net_with_dict(net, layers, codebook, use_stochastic=False, timing=False):
start_time = time.time()
codeDict = {}  # maps each code index to the flat weight positions that use it
maskCode = {}  # per-layer array of code indices, same shape as the layer weights
for layer in layers:
print "Quantize layer:", layer
W = net.params[layer][0].data
if use_stochastic:
codes = stochasitc_quantize2(W.flatten(), codebook[layer])
else:
codes, _ = scv.vq(W.flatten(), codebook[layer])
W_q = np.reshape(codebook[layer][codes], W.shape)
net.params[layer][0].data[...] = W_q
maskCode[layer] = np.reshape(codes, W.shape)
codeBookSize = len(codebook[layer])
a = maskCode[layer].flatten()
b = xrange(len(a))
codeDict[layer] = {}
for i in xrange(len(a)):
codeDict[layer].setdefault(a[i], []).append(b[i])
if timing:
print "Update codebook time:%f" % (time.time() - start_time)
return codeDict, maskCode
def apply_palette(img, palette, options):
'''Apply the palette to the given image. The first step is to set all
background pixels to the background color; then, nearest-neighbor
matching is used to map each foreground color to the closest one in
the palette.
'''
if not options.quiet:
print(' applying palette...')
bg_color = palette[0]
fg_mask = get_fg_mask(bg_color, img, options)
orig_shape = img.shape
pixels = img.reshape((-1, 3))
fg_mask = fg_mask.flatten()
num_pixels = pixels.shape[0]
labels = np.zeros(num_pixels, dtype=np.uint8)
labels[fg_mask], _ = vq(pixels[fg_mask], palette)
return labels.reshape(orig_shape[:-1])
######################################################################
def quantize(self):
clusters = range(self.centroids.shape[0] + 1)
histograms = {}
for fname in sorted(self.data.keys()):
if self.data[fname] is None: continue
idx,_ = vq(self.data[fname], self.centroids)
histograms[fname], _ = np.histogram(idx, bins=clusters, normed=self.normalize)
return histograms
def sequences(self):
sequences = {}
for fname in sorted(self.data.keys()):
if self.data[fname] is None: continue
idx,_ = vq(self.data[fname], self.centroids)
sequences[fname] = idx
return sequences
def triangle_topomesh(triangles, positions, **kwargs):
triangles = np.array(triangles)
positions = array_dict(positions)
edges = array_unique(np.sort(np.concatenate(triangles[:,triangle_edge_list],axis=0)))
triangle_edges = np.sort(np.concatenate(triangles[:,triangle_edge_list]))
start_time = time()
print "--> Generating triangle topomesh"
triangle_edge_matching = vq(triangle_edges,edges)[0]
triangle_topomesh = PropertyTopomesh(3)
for c in np.unique(triangles):
triangle_topomesh.add_wisp(0,c)
for e in edges:
eid = triangle_topomesh.add_wisp(1)
for pid in e:
triangle_topomesh.link(1,eid,pid)
for t in triangles:
fid = triangle_topomesh.add_wisp(2)
for eid in triangle_edge_matching[3*fid:3*fid+3]:
triangle_topomesh.link(2,fid,eid)
triangle_topomesh.add_wisp(3,0)
for fid in triangle_topomesh.wisps(2):
triangle_topomesh.link(3,0,fid)
triangle_topomesh.update_wisp_property('barycenter',0,positions.values(np.unique(triangles)),keys=np.unique(triangles))
end_time = time()
print "<-- Generating triangle topomesh [",end_time-start_time,"s]"
return triangle_topomesh
def quad_topomesh(quads, positions, faces_as_cells=False, **kwargs):
quads = np.array(quads)
positions = array_dict(positions)
edges = array_unique(np.sort(np.concatenate(quads[:,quad_edge_list],axis=0)))
quad_edges = np.sort(np.concatenate(quads[:,quad_edge_list]))
start_time = time()
print "--> Generating quad topomesh"
quad_edge_matching = vq(quad_edges,edges)[0]
quad_topomesh = PropertyTopomesh(3)
for c in np.unique(quads):
quad_topomesh.add_wisp(0,c)
for e in edges:
eid = quad_topomesh.add_wisp(1)
for pid in e:
quad_topomesh.link(1,eid,pid)
for q in quads:
fid = quad_topomesh.add_wisp(2)
for eid in quad_edge_matching[4*fid:4*fid+4]:
quad_topomesh.link(2,fid,eid)
if not faces_as_cells:
quad_topomesh.add_wisp(3,0)
for fid in quad_topomesh.wisps(2):
quad_topomesh.link(3,0,fid)
else:
for fid in quad_topomesh.wisps(2):
quad_topomesh.add_wisp(3,fid)
quad_topomesh.link(3,fid,fid)
quad_topomesh.update_wisp_property('barycenter',0,positions.values(np.unique(quads)),keys=np.unique(quads))
end_time = time()
print "<-- Generating quad topomesh [",end_time-start_time,"s]"
return quad_topomesh
def implicit_surface(density_field,size,resolution,iso=0.5):
import numpy as np
from scipy.cluster.vq import kmeans, vq
from openalea.container import array_dict
from skimage.measure import marching_cubes
surface_points, surface_triangles = marching_cubes(density_field,iso)
surface_points = (np.array(surface_points))*(size*resolution/np.array(density_field.shape)) - size*resolution/2.
points_ids = np.arange(len(surface_points))
points_to_delete = []
for p,point in enumerate(surface_points):
matching_points = np.sort(np.where(vq(surface_points,np.array([point]))[1] == 0)[0])
if len(matching_points) > 1:
points_to_fuse = matching_points[1:]
for m_p in points_to_fuse:
surface_triangles[np.where(surface_triangles==m_p)] = matching_points[0]
points_to_delete.append(m_p)
points_to_delete = np.unique(points_to_delete)
print len(points_to_delete),"points deleted"
surface_points = np.delete(surface_points,points_to_delete,0)
points_ids = np.delete(points_ids,points_to_delete,0)
surface_triangles = array_dict(np.arange(len(surface_points)),points_ids).values(surface_triangles)
for p,point in enumerate(surface_points):
matching_points = np.where(vq(surface_points,np.array([point]))[1] == 0)[0]
if len(matching_points) > 1:
print p,point
raw_input()
triangles_to_delete = []
for t,triangle in enumerate(surface_triangles):
if len(np.unique(triangle)) < 3:
triangles_to_delete.append(t)
# elif triangle.max() >= len(surface_points):
# triangles_to_delete.append(t)
surface_triangles = np.delete(surface_triangles,triangles_to_delete,0)
return surface_points, surface_triangles
def performance_measure(reference_set,experimental_set,measure='jaccard_index'):
VP = (vq(experimental_set,reference_set)[1]==0).sum()
FP = (vq(experimental_set,reference_set)[1]>0).sum()
FN = (vq(reference_set,experimental_set)[1]>0).sum()
if measure == 'true_positive':
return VP
elif measure == 'precision':
return VP/float(VP+FP)
elif measure == 'recall':
return VP/float(VP+FN)
elif measure == 'dice_index':
return 2*VP / float(2*VP+FP+FN)
elif measure == 'jaccard_index':
return VP/float(VP+FP+FN)
def jaccard_index(reference_set,experimental_set):
VP = (vq(experimental_set,reference_set)[1]==0).sum()
FP = (vq(experimental_set,reference_set)[1]>0).sum()
FN = (vq(reference_set,experimental_set)[1]>0).sum()
return VP/float(VP+FP+FN)
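# Hedged worked example for the vq-based comparison above: experimental points that coincide
# exactly with a reference point get distance 0 (true positives), and the remaining
# nearest-neighbour distances count as false positives / false negatives.
# reference_set    = np.array([[0., 0.], [1., 1.], [2., 2.]])
# experimental_set = np.array([[0., 0.], [1., 1.], [5., 5.]])
# VP = 2, FP = 1, FN = 1, so jaccard_index = 2 / (2 + 1 + 1) = 0.5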
def density_plot(figure,X,Y,color,xlabel="",ylabel="",n_points=10,linewidth=1,marker_size=40.,alpha=1.0,label=""):
font = fm.FontProperties(family = 'Trebuchet', weight ='light')
#font = fm.FontProperties(family = 'CenturyGothic',fname = '/Library/Fonts/Microsoft/Century Gothic', weight ='light')
figure.patch.set_facecolor('white')
axes = figure.add_subplot(111)
# axes.plot(X,Y,linewidth=1,color=tuple(color2),alpha=0.2)
# ratios = (Y-Y.min())/(Y.max()-Y.min())
# X_min = X.mean()-3*X.std()
# X_max = X.mean()+3*X.std()
X_min = np.percentile(X,100/n_points)
X_max = np.percentile(X,100 - 100/n_points)
Y_min = np.percentile(Y,100/n_points)
# Y_min = Y.mean()-3*Y.std()
Y_max = np.percentile(Y,100 - 100/n_points)
X_grid = np.linspace(X_min,X_max,n_points)
Y_grid = np.linspace(Y_min,Y_max,n_points)
X_sampled = X_grid[vq(X,X_grid)[0]]
Y_sampled = Y_grid[vq(Y,Y_grid)[0]]
point_density = {}
for x in np.unique(X_sampled):
point_count = nd.sum(np.ones_like(np.where(X_sampled==x)),Y_sampled[np.where(X_sampled==x)],index=np.unique(Y_sampled))
for i,y in enumerate(np.unique(Y_sampled)):
point_density[(x,y)] = point_count[i]/len(Y)
point_area = np.array([np.pi*10.0*marker_size*point_density[(x,y)]/np.array(point_density.values()).max() for x,y in zip(X_sampled,Y_sampled)])
#colors = np.random.rand(len(X))
colors = np.array([point_density[(x,y)]/np.array(point_density.values()).max() * color for x,y in zip(X_sampled,Y_sampled)])
colors += np.array([(1-point_density[(x,y)]/np.array(point_density.values()).max()) * np.ones(3) for x,y in zip(X_sampled,Y_sampled)])
axes.scatter(X_sampled,Y_sampled,s=point_area,c=colors,linewidth=linewidth,alpha=alpha,label=label)
axes.set_xlim(X_min,X_max)
axes.set_xlabel(xlabel,fontproperties=font, size=10, style='italic')
axes.set_xticklabels(axes.get_xticks(),fontproperties=font, size=12)
axes.set_ylim(Y_min,Y_max)
axes.set_ylabel(ylabel, fontproperties=font, size=10, style='italic')
axes.set_yticklabels(axes.get_yticks(),fontproperties=font, size=12)
def kmeans(x, k):
centroids, dist = _kmeans(x, k)
idx, _ = vq(x,centroids)
return idx, centroids, dist
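# Hedged usage sketch for the kmeans() wrapper above (assumes `_kmeans` is
# scipy.cluster.vq.kmeans imported under that alias; illustrative data only):
# x = np.random.rand(100, 2)
# idx, centroids, dist = kmeans(x, 3)   # idx[i] is the centroid index assigned to row i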
def kmeans(d, headers, K, metric, whiten=True, categories=None):
'''Takes in a Data object, a set of headers, and the number of clusters to create
Computes and returns the codebook, codes and representation errors.
If given an Nx1 matrix of categories, it uses the category labels
to calculate the initial cluster means.
'''
# assign to A the result getting the data given the headers
try:
A = d.get_data(headers)
except AttributeError:
A = d
if whiten:
W = vq.whiten(A)
else:
W = A
codebook = kmeans_init(W, K, categories)
# assign to codebook, codes, errors, the result of calling kmeans_algorithm with W and codebook
codebook, codes, errors = kmeans_algorithm(W, codebook, metric)
# return the codebook, codes, and representation error
return codebook, codes, errors
# test function
def process_audio(self, isTraining, sound_file):
""" Takes in a wav file and outputs labeled observations of the audio
isTraining: bool that is true if the model is being trained
"""
(rate, sig) = wav.read(sound_file)
sig = sig.astype(np.float64)
# MFCC Features. Each row corresponds to MFCC for a frame
mfcc_feat = mfcc(sig, rate)
labeled_obs = vq(mfcc_feat, self.codebook)[0]
self.voice_obs = labeled_obs
def find_dominant_colors(image):
"""Cluster the colors of the image in CLUSTER_NUMBER of clusters. Returns
an array of dominant colors reverse sorted by cluster size.
"""
array = img_as_float(fromimage(image))
# Reshape from MxNx4 to Mx4 array
array = array.reshape(scipy.product(array.shape[:2]), array.shape[2])
# Remove transparent pixels if any (channel 4 is alpha)
if array.shape[-1] > 3:
array = array[array[:, 3] == 1]
# Finding centroids (centroids are colors)
centroids, _ = kmeans(array, CLUSTER_NUMBER)
# Allocate pixel to a centroid cluster
observations, _ = vq(array, centroids)
# Calculate the number of pixels in a cluster
histogram, _ = scipy.histogram(observations, len(centroids))
# Sort centroids by number of pixels in their cluster
sorted_centroids = sorted(zip(centroids, histogram),
key=lambda x: x[1],
reverse=True)
sorted_colors = tuple((couple[0] for couple in sorted_centroids))
return sorted_colors
def kmeans_classify(features, shape, label=True, fill=False):
"""Run the k-means algorithm."""
print("Starting kmeans")
whitened = whiten(features)
init = np.array((whitened.min(0), whitened.mean(0), whitened.max(0)))
codebook, _ = kmeans(whitened, init)
classified, _ = vq(whitened, codebook)
print("Finished kmeans")
return classified
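# Note: passing an array as the second argument to scipy's kmeans() uses it as the initial
# codebook, so kmeans_classify() above always yields three classes, seeded by the minimum,
# mean, and maximum of the whitened features.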
def testHMMs(inp_file="test_input.txt"):
inp_vals = map(float, open(inp_file, "r").read().strip().split(','))
HMMs = pickle.load(open("trained_HMMs_saved.pkl","rb"))
codebooks = pickle.load(open("all_codebooks.pkl","rb"))
vs = pickle.load(open("size_mapping.pkl","rb"))
results = {}
for size in vs.keys():
# organize input data into vectors of particular size
c = 0
vecs = []
for i in range(len(inp_vals)/int(size)):
vecs.append(inp_vals[c:c+size])
c += size
#print vecs
# Vector Quantizing
n = len(vecs)
vq_seq = map(str,sp.vq(np.reshape(vecs,(n,size)), codebooks[size])[0])
if len(vq_seq) > 0:
diseases = vs[size]
for disease in diseases:
HMM_obj = HMM("initial.json")
HMM_obj.A = HMMs[disease]["A"]
HMM_obj.B = HMMs[disease]["B"]
HMM_obj.pi = HMMs[disease]["pi"]
prob = HMM_obj.forward_scaled(vq_seq)
results[disease] = math.exp(prob)
for i in HMMs.keys():
if not results.has_key(i):
results[i] = "Not Enough Data to Predict"
print "RESULTS:"
for dis in results.keys():
print dis.capitalize()+": "+str(results[dis])
def cal_vlad(self, descriptors, centers):
if self.flag is False:
self.load()
if self.stdSlr is not None:
descriptors = self.stdSlr.transform(descriptors)
dimensions = descriptors[0].shape[0]
vlad_vector = np.zeros((len(centers),dimensions), dtype=np.float32)
center_idx, distance = vq(descriptors, centers)
for i,idx in enumerate(center_idx):
vlad_vector[idx] += (descriptors[i] - centers[idx])
vlad_vector = cv2.normalize(vlad_vector)
vlad_vector = vlad_vector.flatten()
return vlad_vector
def flann(cls, feature, words):
return vq(feature, words)
def flann(cls, feature, words):
return vq(feature, words)
def k_means_clustering(instance_array, n_clusters=9, sin_cos = 1, number_of_starts = 30, seed=None, use_scikit=1,**kwargs):
'''
This runs the k-means clustering algorithm as implemented in scipy - change to scikit-learn?
SH: 7May2013
'''
from sklearn.cluster import KMeans
print 'starting kmeans algorithm, k=%d, retries : %d, sin_cos = %d'%(n_clusters,number_of_starts,sin_cos)
if sin_cos==1:
print ' using sine and cosine of the phases'
sin_cos_instances = np.zeros((instance_array.shape[0],instance_array.shape[1]*2),dtype=float)
sin_cos_instances[:,::2]=np.cos(instance_array)
sin_cos_instances[:,1::2]=np.sin(instance_array)
input_array = sin_cos_instances
#code_book,distortion = vq.kmeans(sin_cos_instances, n_clusters,iter=number_of_starts)
#cluster_assignments, point_distances = vq.vq(sin_cos_instances, code_book)
else:
print ' using raw phases'
input_array = instance_array
#code_book,distortion = vq.kmeans(instance_array, n_clusters,iter=number_of_starts)
#cluster_assignments, point_distances = vq.vq(instance_array, code_book)
#pickle.dump(multiple_run_results,file(k_means_output_filename,'w'))
if use_scikit:
print 'using scikit learn'
tmp = KMeans(init='k-means++', n_clusters=n_clusters, n_init = number_of_starts, n_jobs=1, random_state = seed)
cluster_assignments = tmp.fit_predict(input_array)
code_book = tmp.cluster_centers_
else:
print 'using vq from scipy'
code_book,distortion = vq.kmeans(input_array, n_clusters,iter=number_of_starts)
cluster_assignments, point_distances = vq.vq(input_array, code_book)
if sin_cos:
cluster_details = {'k_means_centroids_sc':code_book}
else:
cluster_details = {'k_means_centroids':code_book}
return cluster_assignments, cluster_details
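# Hedged usage sketch for k_means_clustering() (illustrative phase data only):
# phases = np.random.uniform(-np.pi, np.pi, size=(500, 16))
# assignments, details = k_means_clustering(phases, n_clusters=4, sin_cos=1)
# details['k_means_centroids_sc'] then holds the centroids in sin/cos space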
##################################################################################
#############################k-means periodic algorithm##############################