Python whiten() example source code

# Imports assumed by this snippet (not shown in the original listing):
import numpy as np
from scipy.cluster.vq import whiten, kmeans, vq
from sklearn import cluster

def k_means_cluster_Predict(data_list, info):
    # pair the two coordinate lists into an (N, 2) point array
    array_diagnal = np.array([[data_list[0][x], data_list[1][x]] for x in range(len(data_list[0]))])
    ks = list(range(1, len(info)))
    KMeans = [cluster.KMeans(n_clusters=i, init="k-means++").fit(array_diagnal) for i in ks]
    BIC = [compute_bic(kmeansi, array_diagnal) for kmeansi in KMeans]
    ks_picked = ks[BIC.index(max(BIC))]  # k with the highest BIC
    if ks_picked == 1:
        return [data_list]
    else:
        out = []
        std_rec = [np.std(data_list[0]), np.std(data_list[1])]  # scipy.std was removed in SciPy >= 1.0
        whitened = whiten(array_diagnal)
        centroids, distortion = kmeans(whitened, ks_picked)
        idx, _ = vq(whitened, centroids)
        for x in range(ks_picked):
            group1 = [[int(i) for i in array_diagnal[idx == x, 0]],
                      [int(i) for i in array_diagnal[idx == x, 1]]]
            out.append(group1)
        return out
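Both k_means_cluster_Predict above and k_means_cluster further down call a compute_bic(kmeans_model, X) helper that the listing never shows. A minimal sketch of such a helper, assuming the widely used spherical-Gaussian BIC approximation for fitted sklearn KMeans models; the formula and all names below are this sketch's own, not the original project's:

import numpy as np

def compute_bic(kmeans_model, X):
    """Approximate BIC of a fitted sklearn KMeans model on data X (sketch)."""
    centers = kmeans_model.cluster_centers_
    labels = kmeans_model.labels_
    m = kmeans_model.n_clusters
    N, d = X.shape
    n = np.bincount(labels, minlength=m)  # points per cluster
    if N <= m:
        return float('-inf')
    # pooled within-cluster variance under a spherical-Gaussian assumption
    cl_var = (1.0 / (N - m) / d) * sum(
        ((X[labels == i] - centers[i]) ** 2).sum() for i in range(m))
    const_term = 0.5 * m * np.log(N) * (d + 1)
    return np.sum([n[i] * np.log(n[i]) - n[i] * np.log(N)
                   - (n[i] * d / 2.0) * np.log(2 * np.pi * cl_var)
                   - ((n[i] - 1) * d / 2.0)
                   for i in range(m) if n[i] > 0]) - const_term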
# Assumes: from scipy.cluster import vq
def kmeans_numpy(d, headers, K, whiten=True):
    # assign to A the result of getting the data from your Data object
    A = d.get_data(headers)
    # assign to W the result of calling vq.whiten on A
    W = vq.whiten(A)
    # assign to codebook, bookerror the result of calling vq.kmeans with W and K
    codebook, bookerror = vq.kmeans(W, K)
    # assign to codes, error the result of calling vq.vq with W and the codebook
    codes, error = vq.vq(W, codebook)
    # return codebook, codes, and error
    return codebook, codes, error
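kmeans_numpy assumes a Data object exposing get_data(headers) that returns a 2-D numpy array. A hypothetical stand-in, purely to make the call runnable:

import numpy as np
from scipy.cluster import vq

class Data:
    """Toy stand-in for the Data object kmeans_numpy expects (illustrative only)."""
    def __init__(self, columns):
        self._cols = columns  # dict mapping header -> 1-D array
    def get_data(self, headers):
        return np.column_stack([self._cols[h] for h in headers])

d = Data({'x': np.random.rand(100), 'y': np.random.rand(100)})
codebook, codes, error = kmeans_numpy(d, ['x', 'y'], K=3)
print(codebook.shape)  # typically (3, 2): one whitened centroid per cluster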
# Assumes: import pickle; import numpy as np; from scipy.cluster import vq as sp
def vector_quantize(data_dict, vs, bins):
    codebooks = {}
    vq_data = {}
    for size in vs.keys():
        all_size_data = []
        for disease in vs[size]:
            all_size_data.extend(data_dict[disease])
        #whitened = sp.whiten(all_size_data)
        #codebooks[size] = sp.kmeans(whitened, bins)[0]
        codebooks[size] = sp.kmeans(np.asarray(all_size_data), bins)[0]
    pickle.dump(codebooks, open("all_codebooks.pkl", "wb"))
    for dis in data_dict.keys():
        n = len(data_dict[dis])
        m = len(data_dict[dis][0])
        # list(...) keeps this working under Python 3, where map is lazy
        vq_data[dis] = list(map(str, sp.vq(np.reshape(data_dict[dis], (n, m)),
                                           codebooks[len(data_dict[dis][0])])[0]))
    return vq_data
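vector_quantize keys its codebooks by feature-vector length, so diseases whose vectors share a length share a codebook. A hypothetical call, with data shapes invented for illustration:

import numpy as np

data_dict = {'flu':  np.random.rand(20, 4).tolist(),   # 20 vectors of length 4
             'cold': np.random.rand(15, 4).tolist()}
vs = {4: ['flu', 'cold']}                              # group diseases by vector length
coded = vector_quantize(data_dict, vs, bins=8)
print(coded['flu'][:5])                                # first five codeword indices, as strings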
def k_means_cluster(data_list):
    if max(data_list[0]) - min(data_list[0]) > 10 and max(data_list[1]) - min(data_list[1]) > 10:
        array_diagnal = np.array([[data_list[0][x], data_list[1][x]] for x in range(len(data_list[0]))])
        ks = list(range(1, min([5, len(data_list[0]) + 1])))
        KMeans = [cluster.KMeans(n_clusters=i, init="k-means++").fit(array_diagnal) for i in ks]
        KMeans_predict = [cluster.KMeans(n_clusters=i, init="k-means++").fit_predict(array_diagnal) for i in ks]
        BIC = []
        BIC_rec = []
        for x in ks:
            if KMeans_predict[x - 1].max() < x - 1:
                continue  # fewer than x distinct clusters were actually used
            else:
                BIC_i = compute_bic(KMeans[x - 1], array_diagnal)
                if abs(BIC_i) < 10**8:
                    BIC.append(BIC_i)
                    BIC_rec.append(x)
        #BIC = [compute_bic(kmeansi,array_diagnal) for kmeansi in KMeans]
        #ks_picked=ks[BIC.index(max(BIC))]
        ks_picked = BIC_rec[BIC.index(max(BIC))]
        if ks_picked == 1:
            return [data_list]
        else:
            out = []
            std_rec = [np.std(data_list[0]), np.std(data_list[1])]  # scipy.std was removed in SciPy >= 1.0
            whitened = whiten(array_diagnal)
            centroids, distortion = kmeans(whitened, ks_picked)
            idx, _ = vq(whitened, centroids)
            for x in range(ks_picked):
                group1 = [[int(i) for i in array_diagnal[idx == x, 0]],
                          [int(i) for i in array_diagnal[idx == x, 1]]]
                out.append(group1)
            return out
    else:
        return [data_list]
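Both clustering functions above feed the same whiten/kmeans/vq pipeline from scipy.cluster.vq. On toy data the flow looks like this (a self-contained sketch; the values are illustrative):

import numpy as np
from scipy.cluster.vq import whiten, kmeans, vq

points = np.array([[1.0, 1.0], [1.2, 0.9],    # one tight group ...
                   [8.0, 8.1], [8.2, 7.9]])   # ... and another
scaled = whiten(points)                   # divide each column by its std dev
centroids, distortion = kmeans(scaled, 2)
labels, dists = vq(scaled, centroids)     # nearest-centroid index per point
print(labels)                             # e.g. [0 0 1 1] (cluster order may vary)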
# Assumes: import glob; import numpy as np; import scipy.io.wavfile as wav;
# from scipy.cluster.vq import whiten, kmeans2; mfcc e.g. from python_speech_features
def get_mfcc_feat(self):
    # creating codebook with all models
    mfcc_feats = None
    for filename in glob.iglob('../data/voices/*.wav'):
        print(filename)
        (rate, sig) = wav.read(filename)
        # MFCC Features. Each row corresponds to MFCC for a frame
        mfcc_person = mfcc(sig.astype(np.float64), rate)
        if mfcc_feats is None:
            mfcc_feats = mfcc_person
        else:
            mfcc_feats = np.concatenate((mfcc_feats, mfcc_person), axis=0)
    # Normalize the features
    whitened = whiten(mfcc_feats)
    self.codebook, labeled_obs = kmeans2(data=whitened, k=3)
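Once the codebook exists, vq can quantize new (whitened) frames against it. A sketch with synthetic frames standing in for stacked MFCC rows:

import numpy as np
from scipy.cluster.vq import whiten, kmeans2, vq

frames = np.random.rand(200, 13)              # stand-in for stacked MFCC rows
whitened = whiten(frames)
codebook, labels = kmeans2(data=whitened, k=3)
new_labels, _ = vq(whitened[:10], codebook)   # assign 10 frames to codewords
print(new_labels)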
def argparser():
    try:
        import argparse
    except ImportError:
        import compat.argparse as argparse
    ap = argparse.ArgumentParser()
    ap.add_argument('vectors', nargs=1, metavar='FILE', help='word vectors')
    ap.add_argument('-a', '--approximate', default=False, action='store_true',
                    help='filter by approximate similarity (with -t)')
    ap.add_argument('-i', '--min-index', default=0, type=int,
                    help='index of first word (default 0)')
    ap.add_argument('-M', '--metric', default=DEFAULT_METRIC,
                    choices=sorted(metrics.keys()),
                    help='distance metric to apply')
    ap.add_argument('-n', '--normalize', default=False, action='store_true',
                    help='normalize vectors to unit length')
    ap.add_argument('-r', '--max-rank', metavar='INT', default=None,
                    type=int, help='only consider r most frequent words')
    ap.add_argument('-t', '--threshold', metavar='FLOAT', default=None,
                    type=float, help='only output distances <= t')
    ap.add_argument('-T', '--tolerance', metavar='FLOAT', default=0.1,
                    type=float, help='approximation tolerance (with -a)')
    ap.add_argument('-w', '--whiten', default=False, action='store_true',
                    help='normalize features to unit variance')
    ap.add_argument('-W', '--words', default=False, action='store_true',
                    help='output words instead of indices')
    return ap
# Assumes: import logging; import wvlib; from scipy.cluster.vq import whiten
def process_options(args):
    options = argparser().parse_args(args)
    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.threshold is not None and options.threshold < 0.0:
        raise ValueError('threshold must be >= 0')
    if options.tolerance is not None and options.tolerance < 0.0:
        raise ValueError('tolerance must be >= 0')
    if options.approximate and not options.threshold:
        raise ValueError('approximate only makes sense with a threshold')
    if options.approximate and options.metric != 'cosine':
        raise NotImplementedError('approximate only supported for cosine')
    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)
    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()
    words, vectors = wv.words(), wv.vectors()
    if options.whiten:
        # whitening should be implemented in wvlib to support use together
        # with approximate similarity
        if options.approximate:
            raise NotImplementedError
        logging.info('normalize features to unit variance')
        vectors = whiten(vectors)
    return words, vectors, wv, options
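The --whiten option ultimately just rescales each feature column to unit variance: scipy.cluster.vq.whiten is equivalent to dividing by the per-column standard deviation, as this quick check shows.

import numpy as np
from scipy.cluster.vq import whiten

vectors = np.random.rand(1000, 50)
assert np.allclose(whiten(vectors), vectors / vectors.std(axis=0))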
def kmeans(d, headers, K, metric, whiten=True, categories=None):
    '''Takes in a Data object, a set of headers, and the number of clusters to create.
    Computes and returns the codebook, codes and representation errors.
    If given an Nx1 matrix of categories, it uses the category labels
    to calculate the initial cluster means.
    '''
    # assign to A the result of getting the data given the headers
    try:
        A = d.get_data(headers)
    except AttributeError:
        A = d  # d is already a matrix of data
    if whiten:
        W = vq.whiten(A)
    else:
        W = A
    codebook = kmeans_init(W, K, categories)
    # assign to codebook, codes, errors, the result of calling kmeans_algorithm with W and codebook
    codebook, codes, errors = kmeans_algorithm(W, codebook, metric)
    # return the codebook, codes, and representation error
    return codebook, codes, errors
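kmeans_init and kmeans_algorithm are defined elsewhere in the source project. A plausible minimal kmeans_init matching the docstring (per-category means when categories is given, otherwise K random rows); this is a guess at the interface, not the original code:

import numpy as np

def kmeans_init(W, K, categories=None):
    """Hypothetical initializer: per-category means if categories is given,
    otherwise K distinct rows of W chosen at random."""
    if categories is not None:
        cats = np.unique(np.asarray(categories).ravel())
        return np.vstack([W[np.asarray(categories).ravel() == c].mean(axis=0)
                          for c in cats])
    idx = np.random.choice(W.shape[0], size=K, replace=False)
    return W[idx]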
def cluster(matrix):
    whitened = whiten(matrix.todense())
    # for x in range(25, 40):
    #     means, distortion = kmeans(whitened, x)
    #     print distortion
    means, distortion = kmeans(whitened, 30)
    # pickle.dump(means, open('30means-' + sys.argv[1] + '.pkl', 'wb'))
    return means, distortion
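The commented-out loop above is an elbow search over k. Uncommented and made self-contained, it would look roughly like this (range and data are illustrative):

import numpy as np
from scipy.cluster.vq import whiten, kmeans

whitened = whiten(np.random.rand(500, 10))
for k in range(25, 40):
    means, distortion = kmeans(whitened, k)
    print(k, distortion)  # pick the k where distortion stops dropping sharply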
def kmeans_classify(features, shape, label=True, fill=False):
    """Run the k-means algorithm."""
    print("Starting kmeans")
    whitened = whiten(features)
    # seed with the per-feature min, mean and max as three initial centroids
    init = np.array((whitened.min(0), whitened.mean(0), whitened.max(0)))
    codebook, _ = kmeans(whitened, init)
    classified, _ = vq(whitened, codebook)
    print("Finished kmeans")
    return classified
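A hypothetical call on raster-like data, reshaping the returned labels back to the image grid; the feature layout and shape here are invented for illustration:

import numpy as np

features = np.random.rand(100 * 100, 4)             # e.g. 4 bands per pixel
classified = kmeans_classify(features, shape=(100, 100))
class_image = classified.reshape((100, 100))        # back to the image grid
print(class_image.shape)                            # (100, 100), labels typically in {0, 1, 2}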