def _build_gmm(self, data):
"""
Build gmm from data
"""
st = time.time()
self.gmm = GMM(n_components=self.K, covariance_type='diag')
self.gmm.fit(data)
# Setup codebook for closest center lookup
self.codebook = self.gmm.means_
print 'Vocab construction from data %s (%s KB, %s) => GMM %s took %5.3f s' % \
(data.shape, data.nbytes / 1024, data.dtype, self.gmm.means_.shape, time.time() - st)
print 'GMM: %s' % ('GOOD' if np.isfinite(self.gmm.means_).all() else 'BAD')
# Save codebook, and index
self.index_codebook()
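# A minimal standalone sketch of the vocabulary-building step above, outside its class.
# It assumes a legacy scikit-learn (< 0.20) that still ships sklearn.mixture.GMM; the
# `descriptors` array is illustrative dummy data, not part of the original code.
import time
import numpy as np
from sklearn.mixture import GMM

descriptors = np.random.rand(5000, 64)     # e.g. 5000 local descriptors of dimension 64
st = time.time()
vocab = GMM(n_components=64, covariance_type='diag')
vocab.fit(descriptors)
codebook = vocab.means_                    # one row per visual word, used for lookups
print 'GMM vocabulary %s built in %.3f s, finite: %s' % (
    codebook.shape, time.time() - st, np.isfinite(codebook).all())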
def find_centers_gmm(data, gridpoints, n_components=2):
r""" Provided 1D data and a grid of points, return the indices of the points closest to
the centers of an assumed n-modal distribution behind "data"
data : 1D data ndarray, of shape (N, 1)
gridpoints: 1D gridpoints
returns: idxs, igmm
idxs: 1D ndarray
INDICES of gridpoints that are closest to the centers of the gaussians
igmm: gaussian mixture model (sklearn type)
tested = true
"""
igmm = _GMM(n_components=n_components)
igmm.fit(data)
return _np.abs(gridpoints-_np.sort(igmm.means_, 0)).argmin(1), igmm
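# Hedged usage sketch for find_centers_gmm: fit a two-component GMM to synthetic bimodal
# 1D data and recover the grid points closest to the two modes. It reuses the module's
# import aliases (_np for numpy, _GMM for the legacy sklearn.mixture.GMM) assumed above.
import numpy as _np

data = _np.concatenate((_np.random.normal(-2, 0.3, 500),
                        _np.random.normal(3, 0.5, 500))).reshape(-1, 1)
gridpoints = _np.linspace(-5, 5, 200)
idxs, igmm = find_centers_gmm(data, gridpoints, n_components=2)
print(gridpoints[idxs])    # approximately [-2.  3.]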
def parserArguments(parser):
parser.add_argument('--max_descriptors', nargs='*',
type=int, default=[150000],
help='load maximum descriptors')
parser.add_argument('--num_clusters', type=int, default=100,\
help='number of cluster-centers = size of vocabulary')
parser.add_argument('--vocabulary_filename', default='ubm',\
help='write vocabulary to this file')
parser.add_argument('--method', default='gmm',\
choices=['gmm'],
help=('method for clustering'))
parser.add_argument('--iterations', type=int, default=100,\
help=' number of iterations (if gmm, this is the gmm '
'part, not the kmeans-initialization part)')
    parser.add_argument('--update', default='wmc',\
                        help='which parts of the GMM to update: w:weights, m:means, c:covars')
parser.add_argument('--covar_type', default='diag',
choices=['full','diag'],
help='covariance type for gmm')
return parser
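# Hedged usage sketch: attach the options defined above to a fresh ArgumentParser and
# parse an example command line. The argument values are illustrative only.
import argparse

parser = parserArguments(argparse.ArgumentParser(description='GMM vocabulary training'))
args = parser.parse_args(['--num_clusters', '64', '--covar_type', 'diag',
                          '--vocabulary_filename', 'ubm_64'])
print(args)    # Namespace with num_clusters=64, iterations=100, update='wmc', ...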
def test_GMM_attributes():
n_components, n_features = 10, 4
covariance_type = 'diag'
g = mixture.GMM(n_components, covariance_type, random_state=rng)
weights = rng.rand(n_components)
weights = weights / weights.sum()
means = rng.randint(-20, 20, (n_components, n_features))
assert_true(g.n_components == n_components)
assert_true(g.covariance_type == covariance_type)
g.weights_ = weights
assert_array_almost_equal(g.weights_, weights)
g.means_ = means
assert_array_almost_equal(g.means_, means)
covars = (0.1 + 2 * rng.rand(n_components, n_features)) ** 2
g.covars_ = covars
assert_array_almost_equal(g.covars_, covars)
assert_raises(ValueError, g._set_covars, [])
assert_raises(ValueError, g._set_covars,
np.zeros((n_components - 2, n_features)))
assert_raises(ValueError, mixture.GMM, n_components=20,
covariance_type='badcovariance_type')
def test_aic():
# Test the aic and bic criteria
n_samples, n_dim, n_components = 50, 3, 2
X = rng.randn(n_samples, n_dim)
SGH = 0.5 * (X.var() + np.log(2 * np.pi)) # standard gaussian entropy
for cv_type in ['full', 'tied', 'diag', 'spherical']:
g = mixture.GMM(n_components=n_components, covariance_type=cv_type,
random_state=rng, min_covar=1e-7)
g.fit(X)
aic = 2 * n_samples * SGH * n_dim + 2 * g._n_parameters()
bic = (2 * n_samples * SGH * n_dim +
np.log(n_samples) * g._n_parameters())
bound = n_dim * 3. / np.sqrt(n_samples)
assert_true(np.abs(g.aic(X) - aic) / n_samples < bound)
assert_true(np.abs(g.bic(X) - bic) / n_samples < bound)
def _EM_VMM_GMM_maximisation_step(self):
'''Maximisation step SH : 7June2013
'''
self.pi_hat = np.sum(self.zij,axis=0)/float(self.n_instances)
self.mu_list_old = self.mu_list.copy()
self.kappa_list_old = self.kappa_list.copy()
self.mean_list_old = self.mean_list.copy()
self.std_list_old = self.std_list.copy()
for cluster_ident in range(self.n_clusters):
inst_tmp = (self.instance_array_complex.T * self.zij[:,cluster_ident]).T
N= np.sum(self.zij[:,cluster_ident])
        # Calculate the best fit for this cluster, all dimensions at once, using the new approximations
#VMM part
self.kappa_list[cluster_ident,:], self.mu_list[cluster_ident,:], scale_fit1 = EM_VMM_calc_best_fit(inst_tmp, lookup=self.bessel_lookup_table, N=N)
#GMM part
self.std_list[cluster_ident,:], self.mean_list[cluster_ident,:] = EM_GMM_calc_best_fit(self.instance_array_amps, self.zij[:,cluster_ident])
    # Clip parameters to prevent numerically pathological values
self.kappa_list = np.clip(self.kappa_list,0.1,300)
self.std_list = np.clip(self.std_list,0.001,300)
self._EM_VMM_GMM_check_convergence()
def gmm_cv(fv_train,target_train,fv_test,target_test):
    ####---- cross-validation on the training set; grid-search the best parameters for the GMM
n_classes = len(np.unique(target_train))
n_components_max = 7
n_components_range = range(1, n_components_max)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
for n_components in n_components_range:
# Fit a mixture of Gaussians with EM
gmm = mixture.GMM(n_components=n_components, covariance_type=cv_type)
gmm.means_ = np.array([fv_train[target_train == i].mean(axis=0)
for i in xrange(n_classes)])
            gmm.fit(fv_train)
            target_train_pred = gmm.predict(fv_train)
train_accuracy = np.mean(target_train_pred == target_train) * 100
print cv_type, n_components, ' Train accuracy: %.1f' % train_accuracy
def compute_super_codebook(self, feature_size):
'''
Merges the GMMs that were computed for each class
into one GMM (super codebook).
@param feature_size: dimensionality of a feature vector
'''
# Get number of classes
ncl = len(self.labels)
# Compute vocabsize per class
vocab_size_per_cl = max(1, self.vocab_size / ncl)
# Create GMM for overall repr
    print >> sys.stderr, 'Using GMM with', self.vocab_size, 'components and', vocab_size_per_cl, 'per class.'
self.gmm = GMM(self.vocab_size, n_iter=1, params='w', covariance_type='diag')
# Init by fitting with ones
self.gmm.fit(np.ones((self.vocab_size, feature_size)))
# Overwrite values from supervised codebook GMM by class GMMs
index = 0
for _, sgmm in self.gmms.iteritems():
vocab_size_per_cl = len(sgmm.means_)
self.gmm.means_ [index:index + vocab_size_per_cl, :] = sgmm.means_
self.gmm.covars_ [index:index + vocab_size_per_cl, :] = sgmm.covars_
index += vocab_size_per_cl
# Set uniform GMM weights
self.gmm.weights_ = np.ones(self.vocab_size) / float(self.vocab_size)
return
def classify_proba(self, features):
'''
    Return the per-component GMM posterior probabilities for the given features
'''
return self.gmm.predict_proba(features)
def score_samples(self, features):
'''
Return the GMM scores for the features
'''
return self.gmm.score_samples(features)
def train(self, datadict, labels):
'''
Trains the GMM Model with supervised codebooks
and soft quantization
'''
super(BoFModelSuper, self).train(datadict, labels, rand_features=True)
self.featuresize = self.vocab_size
return
def gmmRemovingOutlierForClassifier():
"""
    Use a GMM to remove outliers from the training data; the filtered arrays
    are saved to inputClf_GMMOutlierRemoval/.
    :return: None
"""
# load data
X_train = np.load('inputClf_small/X_train.npy')
y_train = np.load('inputClf_small/y_train.npy')
y_train_price = np.load('inputClf_small/y_train_price.npy')
# classifier initialize
classifier = GMM(n_components=2,covariance_type='full', init_params='wmc', n_iter=20)
# cluster initializing
X_train1 = X_train[np.where(y_train==0)[0], :]
X_train2 = X_train[np.where(y_train==1)[0], :]
cluster1 = KMeans(init='random', n_clusters=1, random_state=0).fit(X_train1)
cluster1 = cluster1.cluster_centers_
cluster2 = KMeans(init='random', n_clusters=1, random_state=0).fit(X_train2)
cluster2 = cluster2.cluster_centers_
clusters = np.concatenate((cluster1, cluster2), axis=0)
classifier.means_ = clusters
# Train the other parameters using the EM algorithm.
classifier.fit(X_train)
# predict
y_train_pred = classifier.predict(X_train)
train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
print "Keep {}% data.".format(train_accuracy)
# keep the data which are not outliers
y_train_pred = y_train_pred.reshape((y_train_pred.shape[0], 1))
X_train = X_train[np.where(y_train==y_train_pred)[0], :]
y_train_price = y_train_price[np.where(y_train==y_train_pred)[0], :]
y_train = y_train[np.where(y_train==y_train_pred)[0], :]
np.save('inputClf_GMMOutlierRemoval/X_train', X_train)
np.save('inputClf_GMMOutlierRemoval/y_train', y_train)
np.save('inputClf_GMMOutlierRemoval/y_train_price', y_train_price)
def zero_center_normalize(df, samples, logInput=False, method='median'):
'''
    Transform the input peptide abundance table to log2 scale and center it at zero.
    Inputs:
        df : dataframe of peptide abundances
        samples: column names of the selected samples
        logInput: whether the input abundances are already in log2 scale
        method: method for estimating the zero point ('median', 'average' or 'GMM')
'''
assert method in ('median', 'average', 'GMM'), \
'Zero centering method has to be among median, average or GMM!'
if not logInput:
# convert abundances to log2 scale
df[samples] = df[samples].apply(np.log2)
if method == 'average':
norm_scale = np.nanmean(df[samples], axis=0)
elif method == 'median':
norm_scale = np.nanmedian(df[samples], axis=0)
elif method == 'GMM':
''' two-component Gaussian mixture model '''
from sklearn.mixture import GMM
gmm = GMM(2)
norm_scale = []
for sp in samples:
v = df[sp].values
v = v[np.logical_not(np.isnan(v))]
v = v[np.logical_not(np.isinf(v))]
try:
                gmm.fit(v.reshape(-1, 1))
vmean = gmm.means_[np.argmin(gmm.covars_)]
norm_scale.append(vmean)
except:
norm_scale.append(np.nanmean(v))
norm_scale = np.array(norm_scale)
df[samples] = df[samples] - norm_scale
return df
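# Hedged usage sketch for zero_center_normalize on a tiny abundance table using the
# 'median' zero point. The column names and values are made up for illustration.
import numpy as np
import pandas as pd

df = pd.DataFrame({'peptide': ['p1', 'p2', 'p3'],
                   'sampleA': [100.0, 400.0, 1600.0],
                   'sampleB': [200.0, 800.0, 3200.0]})
df = zero_center_normalize(df, samples=['sampleA', 'sampleB'], method='median')
print(df[['sampleA', 'sampleB']])    # log2-transformed, median of each column at zero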
def create_random_gmm(n_mix, n_features, covariance_type, prng=0):
prng = check_random_state(prng)
g = GMM(n_mix, covariance_type=covariance_type)
g.means_ = prng.randint(-20, 20, (n_mix, n_features))
g.covars_ = make_covar_matrix(covariance_type, n_mix, n_features)
g.weights_ = normalized(prng.rand(n_mix))
return g
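# Hedged usage sketch for create_random_gmm. The helpers make_covar_matrix and normalized
# are assumed to come from the surrounding test utilities; the stand-ins below are
# illustrative guesses, not the project's originals, and cover the 'diag' case only.
import numpy as np

def normalized(x):
    # stand-in: scale a nonnegative vector so it sums to one
    return x / x.sum()

def make_covar_matrix(covariance_type, n_components, n_features):
    # stand-in: random positive per-dimension variances for 'diag' covariances
    assert covariance_type == 'diag'
    return (0.1 + 2 * np.random.rand(n_components, n_features)) ** 2

g = create_random_gmm(n_mix=3, n_features=4, covariance_type='diag', prng=0)
print(g.means_.shape)    # (3, 4); g.weights_ sums to 1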
def main():
pi = np.array([0.3, 0.5, 0.2])
mu = np.array([[1,1], [-1,-1], [-1,1]])*3
sigma = np.array([
[[1,0], [0,1]],
[[2,0], [0,2]],
[[0.5,0], [0, 0.5]],
])
X, C = generate_data(pi, mu, sigma, 1000)
plt.scatter(X[:,0], X[:,1], c=C, s=100, alpha=0.5)
plt.show()
# sklearn
gmm = GMM(n_components=3, covariance_type='full')
gmm.fit(X)
print "pi:", gmm.weights_
print "mu:", gmm.means_
print "sigma:", gmm.covars_
pi2, mu2, sigma2, L = expectation_maximization(X, len(pi))
print "pi:", pi2
print "mu:", mu2
print "sigma:", sigma2
plt.plot(L)
plt.show()
def show_fit(filename, ax=None):
"""Util function. Show a gaussian fit on nearest distances.
Usage example:
np.mean([show_fit(f) for f in list_naive_npy])
"""
from sklearn.mixture import GMM
X = np.load("{}".format(filename))
dist2nearest = np.array(X).reshape(-1, 1)
if dist2nearest.shape[0] < 2:
print("Cannot fit a Gaussian with two distances.")
return
dist2nearest_2 = -(np.array(sorted(dist2nearest)).reshape(-1, 1))
dist2nearest = np.array(list(dist2nearest_2) +
list(dist2nearest)).reshape(-1, 1)
gmm = GMM(n_components=3)
gmm.fit(dist2nearest)
plt.hist(dist2nearest, bins=50, normed=True)
linspace = np.linspace(-1, 1, 1000)[:, np.newaxis]
plt.plot(linspace, np.exp(gmm.score_samples(linspace)[0]), 'r')
lin = np.linspace(0, 1, 10000)[:, np.newaxis]
pred = gmm.predict(linspace)
argmax = np.argmax(gmm.means_)
idx = np.min(np.where(pred == argmax)[0])
plt.axvline(x=lin[idx], linestyle='--', color='r')
plt.show()
return lin[idx] # threshold
def _gaussian_fit(array):
if array.shape[0] < 2:
logging.error("Cannot fit a Gaussian with two distances.")
return 0
array_2 = -(np.array(sorted(array)).reshape(-1, 1))
array = np.array(list(array_2) + list(array)).reshape(-1, 1)
try:
# new sklearn GMM
gmm = mixture.GaussianMixture(n_components=3,
covariance_type='diag')
gmm.fit(array)
# gmmmean = np.max(gmm.means_)
# gmmsigma = gmm.covariances_[np.argmax(gmm.means_)]
except AttributeError:
# use old sklearn method
gmm = mixture.GMM(n_components=3)
gmm.fit(array)
# gmmmean = np.max(gmm.means_)
# gmmsigma = gmm.covars_[np.argmax(gmm.means_)]
# Extract optimal threshold
plt.hist(array, bins=50, normed=True) # debug, print
lin = np.linspace(0, 1, 10000)[:, np.newaxis]
# plt.plot(lin, np.exp(gmm.score_samples(lin)[0]), 'r')
pred = gmm.predict(lin)
try:
idx = np.min(np.where(pred == np.argmax(gmm.means_))[0])
except ValueError:
# print("Error", np.unique(pred))
idx = 0
plt.axvline(x=lin[idx], linestyle='--', color='r')
# plt.gcf().savefig("threshold_naive{}.png".format(k))
plt.close()
threshold = lin[idx][0] # threshold
# np.save("threshold_naive", threshold)
return threshold
def trainGMM(features, nComp, covType = 'diag'):
Gs = []
for f in features:
gmm = mixture.GMM(n_components=int(nComp), covariance_type = covType)
Gs.append(gmm.fit(f))
return Gs
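# Hedged usage sketch for trainGMM: fit one diagonal-covariance GMM per class feature
# matrix, assuming the legacy sklearn.mixture module imported as `mixture` above.
import numpy as np

features = [np.random.randn(200, 13) + i for i in range(3)]    # three MFCC-like feature sets
models = trainGMM(features, nComp=4, covType='diag')
print([m.means_.shape for m in models])    # [(4, 13), (4, 13), (4, 13)]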
def encodeGMM(method, gmm, data,
normalize=['ssr', 'l2g'], relevance=28, update='wmc',
posteriors=None):
"""
Encoding scheme adapting data from a GMM
parameters:
        method: the encoding method to apply; currently one of
            'supervector', 'fisher', 'vlad'
        gmm: the background GMM (= vocabulary = universal background model)
        data: typically the features, in row-wise order
normalize: global and local normalization schemes
relevance: (gmm-supervector-specific): relevance factor for mixing
update: (gmm-supervector-specific): which parts to use for mixing
posteriors: if given, they won't be computed again
returns: encoded vector
"""
    if posteriors is None:
# compute posteriors from gmm
posteriors = getPosteriors(gmm, data)
if 'supervector' in method:
enc = supervector(gmm, data, posteriors, relevance=relevance,
update=update)
elif 'fisher' in method:
enc = fisherEncode(gmm, data,
posteriors, normalize,
fv_components=update)
elif 'vlad' in method:
enc = vlad(data, gmm.means_, posteriors, gmm.means_.shape[0],
normalize )
else:
raise ValueError('unknown encoding method {}'.format(method))
norm_enc = normalizeEnc(enc, normalize)
return norm_enc
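# Hedged usage sketch for encodeGMM: Fisher-vector encoding of local descriptors against a
# background GMM. It assumes a legacy sklearn.mixture.GMM and that the helpers referenced
# above (getPosteriors, fisherEncode, supervector, vlad, normalizeEnc) live in this module.
import numpy as np
from sklearn.mixture import GMM

ubm = GMM(n_components=16, covariance_type='diag')
ubm.fit(np.random.randn(5000, 64))          # background model = vocabulary
descriptors = np.random.randn(300, 64)      # features of one sample, row-wise
enc = encodeGMM('fisher', ubm, descriptors, normalize=['ssr', 'l2g'], update='wmc')
print(enc.shape)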
def fitGMM(data, num_clusters, iterations, update, covar_type):
print ('fit gmm with num_clusters {}, params {}, n_iter {}, covar_type '
'{}'.format(num_clusters, update, iterations, covar_type))
gmm = mixture.GMM(num_clusters, n_iter=iterations,
params=update, covariance_type=covar_type)
gmm.fit(data)
return gmm
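# Hedged usage sketch for fitGMM, assuming `from sklearn import mixture` with a legacy
# scikit-learn (< 0.20) that still provides mixture.GMM.
import numpy as np

data = np.random.randn(2000, 64)
ubm = fitGMM(data, num_clusters=32, iterations=100, update='wmc', covar_type='diag')
print(ubm.means_.shape)    # (32, 64)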
# Source: methods.py, project South-African-Heart-Disease-data-analysis-using-python (author: khushi4tiwari)
def gmm(X,y,M,C,K=4,cov_type='diag',reps=10):
# Fit Gaussian mixture model
gmm = GMM(n_components=K, covariance_type=cov_type, n_init=reps, params='wmc').fit(X)
cls = gmm.predict(X) # extract cluster labels
cds = gmm.means_ # extract cluster centroids (means of gaussians)
covs = gmm.covars_ # extract cluster shapes (covariances of gaussians)
if cov_type == 'diag':
new_covs = np.zeros([K,M,M])
count = 0
for elem in covs:
temp_m = np.zeros([M,M])
for i in range(len(elem)):
temp_m[i][i] = elem[i]
new_covs[count] = temp_m
count += 1
covs = new_covs
clusterPlot(X,cls,K,C,y,cds,covs)
# Source: methods.py, project South-African-Heart-Disease-data-analysis-using-python (author: khushi4tiwari)
def CVK(X,KRange,covar_type='diag',reps=10):
N, M = X.shape
T = len(KRange)
CVE = np.zeros((T,1))
# K-fold crossvalidation
CV = cross_validation.KFold(N,5,shuffle=True)
for t,K in enumerate(KRange):
print('Fitting model for K={0}\n'.format(K))
# Fit Gaussian mixture model
gmm = GMM(n_components=K, covariance_type=covar_type, n_init=reps, params='wmc').fit(X)
# For each crossvalidation fold
for train_index, test_index in CV:
# extract training and test set for current CV fold
X_train = X[train_index]
X_test = X[test_index]
# Fit Gaussian mixture model to X_train
gmm = GMM(n_components=K, covariance_type=covar_type, n_init=reps, params='wmc').fit(X_train)
# compute negative log likelihood of X_test
CVE[t] += -gmm.score(X_test).sum()
#print CVE[t]
# Plot results
return CVE
#figure(); hold(True)
#plot(KRange, 2*CVE)
#legend(['Crossvalidation'])
#xlabel('K')
#show()
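# Hedged usage sketch for CVK: choose the number of mixture components by cross-validated
# negative log-likelihood. It assumes the legacy GMM and cross_validation imports used above.
import numpy as np

X = np.vstack([np.random.randn(100, 2) + 4, np.random.randn(100, 2) - 4])
KRange = range(1, 6)
CVE = CVK(X, KRange, covar_type='diag', reps=3)
best_K = KRange[int(np.argmin(CVE))]
print('Best K by cross-validation: {0}'.format(best_K))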
def Inner_GMM(SingleGenreSongs,numOfClusters):
means = [item.mean_timbreVec for item in SingleGenreSongs]
means_array = np.array(means)
model = mixture.GMM(n_components = numOfClusters,covariance_type = 'full',n_init = 10)
model.fit(means)
lbls = model.predict(means)
clusters_mean = [[] for i in range(numOfClusters)];
clusters_cov = [[] for i in range(numOfClusters)];
clusters_cnt = [0 for i in range(numOfClusters)];
ret_clusters_mean = [[] for i in range(numOfClusters)];
ret_clusters_cov = [[] for i in range(numOfClusters)];
assert(len(lbls) == len(SingleGenreSongs))
for i in range(len(lbls)):
lb = lbls[i]
clusters_mean[lb].append(np.array(SingleGenreSongs[i].mean_timbreVec))
clusters_cov[lb].append(np.array(SingleGenreSongs[i].cov_timbreVec))
clusters_cnt[lb] += 1
for i in range(numOfClusters):
# print type(clusters_mean[i][0])
ret_clusters_mean[i] = sum(clusters_mean[i])
ret_clusters_cov[i] = sum(clusters_cov[i])
# print type(ret_clusters_mean[i][0])
ret_clusters_mean[i] = [(item)/clusters_cnt[i] for item in ret_clusters_mean[i]]
ret_clusters_cov[i] = [(item)/clusters_cnt[i] for item in ret_clusters_cov[i]]
return ret_clusters_mean,ret_clusters_cov
# return model.means_,model.covars_
def dictionary(self, descriptors, N):
gmm = mixture.GMM(n_components=N, covariance_type='full')
gmm.fit(descriptors)
self.means = np.float32(gmm.means_)
self.covs = np.float32(gmm.covars_)
self.weights = np.float32(gmm.weights_)
return self.means, self.covs, self.weights
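# Hedged standalone sketch of the dictionary step above: estimate a full-covariance GMM
# vocabulary from stacked local descriptors and keep float32 copies of its parameters.
# Assumes a legacy scikit-learn (< 0.20); the descriptor array is illustrative.
import numpy as np
from sklearn import mixture

descriptors = np.random.rand(2000, 16).astype(np.float32)
gmm = mixture.GMM(n_components=8, covariance_type='full')
gmm.fit(descriptors)
means = np.float32(gmm.means_)        # (8, 16)
covs = np.float32(gmm.covars_)        # (8, 16, 16) for 'full' covariances
weights = np.float32(gmm.weights_)    # (8,)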
def test_train(self, params='wmc'):
g = mixture.GMM(n_components=self.n_components,
covariance_type=self.covariance_type)
g.weights_ = self.weights
g.means_ = self.means
g.covars_ = 20 * self.covars[self.covariance_type]
# Create a training set by sampling from the predefined distribution.
X = g.sample(n_samples=100)
g = self.model(n_components=self.n_components,
covariance_type=self.covariance_type,
random_state=rng, min_covar=1e-1,
n_iter=1, init_params=params)
g.fit(X)
# Do one training iteration at a time so we can keep track of
# the log likelihood to make sure that it increases after each
# iteration.
trainll = []
for _ in range(5):
g.params = params
g.init_params = ''
g.fit(X)
trainll.append(self.score(g, X))
g.n_iter = 10
g.init_params = ''
g.params = params
g.fit(X) # finish fitting
# Note that the log likelihood will sometimes decrease by a
# very small amount after it has more or less converged due to
# the addition of min_covar to the covariance (to prevent
# underflow). This is why the threshold is set to -0.5
# instead of 0.
delta_min = np.diff(trainll).min()
self.assertTrue(
delta_min > self.threshold,
"The min nll increase is %f which is lower than the admissible"
" threshold of %f, for model %s. The likelihoods are %s."
% (delta_min, self.threshold, self.covariance_type, trainll))
def test_multiple_init():
    # Test that using multiple inits does not do much worse than a single one
X = rng.randn(30, 5)
X[:10] += 2
g = mixture.GMM(n_components=2, covariance_type='spherical',
random_state=rng, min_covar=1e-7, n_iter=5)
train1 = g.fit(X).score(X).sum()
g.n_init = 5
train2 = g.fit(X).score(X).sum()
assert_true(train2 >= train1 - 1.e-2)
def test_n_parameters():
# Test that the right number of parameters is estimated
n_samples, n_dim, n_components = 7, 5, 2
X = rng.randn(n_samples, n_dim)
n_params = {'spherical': 13, 'diag': 21, 'tied': 26, 'full': 41}
for cv_type in ['full', 'tied', 'diag', 'spherical']:
g = mixture.GMM(n_components=n_components, covariance_type=cv_type,
random_state=rng, min_covar=1e-7, n_iter=1)
g.fit(X)
assert_true(g._n_parameters() == n_params[cv_type])
def test_1d_1component():
    # Test that all of the covariance_types return the same BIC score for
    # 1-dimensional, 1-component fits.
n_samples, n_dim, n_components = 100, 1, 1
X = rng.randn(n_samples, n_dim)
g_full = mixture.GMM(n_components=n_components, covariance_type='full',
random_state=rng, min_covar=1e-7, n_iter=1)
g_full.fit(X)
g_full_bic = g_full.bic(X)
for cv_type in ['tied', 'diag', 'spherical']:
g = mixture.GMM(n_components=n_components, covariance_type=cv_type,
random_state=rng, min_covar=1e-7, n_iter=1)
g.fit(X)
assert_array_almost_equal(g.bic(X), g_full_bic)
def test_verbose_first_level():
# Create sample data
X = rng.randn(30, 5)
X[:10] += 2
g = mixture.GMM(n_components=2, n_init=2, verbose=1)
old_stdout = sys.stdout
sys.stdout = StringIO()
try:
g.fit(X)
finally:
sys.stdout = old_stdout