import scipy as sp
from scipy import linalg

def annopred_inf(beta_hats, pr_sigi, n=1000, reference_ld_mats=None, ld_window_size=100):
    """
    Infinitesimal model with SNP-specific heritabilities derived from annotations,
    used as the initial values for the MCMC of the non-infinitesimal model.
    """
    num_betas = len(beta_hats)
    updated_betas = sp.empty(num_betas)
    for i, wi in enumerate(range(0, num_betas, ld_window_size)):
        start_i = wi
        stop_i = min(num_betas, wi + ld_window_size)
        Li = 1.0 / pr_sigi[start_i: stop_i]   # Per-SNP prior precisions
        D = reference_ld_mats[i]              # Reference LD matrix for this window
        A = n * D + sp.diag(Li)               # Scaled posterior precision matrix
        A_inv = linalg.pinv(A)
        updated_betas[start_i: stop_i] = sp.dot(A_inv * n, beta_hats[start_i: stop_i])  # Adjust the beta_hats
    return updated_betas
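A minimal smoke test with synthetic inputs (identity LD matrices, i.e. independent SNPs; the sizes and values below are made up for illustration):

import scipy as sp

sp.random.seed(42)
m, w = 200, 100
beta_hats = sp.random.normal(0.0, 0.01, m)    # Marginal effect estimates
pr_sigi = sp.repeat(1e-3, m)                  # Annotation-derived per-SNP prior variances
ld_mats = [sp.eye(w) for _ in range(m // w)]  # One LD matrix per window

betas = annopred_inf(beta_hats, pr_sigi, n=10000,
                     reference_ld_mats=ld_mats, ld_window_size=w)
print(betas[:5])  # Shrunken effect estimates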
import scipy as sp

def split_data(self, n, v=5):
    '''
    Split the data into v folds, whatever the number of samples per class.
    Input:
        n : the number of samples
        v : the number of folds
    Output: None
    '''
    step = n // v                         # Number of samples per fold
    sp.random.seed(1)                     # Set the random generator to the same initial state
    t = sp.random.permutation(n)          # Random permutation of the sample indices
    indices = []
    for i in range(v - 1):                # Group the indices into v folds
        indices.append(t[i * step:(i + 1) * step])
    indices.append(t[(v - 1) * step:n])   # The last fold absorbs the remainder
    for i in range(v):
        self.iT.append(sp.asarray(indices[i]))  # Test indices of fold i
        l = list(range(v))                # list() is needed in Python 3, where range has no remove()
        l.remove(i)
        temp = sp.empty(0, dtype=sp.int64)
        for j in l:
            temp = sp.concatenate((temp, sp.asarray(indices[j])))
        self.it.append(temp)              # Training indices of fold i
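The function is written as a method, so a minimal host object is enough to try it out; the CV class below is a hypothetical stand-in for whatever class this method belongs to:

import scipy as sp

class CV(object):
    '''Hypothetical host: only provides the iT/it lists that split_data fills.'''
    def __init__(self):
        self.iT = []  # Test indices per fold
        self.it = []  # Training indices per fold

CV.split_data = split_data  # Attach the function above as a method

cv = CV()
cv.split_data(103, v=5)
print(len(cv.iT), cv.iT[0].size, cv.it[0].size)  # 5 folds; 20 test / 83 train in fold 0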
def glmnetSet(opts = None):
    import scipy
    # default options
    options = {
        "weights" : scipy.empty([0]),
        "offset" : scipy.empty([0]),
        "alpha" : scipy.float64(1.0),
        "nlambda" : scipy.int32(100),
        "lambda_min" : scipy.empty([0]),
        "lambdau" : scipy.empty([0]),
        "standardize" : True,
        "intr" : True,
        "thresh" : scipy.float64(1e-7),
        "dfmax" : scipy.empty([0]),
        "pmax" : scipy.empty([0]),
        "exclude" : scipy.empty([0], dtype = scipy.integer),
        "penalty_factor" : scipy.empty([0]),
        "cl" : scipy.array([[scipy.float64(-scipy.inf)], [scipy.float64(scipy.inf)]]),
        "maxit" : scipy.int32(1e5),
        "gtype" : [],
        "ltype" : 'Newton',
        "standardize_resp" : False,
        "mtype" : 'ungrouped'
    }
    # quick return if no user opts
    if opts is None:
        print('glmnetSet default options:')
        print(options)
        return options
    # if options are passed in by the user, update the defaults with values from opts
    optsInOptions = set(opts.keys()) - set(options.keys())
    if len(optsInOptions) > 0:  # 'opts' keys must be a subset of 'options' keys
        print(optsInOptions, ' : unknown option for glmnetSet')
        raise ValueError('attempting to set glmnet options that are not known to glmnetSet')
    else:
        options = merge_dicts(options, opts)  # merge_dicts is a helper defined elsewhere in the package
    return options
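Typical usage passes a partial dict of overrides. Since merge_dicts lives elsewhere in the glmnet_python package, this sketch supplies a minimal stand-in so the snippet runs on its own:

import scipy

def merge_dicts(*dict_args):
    # Minimal stand-in for the package helper: later dicts take precedence.
    result = {}
    for d in dict_args:
        result.update(d)
    return result

opts = glmnetSet({'alpha': scipy.float64(0.5), 'nlambda': scipy.int32(50)})
print(opts['alpha'], opts['nlambda'], opts['maxit'])  # 0.5 50 100000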
from plinkio import plinkfile
import scipy as sp

def _parse_plink_snps_(genotype_file, snp_indices):
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    num_snps = len(snp_indices)
    raw_snps = sp.empty((num_snps, num_individs), dtype='int8')
    # If these indices are not in order, place each SNP in the right row while parsing.
    snp_order = sp.argsort(snp_indices)
    ordered_snp_indices = list(snp_indices[snp_order])
    ordered_snp_indices.reverse()
    print('Iterating over file to load SNPs')
    snp_i = 0
    next_i = ordered_snp_indices.pop()
    line_i = 0
    max_i = ordered_snp_indices[0] if ordered_snp_indices else next_i  # Guard for the single-SNP case
    while line_i <= max_i:
        if line_i < next_i:
            next(plinkf)
        elif line_i == next_i:
            line = next(plinkf)
            snp = sp.array(line, dtype='int8')
            bin_counts = line.allele_counts()
            if bin_counts[-1] > 0:
                mode_v = sp.argmax(bin_counts[:2])  # Impute missing calls (coded 3) with the most common genotype
                snp[snp == 3] = mode_v
            s_i = snp_order[snp_i]
            raw_snps[s_i] = snp
            if line_i < max_i:
                next_i = ordered_snp_indices.pop()
            snp_i += 1
        line_i += 1
    plinkf.close()
    assert snp_i == len(raw_snps), 'Failed to parse SNPs?'
    num_indivs = len(raw_snps[0])
    freqs = sp.sum(raw_snps, 1, dtype='float32') / (2 * float(num_indivs))
    return raw_snps, freqs
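A hypothetical call; 'genotypes' is a placeholder for the prefix of an existing PLINK .bed/.bim/.fam file set:

import scipy as sp

# 'genotypes' is a placeholder path; genotypes.bed/.bim/.fam must exist on disk.
snp_indices = sp.array([0, 10, 25])
raw_snps, freqs = _parse_plink_snps_('genotypes', snp_indices)
print(raw_snps.shape)  # (3, num_individuals)
print(freqs)           # Allele frequencies of the three SNPs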
import scipy as sp
from scipy import linalg

def learn(self, x, y):
    '''
    Learn the GMM with ridge regularization from training samples.
    Input:
        x : the training samples
        y : the labels
    Output:
        the mean, covariance and proportion of each class, as well as the
        spectral decomposition of the covariance matrices
    '''
    ## Get information from the data
    C = sp.unique(y).shape[0]  # Number of classes
    n = x.shape[0]             # Number of samples
    d = x.shape[1]             # Number of variables
    ## Initialization
    self.ni = sp.empty((C, 1))      # Vector of number of samples for each class
    self.prop = sp.empty((C, 1))    # Vector of proportions
    self.mean = sp.empty((C, d))    # Vector of means
    self.cov = sp.empty((C, d, d))  # Matrix of covariances
    self.Q = sp.empty((C, d, d))    # Matrix of eigenvectors
    self.L = sp.empty((C, d))       # Vector of eigenvalues
    self.classnum = sp.empty(C).astype('uint8')
    ## Learn the parameters of the model for each class
    for c, cR in enumerate(sp.unique(y)):
        j = sp.where(y == cR)[0]
        self.classnum[c] = cR       # Save the original label
        self.ni[c] = float(j.size)
        self.prop[c] = self.ni[c] / n
        self.mean[c, :] = sp.mean(x[j, :], axis=0)
        self.cov[c, :, :] = sp.cov(x[j, :], bias=1, rowvar=0)  # Normalize by ni to be consistent with the update formulae
        # Spectral decomposition, eigenvalues sorted in decreasing order
        L, Q = linalg.eigh(self.cov[c, :, :])
        idx = L.argsort()[::-1]
        self.L[c, :] = L[idx]
        self.Q[c, :, :] = Q[:, idx]
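Again a method, so a minimal host object suffices to try it on synthetic data (the GMMR class here is a hypothetical stand-in; predict() below additionally needs a compute_inverse_logdet method defined elsewhere):

import scipy as sp

class GMMR(object):
    '''Hypothetical empty host; learn() fills in the model attributes.'''
    pass

GMMR.learn = learn  # Attach the function above as a method

sp.random.seed(0)
x = sp.vstack((sp.random.normal(0.0, 1.0, (50, 3)),
               sp.random.normal(3.0, 1.0, (50, 3))))
y = sp.concatenate((sp.ones(50), 2 * sp.ones(50)))

model = GMMR()
model.learn(x, y)
print(model.classnum, model.prop.ravel())  # [1 2] [0.5 0.5]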
def predict(self, xt, tau=None, proba=None):
    '''
    Predict the labels of the samples xt using the learned model.
    Inputs:
        xt: the samples to be classified
    Outputs:
        yp: the predicted classes
        K: the decision value for each class (returned only if proba is set)
    '''
    ## Get information from the data
    nt = xt.shape[0]      # Number of testing samples
    C = self.ni.shape[0]  # Number of classes
    ## Initialization
    K = sp.empty((nt, C))
    if tau is None:
        TAU = self.tau
    else:
        TAU = tau
    for c in range(C):
        invCov, logdet = self.compute_inverse_logdet(c, TAU)
        cst = logdet - 2 * sp.log(self.prop[c])  # Precompute the constant term
        xtc = xt - self.mean[c, :]
        temp = sp.dot(invCov, xtc.T).T
        K[:, c] = sp.sum(xtc * temp, axis=1) + cst
        del temp, xtc
    ## Assign the label saved in classnum to the minimum value of K
    yp = self.classnum[sp.argmin(K, 1)]
    if proba is None:
        return yp
    else:
        return yp, K
import scipy
# glmnetPredict comes from the same glmnet_python package

def glmnetCoef(obj, s = None, exact = False):
    if s is None:
        s = obj['lambdau']
    if exact and len(s) > 0:
        raise NotImplementedError('exact = True is not implemented in glmnetCoef')
    result = glmnetPredict(obj, scipy.empty([0]), s, 'coefficients')
    return result
import scipy as sp
import scipy.io

def fake_mldata(columns_dict, dataname, matfile, ordering=None):
    """Create a fake mldata data set.

    Parameters
    ----------
    columns_dict : dict, keys=str, values=ndarray
        Contains data as columns_dict[column_name] = array of data.
    dataname : string
        Name of data set.
    matfile : string or file object
        The file name string or the file-like object of the output file.
    ordering : list, default None
        List of column_names, determines the ordering in the data set.

    Notes
    -----
    This function transposes all arrays, while fetch_mldata only transposes
    'data'; keep that in mind in the tests.
    """
    datasets = dict(columns_dict)
    # transpose all variables
    for name in datasets:
        datasets[name] = datasets[name].T
    if ordering is None:
        ordering = sorted(list(datasets.keys()))
    # NOTE: setting up this array is tricky, because of the way Matlab
    # re-packages 1D arrays
    datasets['mldata_descr_ordering'] = sp.empty((1, len(ordering)),
                                                 dtype='object')
    for i, name in enumerate(ordering):
        datasets['mldata_descr_ordering'][0, i] = name
    scipy.io.savemat(matfile, datasets, oned_as='column')
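A small round trip through an in-memory .mat buffer shows the layout the function produces:

import scipy as sp
import scipy.io
from io import BytesIO

buf = BytesIO()
fake_mldata({'data': sp.random.rand(5, 3), 'label': sp.arange(5)}, 'toy', buf)
buf.seek(0)
mat = scipy.io.loadmat(buf)
print(mat['data'].shape)             # (3, 5): transposed on write
print(mat['mldata_descr_ordering'])  # The column ordering, stored as a Matlab cell array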
import scipy as sp
import scipy.linalg as spla
from scipy.stats import multivariate_normal

def fit(self, X):
    # Constants
    max_iter = self.max_iter
    tol = self.tol
    n_samples, n_features = X.shape
    n_components = self.n_components
    # Initialize parameters
    self._init_params(X)
    # Initialize
    responsibility = sp.empty((n_samples, n_components))
    log_likelihood = self.log_likelihood(X)
    for it in range(max_iter):
        # E step: posterior probability of each component for each sample
        for n in range(n_samples):
            for k in range(n_components):
                responsibility[n][k] = self.pdf(X[n], k) / self.pdf(X[n])
        # M step
        eff = sp.sum(responsibility, axis=0)  # Effective number of samples per component
        for k in range(n_components):
            # Update mean
            mean = sp.dot(responsibility[:, k], X) / eff[k]
            # Update covariance
            cov = sp.zeros((n_features, n_features))
            for n in range(n_samples):
                cov += responsibility[n][k] * sp.outer(X[n] - mean, X[n] - mean)
            cov /= eff[k]
            # Update the k-th component
            self.comps[k] = multivariate_normal(mean, cov, allow_singular=True)
            # Update mixture coefficient
            self.coef[k] = eff[k] / n_samples
        # Convergence test
        log_likelihood_new = self.log_likelihood(X)
        diff = log_likelihood_new - log_likelihood
        print('GMM: {0:5d}: {1:10.5e} {2:10.5e}'.format(
            it, log_likelihood_new, spla.norm(diff) / spla.norm(log_likelihood)))
        if spla.norm(diff) < tol * spla.norm(log_likelihood):
            break
        log_likelihood = log_likelihood_new
    return self
import sys
from osgeo import gdal
import scipy as sp

def open_data(filename):
    '''
    Open and load the image given its name.
    The data type is read from the file and the scipy array is initialized accordingly.
    Input:
        filename: the name of the file
    Output:
        im: the data cube
        GeoTransform: the geotransform information
        Projection: the projection information
    '''
    data = gdal.Open(filename, gdal.GA_ReadOnly)
    if data is None:
        print('Impossible to open ' + filename)
        sys.exit(1)
    nc = data.RasterXSize  # Number of columns
    nl = data.RasterYSize  # Number of lines
    d = data.RasterCount   # Number of bands
    # Get the type of the data
    gdal_dt = data.GetRasterBand(1).DataType
    if gdal_dt == gdal.GDT_Byte:
        dt = 'uint8'
    elif gdal_dt == gdal.GDT_Int16:
        dt = 'int16'
    elif gdal_dt == gdal.GDT_UInt16:
        dt = 'uint16'
    elif gdal_dt == gdal.GDT_Int32:
        dt = 'int32'
    elif gdal_dt == gdal.GDT_UInt32:
        dt = 'uint32'
    elif gdal_dt == gdal.GDT_Float32:
        dt = 'float32'
    elif gdal_dt == gdal.GDT_Float64:
        dt = 'float64'
    elif gdal_dt in (gdal.GDT_CInt16, gdal.GDT_CInt32, gdal.GDT_CFloat32, gdal.GDT_CFloat64):
        dt = 'complex64'
    else:
        print('Data type unknown')
        sys.exit(1)
    # Initialize the array and read the bands
    if d == 1:
        im = sp.empty((nl, nc), dtype=dt)
        im[:, :] = data.GetRasterBand(1).ReadAsArray()
    else:
        im = sp.empty((nl, nc, d), dtype=dt)
        for i in range(d):
            im[:, :, i] = data.GetRasterBand(i + 1).ReadAsArray()
    GeoTransform = data.GetGeoTransform()
    Projection = data.GetProjection()
    data = None  # Close the GDAL dataset
    return im, GeoTransform, Projection
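A hypothetical call; 'scene.tif' is a placeholder for any raster GDAL can read:

# 'scene.tif' is a placeholder filename for this sketch.
im, gt, proj = open_data('scene.tif')
print(im.shape, im.dtype)
print(gt)  # (originX, pixelWidth, 0, originY, 0, pixelHeight)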