# Shared imports assumed by the snippets below; note that old scipy versions
# re-export numpy's maximum as sp.maximum (on modern scipy, use np.maximum).
import numpy as np
import scipy as sp

def prior_log_loss(frac_pos, task='binary.classification'):
    ''' Baseline log loss. For multiple classes or labels, return the values for each column.'''
    eps = 1e-15
    frac_pos_ = sp.maximum(eps, frac_pos)
    if task != 'multiclass.classification':  # binary case
        frac_neg = 1 - frac_pos
        frac_neg_ = sp.maximum(eps, frac_neg)
        pos_class_log_loss_ = -frac_pos * np.log(frac_pos_)
        neg_class_log_loss_ = -frac_neg * np.log(frac_neg_)
        base_log_loss = pos_class_log_loss_ + neg_class_log_loss_
        # base_log_loss = mvmean(base_log_loss)
        # print('binary {}'.format(base_log_loss))
        # In the multilabel case, the right thing is to AVERAGE, not sum.
        # We return all the scores so we can normalize correctly later on.
    else:  # multiclass case
        fp = frac_pos_ / sum(frac_pos_)  # Need to renormalize the lines in the multiclass case
        # Only ONE label is active (equal to 1) on each line in the multiclass case
        pos_class_log_loss_ = -frac_pos * np.log(fp)
        base_log_loss = np.sum(pos_class_log_loss_)
    return base_log_loss
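# Usage sketch (illustrative values): the baseline log loss of a balanced
# binary task is -0.5*log(0.5) - 0.5*log(0.5) = log(2) ~= 0.693.
print(prior_log_loss(0.5))  # ~0.693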
def num_lines(filename):
    ''' Count the number of lines in a file.'''
    with open(filename) as f:  # 'with' closes the file handle when done
        return sum(1 for line in f)
def tp_filter(X, Y, feat_num=1000, verbose=True):
    ''' TP feature selection in the spirit of the winners of the KDD cup 2001.
    Only for binary classification and sparse matrices.'''
    from scipy.sparse import issparse, csr_matrix
    if issparse(X) and len(Y.shape) == 1 and len(set(Y)) == 2 and (sum(Y) / Y.shape[0]) < 0.1:
        if verbose:
            print("========= Filtering features...")
        Posidx = Y > 0
        # npos = sum(Posidx)
        # Negidx = Y <= 0
        # nneg = sum(Negidx)
        nz = X.nonzero()
        mx = X[nz].max()
        if X[nz].min() == mx:  # sparse binary
            if mx != 1:
                X[nz] = 1
            tp = csr_matrix.sum(X[Posidx, :], axis=0)
            # fn = npos - tp
            # fp = csr_matrix.sum(X[Negidx, :], axis=0)
            # tn = nneg - fp
        else:
            tp = np.sum(X[Posidx, :] > 0, axis=0)
            # tn = np.sum(X[Negidx, :] <= 0, axis=0)
            # fn = np.sum(X[Posidx, :] <= 0, axis=0)
            # fp = np.sum(X[Negidx, :] > 0, axis=0)
        tp = np.ravel(tp)
        idx = sorted(range(len(tp)), key=tp.__getitem__, reverse=True)
        return idx[0:feat_num]
    else:
        feat_num = X.shape[1]
        return range(feat_num)
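# Usage sketch (illustrative data): a rare positive class (<10% positives)
# on a sparse binary matrix, keeping the 2 features with the most
# true-positive hits.
import numpy as np
from scipy.sparse import csr_matrix
X = csr_matrix((np.random.rand(100, 5) > 0.7).astype(float))
Y = np.zeros(100)
Y[:5] = 1  # 5% positives
print(tp_filter(X, Y, feat_num=2))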
def predict(self, X):
    prediction = self.predict_method(X)
    # Calibrate probabilities
    if self.task != 'regression' and self.postprocessor is not None:
        prediction = self.postprocessor.predict_proba(prediction)
    # Keep only the 2nd column: the 1st is just 1 minus the 2nd
    if self.target_num == 1 and len(prediction.shape) > 1 and prediction.shape[1] > 1:
        prediction = prediction[:, 1]
    # Make sure the normalization is correct
    if self.task == 'multiclass.classification':
        eps = 1e-15
        norma = np.sum(prediction, axis=1)
        for k in range(prediction.shape[0]):
            prediction[k, :] /= sp.maximum(norma[k], eps)
    return prediction
def fit(self, X, Y):
    if len(Y.shape) == 1:
        Y = np.array([Y]).transpose()  # Transform vector into column matrix
        # This is NOT what we want: Y = Y.reshape(-1, 1), because then Y.shape[1] is out of range
    self.n_target = Y.shape[1]  # Num target values = num columns of Y
    self.n_label = len(set(Y.ravel()))  # Num labels = num classes (categories of the categorical variable if n_target==1, or n_target if labels are binary)
    # Create the right number of copies of the predictor instance
    if len(self.predictors) != self.n_target:
        predictorInstance = self.predictors[0]
        self.predictors = [predictorInstance]
        for i in range(1, self.n_target):
            self.predictors.append(copy.copy(predictorInstance))
    # Fit all predictors
    for i in range(self.n_target):
        # Update the desired number of estimators
        if hasattr(self.predictors[i], 'n_estimators'):
            self.predictors[i].n_estimators = self.n_estimators
        # Subsample if desired
        if self.balance:
            pos = Y[:, i] > 0
            neg = Y[:, i] <= 0
            if sum(pos) < sum(neg):
                chosen = pos
                not_chosen = neg
            else:
                chosen = neg
                not_chosen = pos
            num = sum(chosen)
            idx = np.flatnonzero(not_chosen)  # Python 3 fix for the old filter/zip idiom
            np.random.shuffle(idx)
            chosen[idx[0:min(num, len(idx))]] = True
            # Train with the chosen samples
            self.predictors[i].fit(X[chosen, :], Y[chosen, i])
        else:
            self.predictors[i].fit(X, Y[:, i])
    return
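# A minimal standalone sketch of the balancing trick used in fit() above:
# keep every minority-class sample plus an equal-sized random subset of the
# majority class. The names (balanced_mask, y, rng) are illustrative only.
import numpy as np

def balanced_mask(y, seed=0):
    rng = np.random.RandomState(seed)
    pos, neg = y > 0, y <= 0
    chosen, not_chosen = (pos, neg) if pos.sum() < neg.sum() else (neg, pos)
    chosen = chosen.copy()
    num = int(chosen.sum())           # Size of the minority class
    idx = np.flatnonzero(not_chosen)  # Candidate majority-class indices
    rng.shuffle(idx)
    chosen[idx[:min(num, len(idx))]] = True
    return chosen

y = np.array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0])
assert balanced_mask(y).sum() == 4  # 2 positives + 2 sampled negatives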
def get_batch_loss(self, input_batch, output_batch):
    dynet.renew_cg()
    # Dimension: maxSentLength * minibatch_size
    wids = []
    wids_reversed = []
    # List of lists to store whether an input is
    # present (1) / absent (0) for an example at a time step
    # masks = []  # Dimension: maxSentLength * minibatch_size
    # tot_words = 0
    maxSentLength = max([len(sent) for sent in input_batch])
    for j in range(maxSentLength):
        wids.append([(self.src_vocab[sent[j]].i if len(sent) > j else self.src_vocab.END_TOK.i) for sent in input_batch])
        wids_reversed.append([(self.src_vocab[sent[len(sent) - j - 1]].i if len(sent) > j else self.src_vocab.END_TOK.i) for sent in input_batch])
        # mask = [(1 if len(sent) > j else 0) for sent in input_batch]
        # masks.append(mask)
        # tot_words += sum(mask)
    embedded_batch = self.embed_batch_seq(wids)
    embedded_batch_reverse = self.embed_batch_seq(wids_reversed)
    encoded_batch = self.encode_batch_seq(embedded_batch, embedded_batch_reverse)
    # Pass the last hidden state of the encoder to the decoder
    return self.decode_batch(encoded_batch, output_batch)
def plotFields(layer, fieldShape=None, channel=None, figOffset=1, cmap=None, padding=0.01):
    # Receptive Fields Summary
    try:
        W = layer.W
    except AttributeError:  # a raw weight tensor was passed instead of a layer
        W = layer
    wp = W.eval().transpose()
    if len(np.shape(wp)) < 4:  # Fully connected layer; has no spatial shape
        fields = np.reshape(wp, list(wp.shape[0:-1]) + fieldShape)
    else:  # Convolutional layer already has a shape
        features, channels, iy, ix = np.shape(wp)
        if channel is not None:
            fields = wp[:, channel, :, :]
        else:
            fields = np.reshape(wp, [features * channels, iy, ix])
    perRow = int(math.floor(math.sqrt(fields.shape[0])))
    perColumn = int(math.ceil(fields.shape[0] / float(perRow)))
    fig = mpl.figure(figOffset)
    mpl.clf()
    # Using ImageGrid
    from mpl_toolkits.axes_grid1 import ImageGrid
    grid = ImageGrid(fig, 111, nrows_ncols=(perRow, perColumn), axes_pad=padding, cbar_mode='single')
    for i in range(0, np.shape(fields)[0]):
        im = grid[i].imshow(fields[i], cmap=cmap)
    grid.cbar_axes[0].colorbar(im)
    mpl.title('%s Receptive Fields' % layer.name)
    # Old way:
    # fields2 = np.vstack([fields, np.zeros([perRow*perColumn - fields.shape[0]] + list(fields.shape[1:]))])
    # tiled = []
    # for i in range(0, perColumn*perRow, perColumn):
    #     tiled.append(np.hstack(fields2[i:i+perColumn]))
    # tiled = np.vstack(tiled)
    # mpl.figure(figOffset); mpl.clf(); mpl.imshow(tiled, cmap=cmap); mpl.title('%s Receptive Fields' % layer.name); mpl.colorbar()
    mpl.figure(figOffset + 1)
    mpl.clf()
    mpl.imshow(np.sum(np.abs(fields), 0), cmap=cmap)
    mpl.title('%s Total Absolute Input Dependency' % layer.name)
    mpl.colorbar()
def plotFields(layer, fieldShape=None, channel=None, maxFields=25, figName='ReceptiveFields', cmap=None, padding=0.01):
    # Receptive Fields Summary
    W = layer.W
    wp = W.eval().transpose()
    if len(np.shape(wp)) < 4:  # Fully connected layer; has no spatial shape
        fields = np.reshape(wp, list(wp.shape[0:-1]) + fieldShape)
    else:  # Convolutional layer already has a shape
        features, channels, iy, ix = np.shape(wp)
        if channel is not None:
            fields = wp[:, channel, :, :]
        else:
            fields = np.reshape(wp, [features * channels, iy, ix])
    fieldsN = min(fields.shape[0], maxFields)
    perRow = int(math.floor(math.sqrt(fieldsN)))
    perColumn = int(math.ceil(fieldsN / float(perRow)))
    fig = mpl.figure(figName)
    mpl.clf()
    # Using ImageGrid
    from mpl_toolkits.axes_grid1 import ImageGrid
    grid = ImageGrid(fig, 111, nrows_ncols=(perRow, perColumn), axes_pad=padding, cbar_mode='single')
    for i in range(0, fieldsN):
        im = grid[i].imshow(fields[i], cmap=cmap)
    grid.cbar_axes[0].colorbar(im)
    mpl.title('%s Receptive Fields' % layer.name)
    # Old way:
    # fields2 = np.vstack([fields, np.zeros([perRow*perColumn - fields.shape[0]] + list(fields.shape[1:]))])
    # tiled = []
    # for i in range(0, perColumn*perRow, perColumn):
    #     tiled.append(np.hstack(fields2[i:i+perColumn]))
    # tiled = np.vstack(tiled)
    # mpl.figure(figOffset); mpl.clf(); mpl.imshow(tiled, cmap=cmap); mpl.title('%s Receptive Fields' % layer.name); mpl.colorbar()
    mpl.figure(figName + ' Total')
    mpl.clf()
    mpl.imshow(np.sum(np.abs(fields), 0), cmap=cmap)
    mpl.title('%s Total Absolute Input Dependency' % layer.name)
    mpl.colorbar()
def analytic_convolution_gaussian(mu1, covar1, mu2, covar2):
    """
    The analytic convolution of two Gaussians is simply a Gaussian whose
    mean is the sum of the two mean vectors and whose covariance is the
    sum of the two covariance matrices.
    --- INPUT ---
    mu1     The mean of the first Gaussian
    covar1  The covariance matrix of the first Gaussian
    mu2     The mean of the second Gaussian
    covar2  The covariance matrix of the second Gaussian
    """
    muconv = mu1 + mu2
    covarconv = covar1 + covar2
    return muconv, covarconv
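# Sanity-check sketch (values illustrative): convolving N(0, I) with
# N([1, -1], 2I) should yield N([1, -1], 3I).
import numpy as np
mu, covar = analytic_convolution_gaussian(np.zeros(2), np.eye(2),
                                          np.array([1.0, -1.0]), 2 * np.eye(2))
assert np.allclose(mu, [1.0, -1.0]) and np.allclose(covar, 3 * np.eye(2))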
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def inner_product_to_infty(self, gf1, gf2):
    "Inner product on a non-compact domain"
    factors = [s.get_scale_factor() for s in self.stencils]
    factor = np.prod(factors)
    integrand = (factor * gf1 * self.weights2D * gf2 * self.dRdX)
    integrand[-1] = 0  # Drop the last point (the compactified boundary at infinity)
    integral = np.sum(integrand)
    return integral
def get_integration_weights(order, nodes=None):
    """
    Returns the integration weights for Gauss-Lobatto quadrature
    as a function of the order of the polynomial we want to
    represent.
    See: https://en.wikipedia.org/wiki/Gaussian_quadrature
    See: arXiv:gr-qc/0609020v1
    """
    if nodes is None:  # Fixed: np.all(nodes == False) never triggers for nodes=None
        nodes = get_quadrature_points(order)
    # `poly` is a module-level global selecting the polynomial family
    if poly == polynomial.chebyshev.Chebyshev:
        weights = np.empty((order + 1))
        weights[1:-1] = np.pi / order
        weights[0] = np.pi / (2 * order)
        weights[-1] = weights[0]
        return weights
    elif poly == polynomial.legendre.Legendre:
        interior_weights = 2 / ((order + 1) * order * poly.basis(order)(nodes[1:-1])**2)
        boundary_weights = np.array([1 - 0.5 * np.sum(interior_weights)])
        weights = np.concatenate((boundary_weights,
                                  interior_weights,
                                  boundary_weights))
        return weights
    else:
        raise ValueError("Not a known polynomial type.")
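# Standalone check of the Chebyshev-Gauss-Lobatto branch above (no module
# globals needed): for the weight function 1/sqrt(1 - x^2), the weights
# must sum to the integral of 1/sqrt(1 - x^2) over [-1, 1], which is pi.
import numpy as np
order = 8
weights = np.empty(order + 1)
weights[1:-1] = np.pi / order
weights[0] = weights[-1] = np.pi / (2 * order)
assert np.isclose(weights.sum(), np.pi)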
def inner_product(self, gf1, gf2):
    """Calculates the 2D inner product between grid functions
    gf1 and gf2 using the appropriate quadrature rule.
    """
    factors = [s.get_scale_factor() for s in self.stencils]
    factor = np.prod(factors)
    integrand = gf1 * self.weights2D * gf2
    integral_unit_cell = np.sum(integrand)
    integral_physical = integral_unit_cell * factor
    return integral_physical
def compute_rhs(rhs):
    U_dealiased = work[((3,) + FFT.work_shape(dealias), float, 0)]
    curl_dealiased = work[((3,) + FFT.work_shape(dealias), float, 1)]
    for i in range(3):
        U_dealiased[i] = FFT.ifftn(U_hat[i], U_dealiased[i], dealias)
    curl_dealiased = curl(U_hat, curl_dealiased)
    rhs = cross(U_dealiased, curl_dealiased, rhs)
    P_hat[:] = np.sum(rhs * K_over_K2, 0, out=P_hat)  # np.sum: the builtin sum has no `out` argument
    rhs -= P_hat * K        # Pressure projection (enforces incompressibility)
    rhs -= nu * K2 * U_hat  # Viscous diffusion
    return rhs

# Initialize a Taylor-Green vortex
def gof(self):
    # Goodness of Fit: sqrt of (block-size-weighted mean AVE) x (mean R2)
    r2mean = np.mean(self.r2.T[self.endoexo()[0]].values)
    AVEmean = self.AVE().copy()
    totalblock = 0
    for i in range(self.lenlatent):
        block = self.data_[self.Variables['measurement']
                           [self.Variables['latent'] == self.latent[i]]]
        block = len(block.columns.values)
        totalblock += block
        AVEmean[self.latent[i]] = AVEmean[self.latent[i]] * block
    AVEmean = np.sum(AVEmean) / totalblock
    return np.sqrt(AVEmean * r2mean)
def cr(self):
    # Composite Reliability
    composite = pd.DataFrame(0, index=np.arange(1), columns=self.latent)
    for i in range(self.lenlatent):
        block = self.data_[self.Variables['measurement']
                           [self.Variables['latent'] == self.latent[i]]]
        p = len(block.columns)
        if (p != 1):
            cor_mat = np.cov(block.T)
            evals, evecs = np.linalg.eig(cor_mat)
            U, S, V = np.linalg.svd(cor_mat, full_matrices=False)
            indices = np.argsort(evals)
            indices = indices[::-1]
            evecs = evecs[:, indices]
            evals = evals[indices]
            loadings = V[0, :] * np.sqrt(evals[0])
            numerador = np.sum(abs(loadings))**2
            denominador = numerador + (p - np.sum(loadings ** 2))
            cr = numerador / denominador
            composite[self.latent[i]] = cr
        else:
            composite[self.latent[i]] = 1
    composite = composite.T
    return (composite)
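# Worked example of the Composite Reliability formula used above
# (loadings are illustrative): with loadings (0.8, 0.7, 0.6) and p = 3,
# CR = (0.8+0.7+0.6)^2 / ((0.8+0.7+0.6)^2 + (3 - 1.49)) = 4.41/5.92 ~= 0.745
import numpy as np
loadings = np.array([0.8, 0.7, 0.6])
num = np.sum(np.abs(loadings))**2                   # 4.41
den = num + (len(loadings) - np.sum(loadings**2))   # 4.41 + 1.51 = 5.92
assert np.isclose(num / den, 4.41 / 5.92)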
def r2adjusted(self):
    n = len(self.data_)
    r2 = self.r2.values
    r2adjusted = pd.DataFrame(0, index=np.arange(1), columns=self.latent)
    for i in range(self.lenlatent):
        p = sum(self.LVariables['target'] == self.latent[i])
        r2adjusted[self.latent[i]] = r2[i] - \
            (p * (1 - r2[i])) / (n - p - 1)
    return r2adjusted.T
def AVE(self):
    # AVE = Average Variance Extracted (mean of the nonzero communalities per latent variable)
    return self.comunalidades().apply(lambda column: column.sum() / (column != 0).sum())
def fornell(self):
    # Fornell-Larcker criterion: squared LV correlations with AVE on the diagonal
    cor_ = pd.DataFrame.corr(self.fscores)**2
    AVE = self.comunalidades().apply(lambda column: column.sum() / (column != 0).sum())
    for i in range(len(cor_)):
        cor_.iloc[i, i] = AVE.iloc[i]  # .iloc replaces the removed pandas .ix indexer
    return (cor_)
def fitness(self, data_, n_clusters, lvmodel, mvmodel, scheme, regression):
    output = pd.DataFrame(self.genes)
    output.columns = ['Split']
    dataSplit = pd.concat([data_, output], axis=1)
    f1 = []
    results = []
    for i in range(n_clusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, lvmodel, mvmodel, scheme,
                                  regression, 0, 50, HOC='true'))
            sumOuterResid = pd.DataFrame.sum(
                pd.DataFrame.sum(results[i].residuals()[1]**2))
            sumInnerResid = pd.DataFrame.sum(
                pd.DataFrame.sum(results[i].residuals()[2]**2))
            f1.append(sumOuterResid + sumInnerResid)
        except Exception:
            f1.append(10000)  # Penalize splits on which the model fails to fit
    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
from random import uniform  # assumed module-level import in the original file

def roulettewheel(pop, fit):
    fit = fit - min(fit)  # Shift so the worst individual has fitness 0
    sumf = sum(fit)
    if (sumf == 0):
        return pop[0]
    # Cumulative selection probabilities
    prob = [(item + sum(fit[:index])) / sumf for index, item in enumerate(fit)]
    prob_ = uniform(0, 1)
    # print(prob)
    individuo = (int(BinSearch(prob, prob_, 0, len(prob) - 1)))
    return pop[individuo]
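# Usage sketch: numpy fitness values; `BinSearch` is defined elsewhere in
# this project, so a bisect-based stand-in with the same call signature is
# used here (an assumption, not the project's implementation).
import numpy as np
from bisect import bisect_left

def BinSearch(prob, target, lo, hi):
    # Index of the first cumulative probability >= target
    return bisect_left(prob, target, lo, hi + 1)

pop = ['a', 'b', 'c']
fit = np.array([1.0, 3.0, 2.0])  # Cumulative probs become [0, 2/3, 1]
print(roulettewheel(pop, fit))   # 'b' with prob 2/3, 'c' with prob 1/3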