def remove_artifacts(self, image):
"""
Remove the connected components that are not within the parameters
Operates in place
:param image: sudoku's thresholded image w/o grid
:return: None
"""
labeled, features = label(image, structure=CROSS)
lbls = np.arange(1, features + 1)
areas = extract_feature(image, labeled, lbls, np.sum,
np.uint32, 0)
sides = extract_feature(image, labeled, lbls, min_side,
np.float32, 0, True)
diags = extract_feature(image, labeled, lbls, diagonal,
np.float32, 0, True)
for index in lbls:
area = areas[index - 1] / 255
side = sides[index - 1]
diag = diags[index - 1]
if side < 5 or side > 20 \
or diag < 15 or diag > 25 \
or area < 40:
image[labeled == index] = 0
return None
def evaluate(self, dataset):
predictions = self.predict(dataset[:,0])
confusion_matrix = sklearn_confusion_matrix(dataset[:,1], predictions, labels=self.__classes)
precisions = []
recalls = []
accuracies = []
for gender in self.__classes:
idx = self.__classes_indexes[gender]
precision = 1
recall = 1
if np.sum(confusion_matrix[idx,:]) > 0:
precision = confusion_matrix[idx][idx]/np.sum(confusion_matrix[idx,:])
if np.sum(confusion_matrix[:, idx]) > 0:
recall = confusion_matrix[idx][idx]/np.sum(confusion_matrix[:, idx])
precisions.append(precision)
recalls.append(recall)
precision = np.mean(precisions)
recall = np.mean(recalls)
f1 = (2*(precision*recall))/float(precision+recall)
accuracy = np.sum(confusion_matrix.diagonal())/float(np.sum(confusion_matrix))
return precision, recall, accuracy, f1
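# A minimal standalone sketch of macro-averaged precision/recall from a confusion
# matrix, assuming sklearn_confusion_matrix above is sklearn's confusion_matrix
# (rows = true labels, columns = predictions); the label values here are made up.
import numpy as np
from sklearn.metrics import confusion_matrix

y_true = ['m', 'f', 'm', 'm', 'f']
y_pred = ['m', 'f', 'f', 'm', 'f']
cm = confusion_matrix(y_true, y_pred, labels=['m', 'f'])
recall = cm.diagonal() / cm.sum(axis=1)      # per-class recall: diagonal over row sums
precision = cm.diagonal() / cm.sum(axis=0)   # per-class precision: diagonal over column sums
print(precision.mean(), recall.mean())       # macro-averaged scores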
def reshape_array(array, newsize, pixcombine='sum'):
    """
    Reshape an array to a given size using either the sum, mean or median of the pixels binned
    Note that the old array dimensions have to be multiples of the new array dimensions
    --- INPUT ---
    array        Array to reshape (combine pixels)
    newsize      New size of array
    pixcombine   The method to combine the pixels with. Choices are sum, mean and median
    """
    sh = newsize[0], array.shape[0]//newsize[0], newsize[1], array.shape[1]//newsize[1]
    if pixcombine == 'sum':
        reshapedarray = array.reshape(sh).sum(-1).sum(1)
    elif pixcombine == 'mean':
        reshapedarray = array.reshape(sh).mean(-1).mean(1)
    elif pixcombine == 'median':
        # ndarrays have no .median() method; reduce the block axes with np.median instead
        reshapedarray = np.median(np.median(array.reshape(sh), axis=-1), axis=1)
    else:
        raise ValueError("pixcombine must be 'sum', 'mean' or 'median'")
    return reshapedarray
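# A minimal usage sketch of the rebinning above: a 4x4 array reduced to 2x2 by summing
# 2x2 pixel blocks (the input values are arbitrary and only for illustration).
import numpy as np

arr = np.arange(16, dtype=float).reshape(4, 4)
rebinned = reshape_array(arr, (2, 2), pixcombine='sum')
print(rebinned)   # [[10. 18.] [42. 50.]] -- each output pixel is the sum of one 2x2 block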
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
output = pd.DataFrame(population[item].position)
output.columns = ['Split']
dataSplit = pd.concat([data, output], axis=1)
f1 = []
results = []
for i in range(nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
# print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_ga(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
output = pd.DataFrame(population[item].genes)
output.columns = ['Split']
dataSplit = pd.concat([data, output], axis=1)
f1 = []
results = []
for i in range(nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
return (1 / np.sum(f1))
# Main
def xloads(self):
# Xloadings
A = self.data_.transpose().values
B = self.fscores.transpose().values
A_mA = A - A.mean(1)[:, None]
B_mB = B - B.mean(1)[:, None]
ssA = (A_mA**2).sum(1)
ssB = (B_mB**2).sum(1)
xloads_ = (np.dot(A_mA, B_mB.T) /
np.sqrt(np.dot(ssA[:, None], ssB[None])))
xloads = pd.DataFrame(
xloads_, index=self.manifests, columns=self.latent)
return xloads
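# A small self-contained check of the cross-loading formula above: it reproduces the
# Pearson correlation of every manifest column with every latent-score column
# (toy random data; the array names X and F are hypothetical stand-ins).
import numpy as np

X = np.random.RandomState(1).normal(size=(50, 3))   # stand-in for the manifest data
F = np.random.RandomState(2).normal(size=(50, 2))   # stand-in for the factor scores
A_mA = X.T - X.T.mean(1)[:, None]
B_mB = F.T - F.T.mean(1)[:, None]
ssA = (A_mA**2).sum(1)
ssB = (B_mB**2).sum(1)
xloads = np.dot(A_mA, B_mB.T) / np.sqrt(np.dot(ssA[:, None], ssB[None]))
print(np.allclose(xloads[0, 0], np.corrcoef(X[:, 0], F[:, 0])[0, 1]))   # True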
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
output = pd.DataFrame(population[item].position)
output.columns = ['Split']
dataSplit = pd.concat([data, output], axis=1)
f1 = []
results = []
for i in range(nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_pso(self, item):
output = pd.DataFrame(self.population[item].position)
output.columns = ['Split']
dataSplit = pd.concat([self.data, output], axis=1)
f1 = []
results = []
for i in range(self.nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
self.reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_tabu(self, item):
output = pd.DataFrame(self.population[item])
output.columns = ['Split']
dataSplit = pd.concat([self.data, output], axis=1)
f1 = []
results = []
for i in range(self.nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
self.reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
cost = (np.sum(f1))
print(1 / cost)
return [self.population[item], cost]
def KMO(data):
cor_ = pd.DataFrame.corr(data)
invCor = np.linalg.inv(cor_)
rows = cor_.shape[0]
cols = cor_.shape[1]
A = np.ones((rows, cols))
for i in range(rows):
for j in range(i, cols):
A[i, j] = - (invCor[i, j]) / (np.sqrt(invCor[i, i] * invCor[j, j]))
A[j, i] = A[i, j]
num = np.sum(np.sum((cor_)**2)) - np.sum(np.sum(np.diag(cor_**2)))
den = num + (np.sum(np.sum(A**2)) - np.sum(np.sum(np.diag(A**2))))
kmo = num / den
return kmo
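# A quick usage sketch of KMO on a small synthetic dataset; the column names are
# hypothetical and the exact value depends on the random draw.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
common = rng.normal(size=(200, 1))
df = pd.DataFrame(common + 0.5 * rng.normal(size=(200, 3)),
                  columns=['x1', 'x2', 'x3'])
print(KMO(df))   # values closer to 1 suggest the data are suited to factor analysis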
def xavier_initializer(shape):
dim_sum = np.sum(shape)
if len(shape) == 1:
dim_sum += 1
bound = np.sqrt(2.0 / dim_sum)
return tf.random_uniform(shape, minval=-bound, maxval=bound)
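# A minimal usage sketch (TensorFlow 1.x graph API, matching tf.random_uniform above):
# a hypothetical 64x32 weight matrix whose entries stay within the Xavier bound
# sqrt(2 / (64 + 32)).
import numpy as np
import tensorflow as tf

w_op = xavier_initializer([64, 32])
with tf.Session() as sess:
    w = sess.run(w_op)
    print(w.shape, np.abs(w).max() <= np.sqrt(2.0 / 96))   # (64, 32) True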
# # Assigning network variables to target network variables
# def assign_network_to_target():
# update_wfc = tf.assign(w_fc_target, w_fc)
# update_bfc = tf.assign(b_fc_target, b_fc)
# sess.run(update_wfc)
# sess.run(update_bfc)
# cell_target = cell
# Input
def forward(self, x):
x = F.relu(F.max_pool2d(self.conv1(x), 2))
# A different (control flow based) way to control dropout
if self.training:
x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
else:
x = F.relu(F.max_pool2d(self.conv2(x), 2))
x = x.view(-1, 320)
x = F.relu(self.fc1(x))
if self.training:
x = F.dropout(x, training=True)
x = self.fc2(x)
# Check for NaNs and infinites
nans = np.sum(np.isnan(x.data.numpy()))
infs = np.sum(np.isinf(x.data.numpy()))
    if nans > 0:
        print("There are {} NaN values at the output layer".format(nans))
    if infs > 0:
        print("There are {} infinite values at the output layer".format(infs))
return F.log_softmax(x)
def test():
model.eval()
test_loss = 0
correct = 0
for data, target in test_loader:
if args.cuda:
data, target = data.cuda(), target.cuda()
data, target = Variable(data, volatile=True), Variable(target)
output = model(data)
test_loss += F.nll_loss(output, target, size_average=False).data[0] # sum up batch loss
pred = output.data.max(1)[1] # get the index of the max log-probability
correct += pred.eq(target.data.view_as(pred)).cpu().sum()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
def score_samples(self, X):
"""Return the log-likelihood of each sample
See. "Pattern Recognition and Machine Learning"
by C. Bishop, 12.2.1 p. 574
or http://www.miketipping.com/papers/met-mppca.pdf
Parameters
----------
X: array, shape(n_samples, n_features)
The data.
Returns
-------
ll: array, shape (n_samples,)
Log-likelihood of each sample under the current model
"""
check_is_fitted(self, 'mean_')
X = check_array(X)
Xr = X - self.mean_
n_features = X.shape[1]
log_like = np.zeros(X.shape[0])
precision = self.get_precision()
log_like = -.5 * (Xr * (np.dot(Xr, precision))).sum(axis=1)
log_like -= .5 * (n_features * log(2. * np.pi)
- fast_logdet(precision))
return log_like
def main():
files = tf.gfile.Glob(flags.FLAGS.src_path_1)
labels_uni = np.zeros([4716,1])
labels_matrix = np.zeros([4716,4716])
for file in files:
labels_all = get_video_input_feature(file)
print(len(labels_all[0][2]),len(labels_all[0][3]),len(labels_all[0][4]),len(labels_all[0][5]))
"""
for labels in labels_all:
for i in range(len(labels)):
labels_uni[labels[i]] += 1
for j in range(len(labels)):
labels_matrix[labels[i],labels[j]] += 1
labels_matrix = labels_matrix/labels_uni
labels_matrix = labels_matrix/(np.sum(labels_matrix,axis=0)-1.0)
for i in range(4716):
labels_matrix[i,i] = 1.0
np.savetxt('labels_uni.out', labels_uni, delimiter=',')
np.savetxt('labels_matrix.out', labels_matrix, delimiter=',')"""
def calculate_gap(predictions, actuals, top_k=20):
"""Performs a local (numpy) calculation of the global average precision.
Only the top_k predictions are taken for each of the videos.
Args:
predictions: Matrix containing the outputs of the model.
Dimensions are 'batch' x 'num_classes'.
actuals: Matrix containing the ground truth labels.
Dimensions are 'batch' x 'num_classes'.
top_k: How many predictions to use per video.
Returns:
float: The global average precision.
"""
gap_calculator = ap_calculator.AveragePrecisionCalculator()
sparse_predictions, sparse_labels, num_positives = top_k_by_class(predictions, actuals, top_k)
gap_calculator.accumulate(flatten(sparse_predictions), flatten(sparse_labels), sum(num_positives))
return gap_calculator.peek_ap_at_n()
def format_lines(video_ids, predictions, labels, top_k):
batch_size = len(video_ids)
for video_index in range(batch_size):
n_recall = max(int(numpy.sum(labels[video_index])), 1)
# labels
label_indices = numpy.argpartition(labels[video_index], -n_recall)[-n_recall:]
label_predictions = [(class_index, predictions[video_index][class_index])
for class_index in label_indices]
label_predictions = sorted(label_predictions, key=lambda p: -p[1])
label_str = "\t".join(["%d\t%f"%(x,y) for x,y in label_predictions])
# predictions
top_k_indices = numpy.argpartition(predictions[video_index], -top_k)[-top_k:]
top_k_predictions = [(class_index, predictions[video_index][class_index])
for class_index in top_k_indices]
top_k_predictions = sorted(top_k_predictions, key=lambda p: -p[1])
top_k_str = "\t".join(["%d\t%f"%(x,y) for x,y in top_k_predictions])
# compute PERR
top_n_indices = numpy.argpartition(predictions[video_index], -n_recall)[-n_recall:]
positives = [labels[video_index][class_index]
for class_index in top_n_indices]
perr = sum(positives) / float(n_recall)
# URL
url = "https://www.youtube.com/watch?v=" + video_ids[video_index].decode('utf-8')
yield url + "\t" + str(1-perr) + "\t" + top_k_str + "\t" + label_str + "\n"
def getTrainKernel(self, params):
self.checkParams(params)
if (self.sameParams(params)): return self.cache['getTrainKernel']
ell = np.exp(params[0])
if (self.K_sq is None): K = sq_dist(self.X_scaled.T / ell) #precompute squared distances
else: K = self.K_sq / ell**2
self.cache['K_sq_scaled'] = K
# # # #manual computation (just for sanity checks)
# # # K1 = np.exp(-K / 2.0)
# # # K2 = np.zeros((self.X_scaled.shape[0], self.X_scaled.shape[0]))
# # # for i1 in xrange(self.X_scaled.shape[0]):
# # # for i2 in xrange(i1, self.X_scaled.shape[0]):
# # # diff = self.X_scaled[i1,:] - self.X_scaled[i2,:]
# # # K2[i1, i2] = np.exp(-np.sum(diff**2) / (2*ell))
# # # K2[i2, i1] = K2[i1, i2]
# # # print np.max((K1-K2)**2)
# # # sys.exit(0)
K_exp = np.exp(-K / 2.0)
self.cache['getTrainKernel'] = K_exp
self.saveParams(params)
return K_exp
def getTrainTestKernel(self, params, Xtest):
self.checkParams(params)
ell2 = np.exp(2*params[0])
z = Xtest / np.sqrt(Xtest.shape[1])
S = 1 + self.X_scaled.dot(z.T)
sz = 1 + np.sum(z**2, axis=1)
sqrtEll2Psx = np.sqrt(ell2+self.sx)
sqrtEll2Psz = np.sqrt(ell2+sz)
K = S / np.outer(sqrtEll2Psx, sqrtEll2Psz)
return np.arcsin(K)
def match_matrix(event: Event):
"""Returns a numpy participation matrix for the qualification matches in this event, used for calculating OPR.
Each row in the matrix corresponds to a single alliance in a match, meaning that there will be two rows (one for
red, one for blue) per match. Each column represents a single team, ordered by team number. If a team participated
on a certain alliance, the value at that row and column would be 1, otherwise, it would be 0. For example, an
event with teams 1-7 that featured a match that pitted teams 1, 3, and 5 against 2, 4, and 6 would have a match
matrix that looks like this (sans labels):
#1 #2 #3 #4 #5 #6 #7
qm1_red 1 0 1 0 1 0 0
qm1_blue 0 1 0 1 0 1 0
"""
match_list = []
for match in filter(lambda match: match['comp_level'] == 'qm', event.matches):
matchRow = []
for team in event.teams:
matchRow.append(1 if team['key'] in match['alliances']['red']['teams'] else 0)
match_list.append(matchRow)
matchRow = []
for team in event.teams:
matchRow.append(1 if team['key'] in match['alliances']['blue']['teams'] else 0)
match_list.append(matchRow)
mat = numpy.array(match_list)
sum_matches = numpy.sum(mat, axis=0)
avg_team_matches = sum(sum_matches) / float(len(sum_matches))
return mat[:, numpy.apply_along_axis(numpy.count_nonzero, 0, mat) > avg_team_matches - 2]
def compute_angle(pt0, pt1, pt2):
"""
Given 3 points, compute the cosine of the angle from pt0
:type pt0: numpy.array
:type pt1: numpy.array
:type pt2: numpy.array
:return: cosine of angle
"""
a = pt0 - pt1
b = pt0 - pt2
return (np.sum(a * b)) / (np.linalg.norm(a) * np.linalg.norm(b))
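# A minimal usage sketch: the cosine of the angle at pt0 formed by pt1 and pt2.
import numpy as np

pt0 = np.array([0.0, 0.0])
pt1 = np.array([1.0, 0.0])
pt2 = np.array([0.0, 1.0])
print(compute_angle(pt0, pt1, pt2))   # ~0.0, i.e. a right angle at pt0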
def _zoning(image):
"""
It works better with DSIZE = 28
~0.9967 precision and recall
:param image:
:return: #pixels/area ratio of each zone (7x7) as feature vector
"""
zones = []
for i in range(0, 28, 7):
for j in range(0, 28, 7):
roi = image[i:i+7, j:j+7]
val = (np.sum(roi)/255) / 49.
zones.append(val)
return np.array(zones, np.float32)
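# A quick sanity check of the zoning feature, assuming _zoning above is callable as a
# plain function: an all-black 28x28 image yields 16 zeros and an all-white (255) image
# yields 16 ones, one value per 7x7 zone.
import numpy as np

blank = np.zeros((28, 28), np.uint8)
white = np.full((28, 28), 255, np.uint8)
print(_zoning(blank).sum(), _zoning(white).sum())   # 0.0 16.0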
def getTypeProblem (self, solution_filename):
''' Get the type of problem directly from the solution file (in case we do not have an info file)'''
if 'task' not in self.info.keys():
solution = np.array(data_converter.file_to_array(solution_filename))
target_num = solution.shape[1]
self.info['target_num']=target_num
if target_num == 1: # if we have only one column
solution = np.ravel(solution) # flatten
nbr_unique_values = len(np.unique(solution))
if nbr_unique_values < len(solution)/8:
# Classification
self.info['label_num'] = nbr_unique_values
if nbr_unique_values == 2:
self.info['task'] = 'binary.classification'
self.info['target_type'] = 'Binary'
else:
self.info['task'] = 'multiclass.classification'
self.info['target_type'] = 'Categorical'
else:
# Regression
self.info['label_num'] = 0
self.info['task'] = 'regression'
self.info['target_type'] = 'Numerical'
else:
# Multilabel or multiclass
self.info['label_num'] = target_num
self.info['target_type'] = 'Binary'
if any(item > 1 for item in map(np.sum,solution.astype(int))):
self.info['task'] = 'multilabel.classification'
else:
self.info['task'] = 'multiclass.classification'
return self.info['task']
def binarize_predictions(array, task='binary.classification'):
''' Turn predictions into decisions {0,1} by selecting the class with largest
score for multiclass problems and thresholding at 0.5 for other cases.'''
# add a very small random value as tie breaker (a bit bad because this changes the score every time)
# so to make sure we get the same result every time, we seed it
#eps = 1e-15
#np.random.seed(sum(array.shape))
#array = array + eps*np.random.rand(array.shape[0],array.shape[1])
bin_array = np.zeros(array.shape)
if (task != 'multiclass.classification') or (array.shape[1]==1):
bin_array[array>=0.5] = 1
else:
sample_num=array.shape[0]
for i in range(sample_num):
j = np.argmax(array[i,:])
bin_array[i,j] = 1
return bin_array
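# A small usage sketch of binarize_predictions: per-element 0.5 thresholding for
# binary/multilabel scores versus a per-row argmax for multiclass scores.
import numpy as np

scores = np.array([[0.2, 0.7, 0.1],
                   [0.6, 0.3, 0.9]])
print(binarize_predictions(scores, task='multilabel.classification'))
# [[0. 1. 0.]
#  [1. 0. 1.]]
print(binarize_predictions(scores, task='multiclass.classification'))
# [[0. 1. 0.]
#  [0. 0. 1.]]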
def acc_stat (solution, prediction):
''' Return accuracy statistics TN, FP, TP, FN
Assumes that solution and prediction are binary 0/1 vectors.'''
# This uses floats so the results are floats
TN = sum(np.multiply((1-solution), (1-prediction)))
FN = sum(np.multiply(solution, (1-prediction)))
TP = sum(np.multiply(solution, prediction))
FP = sum(np.multiply((1-solution), prediction))
#print "TN =",TN
#print "FP =",FP
#print "TP =",TP
#print "FN =",FN
return (TN, FP, TP, FN)
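# A minimal check of acc_stat on binary 0/1 vectors; the four counts add up to the
# number of samples.
import numpy as np

solution = np.array([1, 1, 0, 0, 1])
prediction = np.array([1, 0, 0, 1, 1])
print(acc_stat(solution, prediction))   # (TN, FP, TP, FN) = (1, 1, 2, 1)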
def pac_metric (solution, prediction, task='binary.classification'):
''' Probabilistic Accuracy based on log_loss metric.
We assume the solution is in {0, 1} and prediction in [0, 1].
Otherwise, run normalize_array.'''
debug_flag=False
[sample_num, label_num] = solution.shape
if label_num==1: task='binary.classification'
eps = 1e-15
the_log_loss = log_loss(solution, prediction, task)
# Compute the base log loss (using the prior probabilities)
pos_num = 1.* sum(solution) # float conversion!
frac_pos = pos_num / sample_num # prior proba of positive class
the_base_log_loss = prior_log_loss(frac_pos, task)
# Alternative computation of the same thing (slower)
# Should always return the same thing except in the multi-label case
# For which the analytic solution makes more sense
if debug_flag:
base_prediction = np.empty(prediction.shape)
for k in range(sample_num): base_prediction[k,:] = frac_pos
base_log_loss = log_loss(solution, base_prediction, task)
diff = np.array(abs(the_base_log_loss-base_log_loss))
if len(diff.shape)>0: diff=max(diff)
if(diff)>1e-10:
print('Arrggh {} != {}'.format(the_base_log_loss,base_log_loss))
# Exponentiate to turn into an accuracy-like score.
# In the multi-label case, we need to average AFTER taking the exp
# because it is an NL operation
pac = mvmean(np.exp(-the_log_loss))
base_pac = mvmean(np.exp(-the_base_log_loss))
# Normalize: 0 for random, 1 for perfect
score = (pac - base_pac) / sp.maximum(eps, (1 - base_pac))
return score
def auc_metric(solution, prediction, task='binary.classification'):
    ''' Normalized Area under ROC curve (AUC).
Return Gini index = 2*AUC-1 for binary classification problems.
    Should work for a vector of binary 0/1 (or -1/1) "solution" and any discriminant values
for the predictions. If solution and prediction are not vectors, the AUC
of the columns of the matrices are computed and averaged (with no weight).
The same for all classification problems (in fact it treats well only the
binary and multilabel classification problems).'''
#auc = metrics.roc_auc_score(solution, prediction, average=None)
# There is a bug in metrics.roc_auc_score: auc([1,0,0],[1e-10,0,0]) incorrect
label_num=solution.shape[1]
auc=np.empty(label_num)
for k in range(label_num):
r_ = tiedrank(prediction[:,k])
s_ = solution[:,k]
if sum(s_)==0: print('WARNING: no positive class example in class {}'.format(k+1))
npos = sum(s_==1)
nneg = sum(s_<1)
auc[k] = (sum(r_[s_==1]) - npos*(npos+1)/2) / (nneg*npos)
return 2*mvmean(auc)-1
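# A self-contained check of the rank-based AUC used above, assuming tiedrank behaves
# like scipy.stats.rankdata (average ranks for ties); it matches sklearn's roc_auc_score.
import numpy as np
from scipy.stats import rankdata
from sklearn.metrics import roc_auc_score

s = np.array([1, 0, 1, 0, 0])
p = np.array([0.9, 0.4, 0.6, 0.6, 0.1])
r_ = rankdata(p)
npos, nneg = np.sum(s == 1), np.sum(s < 1)
auc_rank = (np.sum(r_[s == 1]) - npos * (npos + 1) / 2) / (nneg * npos)
print(auc_rank, roc_auc_score(s, p))   # both ~0.9167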
### END CLASSIFICATION METRICS
# ======= Specialized scores ========
# We run all of them for all tasks even though they don't make sense for some tasks