def CAL_v(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    online = OnlineBase(name, label_p, label_n, oracle, n_features, ftype, error=.5)
    x, y = online.collect_pts(100, -1)
    i = 0
    q = online.get_n_query()

    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)

    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)
        cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
        grid.fit(x, y)
        h_ = grid.best_estimator_

        online_ = OnlineBase('', label_p, label_n, h_.predict, n_features, ftype, error=.1)
        x_, _ = online_.collect_pts(10, 200)
        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(oracle(x_))
        q += online_.get_n_query()

        pred_y = h_.predict(test_x)
        print len(x), q, sm.accuracy_score(test_y, pred_y)
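Every snippet on this page uses the legacy `sklearn.cross_validation` API, in which `StratifiedShuffleSplit(y, n_iter, ...)` takes the labels in its constructor and the splitter object itself is iterated. A minimal sketch of the same split under the `sklearn.model_selection` API introduced in 0.18 (this assumes a recent scikit-learn and is not part of the snippets themselves):

# Hedged sketch: modern equivalent of the legacy pattern above (assumes
# scikit-learn >= 0.18). The splitter no longer takes y at construction;
# instead, split(X, y) yields (train_idx, test_idx) index arrays.
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X = np.arange(40).reshape(20, 2)
y = np.array([0] * 10 + [1] * 10)
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
for train_idx, test_idx in sss.split(X, y):
    # class proportions are preserved in both index sets
    print(train_idx, test_idx)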
def grid_retrain_in_f(self, n_dim=500):
    # n_dim is the number of random Fourier features; note RBFSampler's first
    # positional argument is gamma, so n_components must be passed by keyword
    rbf_map = RBFSampler(n_components=n_dim, random_state=1)
    fourier_approx_svm = pipeline.Pipeline([("mapper", rbf_map),
                                            ("svm", LinearSVC())])

    # C_range = np.logspace(-5, 15, 21, base=2)
    # gamma_range = np.logspace(-15, 3, 19, base=2)
    # param_grid = dict(mapper__gamma=gamma_range, svm__C=C_range)
    # cv = StratifiedShuffleSplit(Y, n_iter=5, test_size=0.2, random_state=42)
    # grid = GridSearchCV(fourier_approx_svm, param_grid=param_grid, cv=cv)
    # grid.fit(X, Y)
    #
    # rbf_svc2 = grid.best_estimator_
    rbf_svc2 = fourier_approx_svm
    rbf_svc2.fit(self.X_ex, self.y_ex)

    self.set_clf2(rbf_svc2)
    return self.benchmark()
def grid_search(self):
    C_range = np.logspace(-5, 15, 21, base=2)
    param_grid = dict(C=C_range)
    cv = StratifiedShuffleSplit(self.y_ex, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVC(kernel='poly', max_iter=10000), param_grid=param_grid, cv=cv, n_jobs=1, verbose=0)
    logger.info('start grid search for Poly')
    grid.fit(self.X_ex, self.y_ex)
    logger.info('end grid search for Poly')

    scores = [x[1] for x in grid.grid_scores_]

    # final train
    clf = grid.best_estimator_
    pred_train = clf.predict(self.X_ex)
    pred_val = clf.predict(self.val_x)
    pred_test = clf.predict(self.test_x)

    r = Result(self.name + ' (X)', 'Poly', len(self.X_ex),
               sm.accuracy_score(self.y_ex, pred_train),
               sm.accuracy_score(self.val_y, pred_val),
               sm.accuracy_score(self.test_y, pred_test))
    return r
def test_stratified_shuffle_split_init():
    y = np.asarray([0, 1, 1, 1, 2, 2, 2])
    # Check that error is raised if there is a class with only one sample
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2)

    # Check that error is raised if the test set size is smaller than n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2)
    # Check that error is raised if the train set size is smaller than
    # n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2)

    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Check that errors are raised if there is not enough samples
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8)

    # Train size or test size too small
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, train_size=2)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, test_size=2)
def test_stratified_shuffle_split_iter():
    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
          np.array([-1] * 800 + [1] * 50)
          ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                          random_state=0)
        for train, test in sss:
            assert_array_equal(np.unique(y[train]), np.unique(y[test]))

            # Checks if folds keep classes proportions
            p_train = (np.bincount(np.unique(y[train], return_inverse=True)[1])
                       / float(len(y[train])))
            p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1])
                      / float(len(y[test])))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(y[train].size + y[test].size, y.size)
            assert_array_equal(np.intersect1d(train, test), [])
def split_indices_old(files, labels, test_size=0.1, random_state=RANDOM_STATE):
    names = get_names(files)
    labels = get_labels(names, per_patient=True)
    spl = cross_validation.StratifiedShuffleSplit(labels[:, 0],
                                                  test_size=test_size,
                                                  random_state=random_state,
                                                  n_iter=1)
    tr, te = next(iter(spl))
    tr = np.hstack([tr * 2, tr * 2 + 1])
    te = np.hstack([te * 2, te * 2 + 1])
    return tr, te
def split_indices(files, labels, label_file, test_size=0.1, random_state=RANDOM_STATE):
    # Necessary for training on the melanoma database, which does not use
    # per_patient labels.
    names = get_names(files)
    labels = get_labels(names, label_file=label_file, per_patient=False)
    spl = cross_validation.StratifiedShuffleSplit(labels,
                                                  test_size=test_size,
                                                  random_state=random_state,
                                                  n_iter=1)
    tr, te = next(iter(spl))
    return tr, te
def hot(X, y):
    C_range = np.logspace(-15, 15, 31, base=2.0)
    gamma_range = np.logspace(-15, 15, 31, base=2.0)
    # param_grid = dict(gamma=gamma_range, C=C_range)
    # cv = StratifiedShuffleSplit(y, n_iter=10, test_size=0.2, random_state=42)
    roc_auc_scorer = get_scorer("roc_auc")
    scores = []
    for C in C_range:
        for gamma in gamma_range:
            auc_scorer = []
            for train, test in KFold(n=len(X), n_folds=10, random_state=42):
                rbf_svc = svm.SVC(C=C, kernel='rbf', gamma=gamma, probability=True)
                X_train, y_train = X[train], y[train]
                X_test, y_test = X[test], y[test]
                rbf_clf = rbf_svc.fit(X_train, y_train)
                auc_scorer.append(roc_auc_scorer(rbf_clf, X_test, y_test))
            scores.append(np.mean(auc_scorer))
    # grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
    # grid.fit(X, y)
    # scores = [x[1] for x in grid.grid_scores_]
    scores = np.array(scores).reshape(len(C_range), len(gamma_range))
    print scores

    plt.figure(figsize=(15, 12))
    plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
               norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
    plt.xlabel('gamma')
    plt.ylabel('C')
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=90)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.title('AUC')
    plt.show()
def _train_val_split_indices(labels):
    split = StratifiedShuffleSplit(
        labels, n_iter=1, test_size=VAL_SIZE, random_state=42)
    indices_tr, indices_val = next(iter(split))
    _save_organized_data_info(
        split.classes, indices_tr, indices_val, multi_crop=False)
    _save_organized_data_info(
        split.classes, indices_tr, indices_val, multi_crop=True)
    return indices_tr, indices_val, split.classes
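A detail the helper above relies on: the legacy splitter object exposes the labels it stratified on as a `classes` attribute. A small sketch of that behavior (legacy API; data is illustrative):

import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit

labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
split = StratifiedShuffleSplit(labels, n_iter=1, test_size=3, random_state=42)
print(split.classes)  # [0 1 2] -- the unique labels the split stratifies on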
def cross_predict(feat, f_name, X=X, y=y):
    if os.name == 'nt':
        n_jobs = 1
    else:
        n_jobs = -1

    # classifiers
    # clf_1 = MultinomialNB(alpha=5)
    clf_2 = LinearSVC(C=0.02)

    # cross-validation (CV)
    # This cross-validation object is a merge of StratifiedKFold and ShuffleSplit,
    # which returns stratified randomized folds. The folds are made by preserving
    # the percentage of samples for each class.
    #
    # Note: like the ShuffleSplit strategy, stratified random splits do not guarantee
    # that all folds will be different, although this is still
    # very likely for sizeable datasets.
    #
    # Passing this cv to cross_val_predict will raise
    # ValueError: cross_val_predict only works for partitions
    #
    # With this cv the test folds can overlap, so the splits are not a
    # partition of the samples (see the standalone sketch after this function).
    # cv = cross_validation.StratifiedShuffleSplit(y, test_size=0.2, random_state=42)

    # This cross-validation object is a variation of KFold that returns stratified folds.
    # The folds are made by preserving the percentage of samples for each class.
    cv = cross_validation.StratifiedKFold(y, n_folds=5, random_state=42)

    model = Pipeline([('feat', feat), ('clf', clf_2)])

    t0 = time()
    y_pred = cross_validation.cross_val_predict(model, X=X, y=y, n_jobs=n_jobs, cv=cv)
    t = time() - t0

    print("=" * 20, f_name, "=" * 20)
    print("time cost: {}".format(t))
    # print("y_predict: {}".format(y_pred))
    print()
    print('confusion matrix:\n', confusion_matrix(y, y_pred))
    print()
    print('\t\taccuracy: {}'.format(accuracy_score(y, y_pred)))
    print()
    print("\t\tclassification report")
    print("-" * 52)
    print(classification_report(y, y_pred))
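A standalone sketch of the ValueError discussed in the comments above: cross_val_predict requires every sample to land in exactly one test fold (a partition), which the shuffle-based splitter does not guarantee (legacy API; data is illustrative):

import numpy as np
from sklearn import cross_validation
from sklearn.svm import LinearSVC

X = np.random.RandomState(0).rand(100, 5)
y = np.array([0, 1] * 50)
clf = LinearSVC(C=0.02)

# test folds from StratifiedShuffleSplit can overlap -> not a partition
bad_cv = cross_validation.StratifiedShuffleSplit(y, test_size=0.2, random_state=42)
try:
    cross_validation.cross_val_predict(clf, X=X, y=y, cv=bad_cv)
except ValueError as e:
    print(e)  # cross_val_predict only works for partitions

# StratifiedKFold is a true partition, so this succeeds
good_cv = cross_validation.StratifiedKFold(y, n_folds=5)
y_pred = cross_validation.cross_val_predict(clf, X=X, y=y, cv=good_cv)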
# features
# feature extraction (tfidf: baseline feature)
def do(self, n_pts):
    X, y = self.collect_pts(n_pts)
    print 'done collecting points'

    rbf_map = RBFSampler(n_components=n_pts, random_state=1)
    solver = HyperSolver(p=self.POS, n=self.NEG)
    rbf_solver = pipeline.Pipeline([("mapper", rbf_map),
                                    ("solver", solver)])

    gamma_range = np.logspace(-15, 6, 22, base=2)
    param_grid = dict(mapper__gamma=gamma_range)
    cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=1)
    grid = GridSearchCV(rbf_solver, param_grid=param_grid, cv=cv, n_jobs=8)
    grid.fit(X, y)

    scores = [x[1] for x in grid.grid_scores_]
    scores = np.array(scores).reshape(len(gamma_range))

    plt.figure(figsize=(8, 6))
    plt.plot(gamma_range, scores)
    plt.xlabel('gamma')
    plt.ylabel('score')
    plt.title('Validation accuracy (RTiX, %s)' % os.path.basename(self.name))
    plt.savefig(self.name + '-SLViF-grid-npts=%d.pdf' % n_pts)

    # final train
    g = grid.best_params_['mapper__gamma']
    print 'best parameters are g=%f' % g
    rbf_svc2 = grid.best_estimator_
    y_pred = rbf_svc2.predict(self.Xt)
    print 'SCORE: %f' % sm.accuracy_score(self.Yt, y_pred)
    return grid.best_score_, sm.accuracy_score(self.Yt, y_pred)
def grid_search(self):
    C_range = np.logspace(-5, 15, 21, base=2)
    param_grid = dict(C=C_range)
    cv = StratifiedShuffleSplit(self.y_ex, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(LinearSVC(dual=False, max_iter=10000), param_grid=param_grid,
                        cv=cv,
                        n_jobs=1, verbose=0)
    logger.info('start grid search for Linear')
    grid.fit(self.X_ex, self.y_ex)
    logger.info('end grid search for Linear')

    scores = [x[1] for x in grid.grid_scores_]

    # final train
    rbf_svc2 = grid.best_estimator_
    pred_train = rbf_svc2.predict(self.X_ex)
    pred_val = rbf_svc2.predict(self.val_x)
    pred_test = rbf_svc2.predict(self.test_x)

    r = Result(self.name + ' (X)', 'Linear', len(self.X_ex),
               sm.accuracy_score(self.y_ex, pred_train),
               sm.accuracy_score(self.val_y, pred_val),
               sm.accuracy_score(self.test_y, pred_test))
    return r
def balancedSplit(X, y, seed, test_sz=1000):
    stratSplit = StratifiedShuffleSplit(
        y, 1, test_size=test_sz, random_state=seed
    )
    for train_idx, test_idx in stratSplit:
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_test = X[test_idx]
        y_test = y[test_idx]
        break
    return X_train, y_train, X_test, y_test
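A hedged alternative to the hold-out pattern above: since scikit-learn 0.17, train_test_split accepts a stratify argument that does the same balanced split in one call (sketch under that version assumption; data is illustrative):

import numpy as np
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in >= 0.18

X = np.random.RandomState(0).rand(5000, 4)
y = np.array([0, 1] * 2500)
# stratify=y keeps the class proportions in both halves of the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1000, stratify=y, random_state=0)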
def getBalancedSample(y, seed, test_sz=1000):
    if y.shape[0] == test_sz:
        return np.arange(test_sz)
    else:
        stratSplit = StratifiedShuffleSplit(
            y, 1, test_size=test_sz, random_state=seed
        )
        for _, test_idx in stratSplit:
            idx = test_idx
            break
        return idx
def get_data():
    lmdb_env = lmdb.open('/home/lisha/school/caffe/examples/cifar10/cifar10_train_lmdb/')
    lmdb_txn = lmdb_env.begin()
    lmdb_cursor = lmdb_txn.cursor()
    datum = caffe.proto.caffe_pb2.Datum()
    x = []
    y = []
    for key, value in lmdb_cursor:
        datum.ParseFromString(value)
        label = datum.label
        data = caffe.io.datum_to_array(datum)
        x.append(data)
        y.append(label)
    x = np.array(x)
    y = np.array(y)

    map_size = int(1e12)
    # Don't need to shuffle 3 times
    sss = StratifiedShuffleSplit(y, 3, test_size=0.2, random_state=0)
    for train_index, test_index in sss:
        ind_train = train_index
        ind_test = test_index

    env = lmdb.open('/home/lisha/school/caffe/examples/cifar10/cifar10_evenval_lmdb2/', map_size=map_size)
    with env.begin(write=True) as txn:
        # txn is a Transaction object
        for i in range(10000):
            im_dat = caffe.io.array_to_datum(x[ind_test][i], y[ind_test][i])
            txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())
    # map_size = x.nbytes * 10
    del env

    env = lmdb.open('/home/lisha/school/caffe/examples/cifar10/cifar10_eventrain_lmdb2/', map_size=map_size)
    with env.begin(write=True) as txn:
        # txn is a Transaction object
        for i in range(40000):
            im_dat = caffe.io.array_to_datum(x[ind_train][i], y[ind_train][i])
            txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())
def make_train_val():
    print 'Loading Matlab data.'
    f = '/home/lisha/school/Projects/hyperband_nnet/hyperband2/mrbi/mnist_rotation_back_image_new/mnist_all_background_images_rotation_normalized_train_valid.amat'
    X, Y = get_data(f)
    N = Y.shape[0]
    map_size = X.nbytes * 2
    # if you want to shuffle your data
    # random.shuffle(N)
    sss = StratifiedShuffleSplit(Y, 3, test_size=2000, random_state=0)
    for train_index, test_index in sss:
        ind_train1 = train_index
        ind_val1 = test_index
    print len(ind_train1), len(ind_val1)

    env = lmdb.open('/home/lisha/school/Projects/hyperband_nnet/hyperband2/mrbi/mrbi_train', map_size=map_size * 5 / 6)
    with env.begin(write=True) as txn:
        # txn is a Transaction object
        for i in range(len(ind_train1)):
            im_dat = caffe.io.array_to_datum(X[ind_train1[i]], Y[ind_train1[i]])
            txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())

    env = lmdb.open('/home/lisha/school/Projects/hyperband_nnet/hyperband2/mrbi/mrbi_val', map_size=map_size / 6)
    with env.begin(write=True) as txn:
        # txn is a Transaction object
        for i in range(len(ind_val1)):
            im_dat = caffe.io.array_to_datum(X[ind_val1[i]], Y[ind_val1[i]])
            txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())
def train_test_split_shuffle(target, features, test_size=0.1):
    sss = StratifiedShuffleSplit(target, 1, test_size=test_size, random_state=0)
    for train_index, test_index in sss:
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = target[train_index], target[test_index]
    y_test = y_test.values
    y_train = y_train.values
    return X_train, y_train, X_test, y_test
def test_stratified_shuffle_split_overlap_train_test_bug():
    # See https://github.com/scikit-learn/scikit-learn/issues/6121 for
    # the original bug report
    labels = [0, 1, 2, 3] * 3 + [4, 5] * 5
    splits = cval.StratifiedShuffleSplit(labels, n_iter=1,
                                         test_size=0.5, random_state=0)
    train, test = next(iter(splits))
    assert_array_equal(np.intersect1d(train, test), [])
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predictions."
def CAL(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    online = OnlineBase(name, label_p, label_n, oracle, n_features, ftype, error=.5)
    q = online.get_n_query()

    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)

    x, y = online.collect_pts(100, -1)
    i = 0
    cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
    grid.fit(x, y)
    h_ = grid.best_estimator_

    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)
        online_ = OnlineBase('', label_p, label_n, h_.predict, n_features, ftype, error=.1)
        x_ = online_.collect_one_pair()
        if x_ is not None and len(x_) > 0:
            for _x in x_:
                x.append(_x)
                y.append(1)

            # tentatively label the new point +1 and refit
            cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
            grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
            grid.fit(x, y)
            h1 = grid.best_estimator_
            s1 = sm.accuracy_score(y, h1.predict(x))

            # tentatively label it -1 and refit
            y[-1] = -1
            cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
            grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
            grid.fit(x, y)
            h2 = grid.best_estimator_
            s2 = sm.accuracy_score(y, h2.predict(x))

            if s1 >= .99 and s2 >= .99:
                print 'branch 1'
                y[-1] = oracle(x_)[0]
            elif s1 >= .99 and s2 < .99:
                print 'branch 2'
                y[-1] = 1
            elif s1 < .99 and s2 >= .99:
                print 'branch 3'
                y[-1] = -1
            else:
                print 'branch 4: ', s1, s2
                del x[-1]
                del y[-1]
                continue

            if y[-1] == 1:
                h_ = h1
            else:
                h_ = h2

        q += online_.get_n_query()
        pred_y = h_.predict(test_x)
        print q, sm.accuracy_score(test_y, pred_y)
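The branch logic in the loop above amounts to a consistency test on the tentative label; a compact restatement of the rule (names here are illustrative, not from the snippet):

# Hedged sketch of the decision rule: s1/s2 are training accuracies with the
# new point tentatively labeled +1 / -1; query the oracle only when both
# labelings stay consistent, and discard the point when neither does.
def infer_label(s1, s2, oracle_label, threshold=.99):
    if s1 >= threshold and s2 >= threshold:
        return oracle_label  # both consistent: spend a real oracle query
    if s1 >= threshold:
        return 1             # only the +1 labeling is consistent
    if s2 >= threshold:
        return -1            # only the -1 labeling is consistent
    return None              # neither: drop the point (branch 4)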
def do(self):
    # get some initial points
    self.ex.collect_up_to_budget(self.budget_per_round * 2)
    x, y = self.ex.pts_near_b, self.ex.pts_near_b_labels
    if len(np.unique(y)) < 2:
        return 1, 1

    # gamma_range = np.logspace(-5, 1, 10, base=10)
    # param_grid = dict(gamma=gamma_range)
    try:
        # cv = StratifiedShuffleSplit(y, n_iter=5, test_size=.2)
        # grid = GridSearchCV(svm.SVC(C=1e5), param_grid=param_grid, cv=cv, n_jobs=-1)
        # grid.fit(x, y)
        # h_best = grid.best_estimator_
        raise ValueError
    except ValueError:
        h_best = svm.SVC(C=1e5)
        h_best.fit(x, y)

    for i in range(1, self.n_rounds - 1):
        online_ = OnlineBase('', +1, self.NEG, h_best.predict, self.n_features, 'uniform', error=.1)
        x_, _ = online_.collect_pts(self.budget_per_round, 50000)  # budget doesn't matter
        xx_ = None
        if x_ is None or len(x_) < self.budget_per_round:
            print('Run out of budget when getting x_')
            # guard against x_ being None before taking its length
            n_missing = self.budget_per_round - (len(x_) if x_ is not None else 0)
            xx_ = np.random.uniform(-1, 1, (n_missing, self.n_features))

        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(self.oracle(x_))

        if xx_ is not None:
            x.extend(xx_)
            y.extend(self.oracle(xx_))

        try:
            # cv = StratifiedShuffleSplit(y, n_iter=5, test_size=.2)
            # grid = GridSearchCV(svm.SVC(C=1e5), param_grid=param_grid, cv=cv, n_jobs=-1)
            # grid.fit(x, y)
            # h_best = grid.best_estimator_
            raise ValueError
        except ValueError:
            h_best = svm.SVC(C=1e5)
            h_best.fit(x, y)

    # h_best.fit(x, y)
    self.set_clf2(h_best)
    return self.benchmark()  # (ex.batch_predict, h_.predict, test_x, n_features)
def make_train_val():
    print 'Loading Matlab data.'
    f1 = scipy.io.loadmat('/home/lisha/school/Projects/hyperband_nnet/hyperband2/svhn/svhn_data/train_32x32.mat')
    f2 = scipy.io.loadmat('/home/lisha/school/Projects/hyperband_nnet/hyperband2/svhn/svhn_data/extra_32x32.mat')
    # names of the matlab variables:
    data_train = f1.get('X')
    labels_train = f1.get('y')
    data_extra = f2.get('X')
    labels_extra = f2.get('y')

    sss = StratifiedShuffleSplit(labels_train, 3, test_size=0.05460229056, random_state=0)
    for train_index, test_index in sss:
        ind_train1 = train_index
        ind_val1 = test_index
    sss = StratifiedShuffleSplit(labels_extra, 3, test_size=0.00376554936, random_state=1)
    for train_index, test_index in sss:
        ind_train2 = train_index
        ind_val2 = test_index
    print 'val: ' + str(len(ind_val1) + len(ind_val2)) + ' train: ' + str(len(ind_train1) + len(ind_train2))

    Y1 = np.array(labels_train, dtype=int)
    Y1[Y1 == 10] = 0
    Y1 = Y1.flatten()
    Y2 = np.array(labels_extra, dtype=int)
    Y2[Y2 == 10] = 0
    Y2 = Y2.flatten()
    X1 = np.array(data_train)
    X1 = np.rollaxis(X1, 3, 0)
    X2 = np.array(data_extra)
    X2 = np.rollaxis(X2, 3, 0)

    map_size_train = X2.nbytes * 4
    map_size_val = X1.nbytes * 2
    # if you want to shuffle your data
    # random.shuffle(N)
    env = lmdb.open('/home/lisha/school/Projects/hyperband_nnet/hyperband2/svhn/svhn_val', map_size=map_size_val)
    with env.begin(write=True) as txn:
        # txn is a Transaction object
        for i in range(len(ind_val1)):
            im_dat = caffe.io.array_to_datum(np.rollaxis(X1[ind_val1[i]], 2, 0), Y1[ind_val1[i]])
            txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())
        for i in range(len(ind_val2)):
            im_dat = caffe.io.array_to_datum(np.rollaxis(X2[ind_val2[i]], 2, 0), Y2[ind_val2[i]])
            txn.put('{:0>10d}'.format(len(ind_val1) + i), im_dat.SerializeToString())

    env = lmdb.open('/home/lisha/school/Projects/hyperband_nnet/hyperband2/svhn/svhn_train', map_size=map_size_train)
    with env.begin(write=True) as txn:
        # txn is a Transaction object
        for i in range(len(ind_train1)):
            im_dat = caffe.io.array_to_datum(np.rollaxis(X1[ind_train1[i]], 2, 0), Y1[ind_train1[i]])
            txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())
        for i in range(len(ind_train2)):
            im_dat = caffe.io.array_to_datum(np.rollaxis(X2[ind_train2[i]], 2, 0), Y2[ind_train2[i]])
            txn.put('{:0>10d}'.format(len(ind_train1) + i), im_dat.SerializeToString())
def test_stratified_shuffle_split_even():
    # Test the StratifiedShuffleSplit, indices are drawn with an
    # equal chance
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for count in idx_counts:
            p = bf.pmf(count)
            assert_true(p > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter,
                                             test_size=1. / n_folds,
                                             random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits = 0
        for train, test in splits:
            n_splits += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for id in ids:
                    counter[id] += 1
        assert_equal(n_splits, n_iter)
        assert_equal(len(train), splits.n_train)
        assert_equal(len(test), splits.n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(splits.n_train + splits.n_test, len(labels))
        assert_equal(len(label_counts), 2)

        ex_test_p = float(splits.n_test) / n_samples
        ex_train_p = float(splits.n_train) / n_samples
        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p)