def CAL_v(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    online = OnlineBase(name, label_p, label_n, oracle, n_features, ftype, error=.5)
    x, y = online.collect_pts(100, -1)
    i = 0
    q = online.get_n_query()

    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)

    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)
        cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
        grid.fit(x, y)
        h_ = grid.best_estimator_

        online_ = OnlineBase('', label_p, label_n, h_.predict, n_features, ftype, error=.1)
        x_, _ = online_.collect_pts(10, 200)
        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(oracle(x_))
        q += online_.get_n_query()

        pred_y = h_.predict(test_x)
        print len(x), q, sm.accuracy_score(test_y, pred_y)
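Every snippet on this page uses the legacy `sklearn.cross_validation` API, in which `StratifiedShuffleSplit(y, n_iter, ...)` takes the labels in its constructor and the splitter object itself is iterated. A minimal sketch of the same split under the `sklearn.model_selection` API introduced in 0.18 (this assumes a recent scikit-learn and is not part of the snippets themselves):

# Hedged sketch: modern equivalent of the legacy pattern above (assumes
# scikit-learn >= 0.18). The splitter no longer takes y at construction;
# instead, split(X, y) yields (train_idx, test_idx) index arrays.
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X = np.arange(40).reshape(20, 2)
y = np.array([0] * 10 + [1] * 10)
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
for train_idx, test_idx in sss.split(X, y):
    # class proportions are preserved in both index sets
    print(train_idx, test_idx)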
def grid_retrain_in_f(self, n_dim=500):
    # n_dim is the number of random Fourier features; note RBFSampler's first
    # positional argument is gamma, so n_components must be passed by keyword
    rbf_map = RBFSampler(n_components=n_dim, random_state=1)
    fourier_approx_svm = pipeline.Pipeline([("mapper", rbf_map),
                                            ("svm", LinearSVC())])

    # C_range = np.logspace(-5, 15, 21, base=2)
    # gamma_range = np.logspace(-15, 3, 19, base=2)
    # param_grid = dict(mapper__gamma=gamma_range, svm__C=C_range)
    # cv = StratifiedShuffleSplit(Y, n_iter=5, test_size=0.2, random_state=42)
    # grid = GridSearchCV(fourier_approx_svm, param_grid=param_grid, cv=cv)
    # grid.fit(X, Y)
    #
    # rbf_svc2 = grid.best_estimator_
    rbf_svc2 = fourier_approx_svm
    rbf_svc2.fit(self.X_ex, self.y_ex)

    self.set_clf2(rbf_svc2)
    return self.benchmark()
def grid_search(self):
    C_range = np.logspace(-5, 15, 21, base=2)
    param_grid = dict(C=C_range)
    cv = StratifiedShuffleSplit(self.y_ex, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVC(kernel='poly', max_iter=10000), param_grid=param_grid, cv=cv, n_jobs=1, verbose=0)
    logger.info('start grid search for Poly')
    grid.fit(self.X_ex, self.y_ex)
    logger.info('end grid search for Poly')

    scores = [x[1] for x in grid.grid_scores_]

    # final train
    clf = grid.best_estimator_
    pred_train = clf.predict(self.X_ex)
    pred_val = clf.predict(self.val_x)
    pred_test = clf.predict(self.test_x)

    r = Result(self.name + ' (X)', 'Poly', len(self.X_ex),
               sm.accuracy_score(self.y_ex, pred_train),
               sm.accuracy_score(self.val_y, pred_val),
               sm.accuracy_score(self.test_y, pred_test))
    return r
def test_stratified_shuffle_split_init():
    y = np.asarray([0, 1, 1, 1, 2, 2, 2])
    # Check that error is raised if there is a class with only one sample
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2)

    # Check that error is raised if the test set size is smaller than n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2)
    # Check that error is raised if the train set size is smaller than
    # n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2)

    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Check that errors are raised if there is not enough samples
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8)

    # Train size or test size too small
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, train_size=2)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, test_size=2)
def test_stratified_shuffle_split_iter():
    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
          np.array([-1] * 800 + [1] * 50)
          ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                          random_state=0)
        for train, test in sss:
            assert_array_equal(np.unique(y[train]), np.unique(y[test]))

            # Checks if folds keep classes proportions
            p_train = (np.bincount(np.unique(y[train], return_inverse=True)[1])
                       / float(len(y[train])))
            p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1])
                      / float(len(y[test])))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(y[train].size + y[test].size, y.size)
            assert_array_equal(np.intersect1d(train, test), [])
def split_indices_old(files, labels, test_size=0.1, random_state=RANDOM_STATE):
    names = get_names(files)
    labels = get_labels(names, per_patient=True)
    spl = cross_validation.StratifiedShuffleSplit(labels[:, 0],
                                                  test_size=test_size,
                                                  random_state=random_state,
                                                  n_iter=1)
    tr, te = next(iter(spl))
    tr = np.hstack([tr * 2, tr * 2 + 1])
    te = np.hstack([te * 2, te * 2 + 1])
    return tr, te
def split_indices(files, labels, label_file, test_size=0.1, random_state=RANDOM_STATE):
    # Necessary for training on the melanoma database, which does not use
    # per_patient labels.
    names = get_names(files)
    labels = get_labels(names, label_file=label_file, per_patient=False)
    spl = cross_validation.StratifiedShuffleSplit(labels,
                                                  test_size=test_size,
                                                  random_state=random_state,
                                                  n_iter=1)
    tr, te = next(iter(spl))
    return tr, te
def hot(X, y):
    C_range = np.logspace(-15, 15, 31, base=2.0)
    gamma_range = np.logspace(-15, 15, 31, base=2.0)
    # param_grid = dict(gamma=gamma_range, C=C_range)
    # cv = StratifiedShuffleSplit(y, n_iter=10, test_size=0.2, random_state=42)
    roc_auc_scorer = get_scorer("roc_auc")
    scores = []
    for C in C_range:
        for gamma in gamma_range:
            auc_scorer = []
            for train, test in KFold(n=len(X), n_folds=10, random_state=42):
                rbf_svc = svm.SVC(C=C, kernel='rbf', gamma=gamma, probability=True)
                X_train, y_train = X[train], y[train]
                X_test, y_test = X[test], y[test]
                rbf_clf = rbf_svc.fit(X_train, y_train)
                auc_scorer.append(roc_auc_scorer(rbf_clf, X_test, y_test))
            scores.append(np.mean(auc_scorer))
    # grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
    # grid.fit(X, y)
    # scores = [x[1] for x in grid.grid_scores_]
    scores = np.array(scores).reshape(len(C_range), len(gamma_range))
    print scores

    plt.figure(figsize=(15, 12))
    plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
               norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
    plt.xlabel('gamma')
    plt.ylabel('C')
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=90)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.title('AUC')
    plt.show()
def _train_val_split_indices(labels):
    split = StratifiedShuffleSplit(
        labels, n_iter=1, test_size=VAL_SIZE, random_state=42)
    indices_tr, indices_val = next(iter(split))
    _save_organized_data_info(
        split.classes, indices_tr, indices_val, multi_crop=False)
    _save_organized_data_info(
        split.classes, indices_tr, indices_val, multi_crop=True)
    return indices_tr, indices_val, split.classes
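A detail the helper above relies on: the legacy splitter object exposes the labels it stratified on as a `classes` attribute. A small sketch of that behavior (legacy API; data is illustrative):

import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit

labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
split = StratifiedShuffleSplit(labels, n_iter=1, test_size=3, random_state=42)
print(split.classes)  # [0 1 2] -- the unique labels the split stratifies on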
def cross_predict(feat, f_name, X=X, y=y):
    if os.name == 'nt':
        n_jobs = 1
    else:
        n_jobs = -1

    # classifiers
    # clf_1 = MultinomialNB(alpha=5)
    clf_2 = LinearSVC(C=0.02)

    # cross-validation (CV)
    # This cross-validation object is a merge of StratifiedKFold and ShuffleSplit,
    # which returns stratified randomized folds. The folds are made by preserving
    # the percentage of samples for each class.
    #
    # Note: like the ShuffleSplit strategy, stratified random splits do not guarantee
    # that all folds will be different, although this is still
    # very likely for sizeable datasets.
    #
    # Passing this cv to cross_val_predict will raise
    # ValueError: cross_val_predict only works for partitions
    #
    # With this cv the test folds can overlap, so the splits are not a
    # partition of the samples (see the standalone sketch after this function).
    # cv = cross_validation.StratifiedShuffleSplit(y, test_size=0.2, random_state=42)

    # This cross-validation object is a variation of KFold that returns stratified folds.
    # The folds are made by preserving the percentage of samples for each class.
    cv = cross_validation.StratifiedKFold(y, n_folds=5, random_state=42)

    model = Pipeline([('feat', feat), ('clf', clf_2)])

    t0 = time()
    y_pred = cross_validation.cross_val_predict(model, X=X, y=y, n_jobs=n_jobs, cv=cv)
    t = time() - t0

    print("=" * 20, f_name, "=" * 20)
    print("time cost: {}".format(t))
    # print("y_predict: {}".format(y_pred))
    print()
    print('confusion matrix:\n', confusion_matrix(y, y_pred))
    print()
    print('\t\taccuracy: {}'.format(accuracy_score(y, y_pred)))
    print()
    print("\t\tclassification report")
    print("-" * 52)
    print(classification_report(y, y_pred))
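A standalone sketch of the ValueError discussed in the comments above: cross_val_predict requires every sample to land in exactly one test fold (a partition), which the shuffle-based splitter does not guarantee (legacy API; data is illustrative):

import numpy as np
from sklearn import cross_validation
from sklearn.svm import LinearSVC

X = np.random.RandomState(0).rand(100, 5)
y = np.array([0, 1] * 50)
clf = LinearSVC(C=0.02)

# test folds from StratifiedShuffleSplit can overlap -> not a partition
bad_cv = cross_validation.StratifiedShuffleSplit(y, test_size=0.2, random_state=42)
try:
    cross_validation.cross_val_predict(clf, X=X, y=y, cv=bad_cv)
except ValueError as e:
    print(e)  # cross_val_predict only works for partitions

# StratifiedKFold is a true partition, so this succeeds
good_cv = cross_validation.StratifiedKFold(y, n_folds=5)
y_pred = cross_validation.cross_val_predict(clf, X=X, y=y, cv=good_cv)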
# features
# feature extraction (tfidf: baseline feature)
def do(self, n_pts):
    X, y = self.collect_pts(n_pts)
    print 'done collecting points'

    rbf_map = RBFSampler(n_components=n_pts, random_state=1)
    solver = HyperSolver(p=self.POS, n=self.NEG)
    rbf_solver = pipeline.Pipeline([("mapper", rbf_map),
                                    ("solver", solver)])

    gamma_range = np.logspace(-15, 6, 22, base=2)
    param_grid = dict(mapper__gamma=gamma_range)
    cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=1)
    grid = GridSearchCV(rbf_solver, param_grid=param_grid, cv=cv, n_jobs=8)
    grid.fit(X, y)

    scores = [x[1] for x in grid.grid_scores_]
    scores = np.array(scores).reshape(len(gamma_range))

    plt.figure(figsize=(8, 6))
    plt.plot(gamma_range, scores)
    plt.xlabel('gamma')
    plt.ylabel('score')
    plt.title('Validation accuracy (RTiX, %s)' % os.path.basename(self.name))
    plt.savefig(self.name + '-SLViF-grid-npts=%d.pdf' % n_pts)

    # final train
    g = grid.best_params_['mapper__gamma']
    print 'best parameters are g=%f' % g
    rbf_svc2 = grid.best_estimator_
    y_pred = rbf_svc2.predict(self.Xt)
    print 'SCORE: %f' % sm.accuracy_score(self.Yt, y_pred)
    return grid.best_score_, sm.accuracy_score(self.Yt, y_pred)
def grid_search(self):
    C_range = np.logspace(-5, 15, 21, base=2)
    param_grid = dict(C=C_range)
    cv = StratifiedShuffleSplit(self.y_ex, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(LinearSVC(dual=False, max_iter=10000), param_grid=param_grid,
                        cv=cv,
                        n_jobs=1, verbose=0)
    logger.info('start grid search for Linear')
    grid.fit(self.X_ex, self.y_ex)
    logger.info('end grid search for Linear')

    scores = [x[1] for x in grid.grid_scores_]

    # final train
    rbf_svc2 = grid.best_estimator_
    pred_train = rbf_svc2.predict(self.X_ex)
    pred_val = rbf_svc2.predict(self.val_x)
    pred_test = rbf_svc2.predict(self.test_x)

    r = Result(self.name + ' (X)', 'Linear', len(self.X_ex),
               sm.accuracy_score(self.y_ex, pred_train),
               sm.accuracy_score(self.val_y, pred_val),
               sm.accuracy_score(self.test_y, pred_test))
    return r
def balancedSplit(X, y, seed, test_sz=1000):
    stratSplit = StratifiedShuffleSplit(
        y, 1, test_size=test_sz, random_state=seed
    )
    for train_idx, test_idx in stratSplit:
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_test = X[test_idx]
        y_test = y[test_idx]
        break
    return X_train, y_train, X_test, y_test
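A hedged alternative to the hold-out pattern above: since scikit-learn 0.17, train_test_split accepts a stratify argument that does the same balanced split in one call (sketch under that version assumption; data is illustrative):

import numpy as np
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in >= 0.18

X = np.random.RandomState(0).rand(5000, 4)
y = np.array([0, 1] * 2500)
# stratify=y keeps the class proportions in both halves of the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1000, stratify=y, random_state=0)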
def getBalancedSample(y, seed, test_sz=1000):
    if y.shape[0] == test_sz:
        return np.arange(test_sz)
    else:
        stratSplit = StratifiedShuffleSplit(
            y, 1, test_size=test_sz, random_state=seed
        )
        for _, test_idx in stratSplit:
            idx = test_idx
            break
        return idx
def get_data():
    lmdb_env = lmdb.open('/home/lisha/school/caffe/examples/cifar10/cifar10_train_lmdb/')
    lmdb_txn = lmdb_env.begin()
    lmdb_cursor = lmdb_txn.cursor()
    datum = caffe.proto.caffe_pb2.Datum()
    x = []
    y = []
    for key, value in lmdb_cursor:
        datum.ParseFromString(value)
        label = datum.label
        data = caffe.io.datum_to_array(datum)
        x.append(data)
        y.append(label)
    x = np.array(x)
    y = np.array(y)

    map_size = int(1e12)
    # Don't need to shuffle 3 times
    sss = StratifiedShuffleSplit(y, 3, test_size=0.2, random_state=0)
    for train_index, test_index in sss:
        ind_train = train_index
        ind_test = test_index

    env = lmdb.open('/home/lisha/school/caffe/examples/cifar10/cifar10_evenval_lmdb2/', map_size=map_size)
    with env.begin(write=True) as txn:
        # txn is a Transaction object
        for i in range(10000):
            im_dat = caffe.io.array_to_datum(x[ind_test][i], y[ind_test][i])
            txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())
    # map_size = x.nbytes * 10
    del env

    env = lmdb.open('/home/lisha/school/caffe/examples/cifar10/cifar10_eventrain_lmdb2/', map_size=map_size)
    with env.begin(write=True) as txn:
        # txn is a Transaction object
        for i in range(40000):
            im_dat = caffe.io.array_to_datum(x[ind_train][i], y[ind_train][i])
            txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())
def make_train_val():
    print 'Loading Matlab data.'
    f = '/home/lisha/school/Projects/hyperband_nnet/hyperband2/mrbi/mnist_rotation_back_image_new/mnist_all_background_images_rotation_normalized_train_valid.amat'
    X, Y = get_data(f)
    N = Y.shape[0]
    map_size = X.nbytes * 2
    # if you want to shuffle your data
    # random.shuffle(N)
    sss = StratifiedShuffleSplit(Y, 3, test_size=2000, random_state=0)
    for train_index, test_index in sss:
        ind_train1 = train_index
        ind_val1 = test_index
    print len(ind_train1), len(ind_val1)

    env = lmdb.open('/home/lisha/school/Projects/hyperband_nnet/hyperband2/mrbi/mrbi_train', map_size=map_size * 5 / 6)
    with env.begin(write=True) as txn:
        # txn is a Transaction object
        for i in range(len(ind_train1)):
            im_dat = caffe.io.array_to_datum(X[ind_train1[i]], Y[ind_train1[i]])
            txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())

    env = lmdb.open('/home/lisha/school/Projects/hyperband_nnet/hyperband2/mrbi/mrbi_val', map_size=map_size / 6)
    with env.begin(write=True) as txn:
        # txn is a Transaction object
        for i in range(len(ind_val1)):
            im_dat = caffe.io.array_to_datum(X[ind_val1[i]], Y[ind_val1[i]])
            txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())
def train_test_split_shuffle(target, features, test_size=0.1):
    sss = StratifiedShuffleSplit(target, 1, test_size=test_size, random_state=0)
    for train_index, test_index in sss:
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = target[train_index], target[test_index]
    y_test = y_test.values
    y_train = y_train.values
    return X_train, y_train, X_test, y_test
def test_stratified_shuffle_split_overlap_train_test_bug():
    # See https://github.com/scikit-learn/scikit-learn/issues/6121 for
    # the original bug report
    labels = [0, 1, 2, 3] * 3 + [4, 5] * 5
    splits = cval.StratifiedShuffleSplit(labels, n_iter=1,
                                         test_size=0.5, random_state=0)
    train, test = next(iter(splits))
    assert_array_equal(np.intersect1d(train, test), [])
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predictions."
def CAL(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    online = OnlineBase(name, label_p, label_n, oracle, n_features, ftype, error=.5)
    q = online.get_n_query()

    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)

    x, y = online.collect_pts(100, -1)
    i = 0
    cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
    grid.fit(x, y)
    h_ = grid.best_estimator_

    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)
        online_ = OnlineBase('', label_p, label_n, h_.predict, n_features, ftype, error=.1)
        x_ = online_.collect_one_pair()
        if x_ is not None and len(x_) > 0:
            for _x in x_:
                x.append(_x)
                y.append(1)

            # tentatively label the new point +1 and refit
            cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
            grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
            grid.fit(x, y)
            h1 = grid.best_estimator_
            s1 = sm.accuracy_score(y, h1.predict(x))

            # tentatively label it -1 and refit
            y[-1] = -1
            cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
            grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
            grid.fit(x, y)
            h2 = grid.best_estimator_
            s2 = sm.accuracy_score(y, h2.predict(x))

            if s1 >= .99 and s2 >= .99:
                print 'branch 1'
                y[-1] = oracle(x_)[0]
            elif s1 >= .99 and s2 < .99:
                print 'branch 2'
                y[-1] = 1
            elif s1 < .99 and s2 >= .99:
                print 'branch 3'
                y[-1] = -1
            else:
                print 'branch 4: ', s1, s2
                del x[-1]
                del y[-1]
                continue

            if y[-1] == 1:
                h_ = h1
            else:
                h_ = h2

        q += online_.get_n_query()
        pred_y = h_.predict(test_x)
        print q, sm.accuracy_score(test_y, pred_y)
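The branch logic in the loop above amounts to a consistency test on the tentative label; a compact restatement of the rule (names here are illustrative, not from the snippet):

# Hedged sketch of the decision rule: s1/s2 are training accuracies with the
# new point tentatively labeled +1 / -1; query the oracle only when both
# labelings stay consistent, and discard the point when neither does.
def infer_label(s1, s2, oracle_label, threshold=.99):
    if s1 >= threshold and s2 >= threshold:
        return oracle_label  # both consistent: spend a real oracle query
    if s1 >= threshold:
        return 1             # only the +1 labeling is consistent
    if s2 >= threshold:
        return -1            # only the -1 labeling is consistent
    return None              # neither: drop the point (branch 4)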
def do(self):
    # get some initial points
    self.ex.collect_up_to_budget(self.budget_per_round * 2)
    x, y = self.ex.pts_near_b, self.ex.pts_near_b_labels
    if len(np.unique(y)) < 2:
        return 1, 1

    # gamma_range = np.logspace(-5, 1, 10, base=10)
    # param_grid = dict(gamma=gamma_range)
    try:
        # cv = StratifiedShuffleSplit(y, n_iter=5, test_size=.2)
        # grid = GridSearchCV(svm.SVC(C=1e5), param_grid=param_grid, cv=cv, n_jobs=-1)
        # grid.fit(x, y)
        # h_best = grid.best_estimator_
        raise ValueError
    except ValueError:
        h_best = svm.SVC(C=1e5)
        h_best.fit(x, y)

    for i in range(1, self.n_rounds - 1):
        online_ = OnlineBase('', +1, self.NEG, h_best.predict, self.n_features, 'uniform', error=.1)
        x_, _ = online_.collect_pts(self.budget_per_round, 50000)  # budget doesn't matter
        xx_ = None
        if x_ is None or len(x_) < self.budget_per_round:
            print('Run out of budget when getting x_')
            # guard against x_ being None before taking its length
            n_missing = self.budget_per_round - (len(x_) if x_ is not None else 0)
            xx_ = np.random.uniform(-1, 1, (n_missing, self.n_features))

        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(self.oracle(x_))

        if xx_ is not None:
            x.extend(xx_)
            y.extend(self.oracle(xx_))

        try:
            # cv = StratifiedShuffleSplit(y, n_iter=5, test_size=.2)
            # grid = GridSearchCV(svm.SVC(C=1e5), param_grid=param_grid, cv=cv, n_jobs=-1)
            # grid.fit(x, y)
            # h_best = grid.best_estimator_
            raise ValueError
        except ValueError:
            h_best = svm.SVC(C=1e5)
            h_best.fit(x, y)

    # h_best.fit(x, y)
    self.set_clf2(h_best)
    return self.benchmark()  # (ex.batch_predict, h_.predict, test_x, n_features)
def make_train_val():
    print 'Loading Matlab data.'
    f1 = scipy.io.loadmat('/home/lisha/school/Projects/hyperband_nnet/hyperband2/svhn/svhn_data/train_32x32.mat')
    f2 = scipy.io.loadmat('/home/lisha/school/Projects/hyperband_nnet/hyperband2/svhn/svhn_data/extra_32x32.mat')
    # names of the matlab variables:
    data_train = f1.get('X')
    labels_train = f1.get('y')
    data_extra = f2.get('X')
    labels_extra = f2.get('y')

    sss = StratifiedShuffleSplit(labels_train, 3, test_size=0.05460229056, random_state=0)
    for train_index, test_index in sss:
        ind_train1 = train_index
        ind_val1 = test_index
    sss = StratifiedShuffleSplit(labels_extra, 3, test_size=0.00376554936, random_state=1)
    for train_index, test_index in sss:
        ind_train2 = train_index
        ind_val2 = test_index
    print 'val: ' + str(len(ind_val1) + len(ind_val2)) + ' train: ' + str(len(ind_train1) + len(ind_train2))

    Y1 = np.array(labels_train, dtype=int)
    Y1[Y1 == 10] = 0
    Y1 = Y1.flatten()
    Y2 = np.array(labels_extra, dtype=int)
    Y2[Y2 == 10] = 0
    Y2 = Y2.flatten()
    X1 = np.array(data_train)
    X1 = np.rollaxis(X1, 3, 0)
    X2 = np.array(data_extra)
    X2 = np.rollaxis(X2, 3, 0)

    map_size_train = X2.nbytes * 4
    map_size_val = X1.nbytes * 2
    # if you want to shuffle your data
    # random.shuffle(N)
    env = lmdb.open('/home/lisha/school/Projects/hyperband_nnet/hyperband2/svhn/svhn_val', map_size=map_size_val)
    with env.begin(write=True) as txn:
        # txn is a Transaction object
        for i in range(len(ind_val1)):
            im_dat = caffe.io.array_to_datum(np.rollaxis(X1[ind_val1[i]], 2, 0), Y1[ind_val1[i]])
            txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())
        for i in range(len(ind_val2)):
            im_dat = caffe.io.array_to_datum(np.rollaxis(X2[ind_val2[i]], 2, 0), Y2[ind_val2[i]])
            txn.put('{:0>10d}'.format(len(ind_val1) + i), im_dat.SerializeToString())

    env = lmdb.open('/home/lisha/school/Projects/hyperband_nnet/hyperband2/svhn/svhn_train', map_size=map_size_train)
    with env.begin(write=True) as txn:
        # txn is a Transaction object
        for i in range(len(ind_train1)):
            im_dat = caffe.io.array_to_datum(np.rollaxis(X1[ind_train1[i]], 2, 0), Y1[ind_train1[i]])
            txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())
        for i in range(len(ind_train2)):
            im_dat = caffe.io.array_to_datum(np.rollaxis(X2[ind_train2[i]], 2, 0), Y2[ind_train2[i]])
            txn.put('{:0>10d}'.format(len(ind_train1) + i), im_dat.SerializeToString())
def test_stratified_shuffle_split_even():
    # Test the StratifiedShuffleSplit, indices are drawn with an
    # equal chance
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for count in idx_counts:
            p = bf.pmf(count)
            assert_true(p > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter,
                                             test_size=1. / n_folds,
                                             random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits = 0
        for train, test in splits:
            n_splits += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for id in ids:
                    counter[id] += 1
        assert_equal(n_splits, n_iter)
        assert_equal(len(train), splits.n_train)
        assert_equal(len(test), splits.n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(splits.n_train + splits.n_test, len(labels))
        assert_equal(len(label_counts), 2)

        ex_test_p = float(splits.n_test) / n_samples
        ex_train_p = float(splits.n_train) / n_samples
        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p)