Python: example source code using the StratifiedKFold() class
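
StratifiedKFold splits a dataset into k folds while preserving each class's proportion in every fold, which matters for imbalanced labels. The snippets below are collected from open-source projects and show the class used for out-of-fold prediction, probability calibration, grid search, and unit tests. As a baseline, here is a minimal sketch of the modern scikit-learn (>= 0.18) API that most of these snippets build on; the toy X and y are made up for illustration:

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.random.rand(10, 2)                      # 10 samples, 2 features
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])  # balanced binary labels

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    # every test fold contains exactly one sample of each class
    print(fold, y[test_idx])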

Source file: calibration_utils.py (project: introspective, author: numeristical)
def train_and_calibrate_cv(model, X_tr, y_tr, cv=5):
    y_pred_xval = np.zeros(len(y_tr))
    # NOTE: this uses the pre-0.18 scikit-learn cross_validation API; the modern
    # equivalent is model_selection.StratifiedKFold(n_splits=cv, shuffle=True)
    skf = cross_validation.StratifiedKFold(y_tr, n_folds=cv, shuffle=True)
    i = 0
    for train, test in skf:
        i = i+1
        print("training fold {} of {}".format(i, cv))
        X_train_xval = np.array(X_tr)[train,:]
        X_test_xval = np.array(X_tr)[test,:]
        y_train_xval = np.array(y_tr)[train]
        # We could also copy the model first and then fit it
        model_copy = clone(model)
        model_copy.fit(X_train_xval,y_train_xval)
        # predict with the fitted fold copy, not the never-fitted original model
        y_pred_xval[test] = model_copy.predict_proba(X_test_xval)[:, 1]
    print("training full model")
    model_copy = clone(model)
    model_copy.fit(X_tr,y_tr)
    print("calibrating function")
    calib_func = prob_calibration_function(y_tr, y_pred_xval)
    return model_copy, calib_func
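
The snippet above targets the pre-0.18 cross_validation module. Below is a hedged sketch of the same out-of-fold loop against the modern model_selection API; prob_calibration_function is the project's own helper, so this sketch stops at the raw out-of-fold predictions:

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

def train_and_oof_predict(model, X_tr, y_tr, cv=5):
    X_arr, y_arr = np.asarray(X_tr), np.asarray(y_tr)
    y_pred_xval = np.zeros(len(y_arr))
    skf = StratifiedKFold(n_splits=cv, shuffle=True)
    for i, (train, test) in enumerate(skf.split(X_arr, y_arr), start=1):
        print("training fold {} of {}".format(i, cv))
        model_copy = clone(model)                    # fresh copy per fold
        model_copy.fit(X_arr[train], y_arr[train])
        y_pred_xval[test] = model_copy.predict_proba(X_arr[test])[:, 1]
    model_full = clone(model).fit(X_arr, y_arr)      # refit on all data
    return model_full, y_pred_xval                   # feed y_pred_xval to calibration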
Source file: predict_2017_06_16_3.py (project: mlbootcamp_5, author: ivan-filonov)
def rf1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 300
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    for n, (itrain, ival) in enumerate(skf.split(train2, y)):
        print('step %d of %d'%(n+1, skf.n_splits), now())
        clf = ensemble.RandomForestRegressor(n_estimators=1000,
                                             max_depth=3,
                                             random_state=13)
        clf.fit(train2[itrain], y[itrain])

        p = clf.predict(train2[ival])
        v.loc[ival, cname] += p
        score = metrics.log_loss(y[ival], p)
        z[cname]  += np.log1p(clf.predict(test2))
        print(cname, 'step %d: score'%(n+1), score, now())
        scores.append(score)

    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits
Source file: personal_db.py (project: kaggle-review, author: daxiongshu)
def get_split(self):
        if self.split is not None:
            return
        name = "{}/split.p".format(self.flags.data_path)
        split = load_pickle(None,name,[])

        if len(split) == 0:
            #data = self.data["training_variants"].append(self.data["test_variants_filter"])
            data = self.data["training_variants"]
            y = data['Class']-1
            X = np.arange(y.shape[0])
            from sklearn.model_selection import StratifiedKFold
            skf = StratifiedKFold(n_splits=self.flags.folds,shuffle=True,random_state=99)
            split = [(train_index, test_index) for train_index, test_index in skf.split(X, y)]
            save_pickle(split,name)
            print("new shuffle")
        self.split = split
        #print("split va",split[0][1][:10])
Source file: inbreast.py (project: deep-mil-for-whole-mammogram-classification, author: wentaozhu)
def cvsplit(fold, totalfold, mydict):
  '''Get the train/test split of one fold.
  fold is the index of the returned fold, from 0 to totalfold-1;
  totalfold is the number of cross-validation folds;
  mydict is the dict returned by readlabel.'''
  skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is false, okay!
  #readdicom(mydict)
  # materialize dict views as lists so they can be indexed and split (Python 3)
  y = list(mydict.values())
  x = list(mydict.keys())
  count = 0
  for train, test in skf.split(x,y):
    print(len(train), len(test))
    if count == fold:
      #print test
      return train, test
    count += 1
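
A hedged usage sketch of cvsplit; the mydict literal is a made-up stand-in for what readlabel would return (filename -> label):

mydict = {'img%02d' % i: i % 2 for i in range(10)}  # hypothetical 10 files, 2 classes
train_idx, test_idx = cvsplit(fold=0, totalfold=5, mydict=mydict)
print(len(train_idx), len(test_idx))                # 8 2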
Source file: task.py (project: stacker, author: bamine)
def __init__(self, name, X, y, task, test_size=None, cv=None, random_state=42):
        self.name = name
        self.X = X
        self.y = y
        self.task = task
        self.random_state = random_state
        if test_size is not None:
            self.test_size = test_size
            self.validation_method = "train_test_split"
            self.X_train, self.X_test, self.y_train, self.y_test = \
                model_selection.train_test_split(self.X, self.y, test_size=test_size, random_state=random_state)
        elif cv is not None:
            self.validation_method = "cv"
            if task == "regression":
                # shuffle=True is required for random_state to take effect
                # (modern scikit-learn raises otherwise)
                self.kfold = model_selection.KFold(n_splits=cv, shuffle=True, random_state=random_state)
            elif task == "classification":
                self.kfold = model_selection.StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
Source file: mvpa_voxelselector.py (project: brainiak, author: brainiak)
def _sfn(l, mask, myrad, bcast_var):
    """Score classifier on searchlight data using cross-validation.

    The classifier is in `bcast_var[2]`. The labels are in `bcast_var[0]`. The
    number of cross-validation folds is in `bcast_var[1]`.
    """
    clf = bcast_var[2]
    data = l[0][mask, :].T
    # print(l[0].shape, mask.shape, data.shape)
    skf = model_selection.StratifiedKFold(n_splits=bcast_var[1],
                                          shuffle=False)
    accuracy = np.mean(model_selection.cross_val_score(clf, data,
                                                       y=bcast_var[0],
                                                       cv=skf,
                                                       n_jobs=1))
    return accuracy
Source file: classification.py (project: brainiak, author: brainiak)
def example_of_cross_validation_using_model_selection(raw_data, labels, num_subjects, num_epochs_per_subj):
    # NOTE: this method does not work for sklearn.svm.SVC with precomputed kernel
    # when the kernel matrix is computed in portions; also, this method only works
    # for self-correlation, i.e. correlation between the same data matrix.

    # no shrinking, set C=1
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    #logit_clf = LogisticRegression()
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    # doing leave-one-subject-out cross validation
    # no shuffling in cv
    skf = model_selection.StratifiedKFold(n_splits=num_subjects,
                                          shuffle=False)
    scores = model_selection.cross_val_score(clf, list(zip(raw_data, raw_data)),
                                             y=labels,
                                             cv=skf)
    print(scores)
    logger.info(
        'the overall cross validation accuracy is %.2f' %
        np.mean(scores)
    )
Source file: Classifier.py (project: SecuML, author: ANSSI-FR)
def setBestParameters(self):
        cv = StratifiedKFold(n_splits = self.conf.num_folds)
        param_grid = self.conf.getParamGrid()
        if param_grid is None:
            # No parameter value to select
            return
        if self.conf.families_supervision:
            scoring = 'f1_macro'
        else:
            scoring = 'roc_auc'
        # NOTE: fit_params in the GridSearchCV constructor was removed in
        # scikit-learn 0.21; modern versions take it in grid_search.fit()
        grid_search = GridSearchCV(self.pipeline, param_grid = param_grid,
                scoring = scoring,
                cv = cv,
                n_jobs = -1,
                fit_params = {'model__sample_weight': self.datasets.sample_weight})
        grid_search.fit(self.datasets.train_instances.getFeatures(),
                self.getSupervision(self.datasets.train_instances))
        self.conf.setBestValues(grid_search)
        self.pipeline.set_params(**self.conf.getBestValues())
        return cv
Source file: tests.py (project: scikit-mdr, author: EpistasisLab)
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    features = np.array([[2,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [1,    1],
                         [1,    1]])

    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True))
    assert np.mean(cv_scores) > 0.
Source file: tests.py (project: scikit-mdr, author: EpistasisLab)
def test_mdr_sklearn_pipeline_parallel():
    """Ensure that MDR can be used as a transformer in a parallelized scikit-learn pipeline"""
    features = np.array([[2,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [1,    1],
                         [1,    1]])

    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True), n_jobs=-1)
    assert np.mean(cv_scores) > 0.
Source file: dataset.py (project: heamy, author: rushter)
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
        """K-Folds cross validation iterator.

        Parameters
        ----------
        k : int, default 5
        stratify : bool, default False
        shuffle : bool, default True
        seed : int, default 33

        Yields
        -------
        X_train, y_train, X_test, y_test, train_index, test_index
        """
        if stratify:
            kf = StratifiedKFold(n_splits=k, random_state=seed, shuffle=shuffle)
        else:
            kf = KFold(n_splits=k, random_state=seed, shuffle=shuffle)

        for train_index, test_index in kf.split(self.X_train, self.y_train):
            X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
            X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
            yield X_train, y_train, X_test, y_test, train_index, test_index
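
A hedged sketch of consuming this generator, assuming ds is a dataset object exposing the kfold method above and model is any scikit-learn estimator:

scores = []
for X_train, y_train, X_test, y_test, train_index, test_index in ds.kfold(k=5, stratify=True):
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
print(sum(scores) / len(scores))  # mean accuracy over the 5 stratified folds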
Source file: nGramClassifier.py (project: flexmatcher, author: biggorilla-gh)
def predict_training(self, folds=5):
        """Do cross-validation and return probabilities for each data-point.

        Args:
            folds (int): Number of folds used for prediction on training data.
        """
        partial_clf = linear_model.LogisticRegression(class_weight='balanced')
        prediction = np.zeros((len(self.features), self.num_classes))
        skf = StratifiedKFold(n_splits=folds)
        for train_index, test_index in skf.split(self.features, self.labels):
            # prepare the training and test data
            training_features = self.features[train_index]
            test_features = self.features[test_index]
            training_labels = self.labels[train_index]
            # fitting the model and predicting
            partial_clf.fit(training_features, training_labels)
            curr_pred = partial_clf.predict_proba(test_features)
            prediction[test_index] = \
                self.predict_proba_ordered(curr_pred, partial_clf.classes_)
        return prediction
Source file: charDistClassifier.py (project: flexmatcher, author: biggorilla-gh). Its predict_training method is line-for-line identical to the nGramClassifier.py version above.
Source file: knnClassifier.py (project: flexmatcher, author: biggorilla-gh)
def predict_training(self, folds=5):
        """Do cross-validation and return probabilities for each data-point.

        Args:
            folds (int): Number of folds used for prediction on training data.
        """
        prediction = np.zeros((len(self.strings), self.num_classes))
        skf = StratifiedKFold(n_splits=folds)
        for train_index, test_index in skf.split(self.strings, self.labels):
            # prepare the training and test data
            training_strings = self.strings[train_index]
            test_strings = self.strings[test_index]
            training_labels = self.labels[train_index]
            # predicting the results
            part_prediction = self.find_knn(training_strings, training_labels,
                                            test_strings)
            prediction[test_index] = part_prediction
        return prediction
Source file: predict_2017_06_28_1.py (project: mlbootcamp_5, author: ivan-filonov)
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # pandas .ix is deprecated; itrain/ival are positional indices, so
            # .iloc is the modern equivalent
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
Source file: predict_2017_07_01_4.py (project: mlbootcamp_5, author: ivan-filonov). Unlike the variant above, this version re-creates the StratifiedKFold inside the seed loop with random_state=s + base_seed, so each seed's split is reproducible.
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True, random_state=s + base_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
Source files: predict_2017_06_30_1.py, predict_2017_06_27_2.py, predict_2017_07_01_3.py, predict_2017_07_01_2.py, predict_2017_06_28_2.py (project: mlbootcamp_5, author: ivan-filonov). The xgb_base function in all five files is line-for-line identical to the predict_2017_06_28_1.py version above.
Source file: test_split.py (project: Parallel-SGD, author: angadgill)
def test_stratified_kfold_ratios():
    # Check that stratified kfold preserves class ratios in individual splits
    # Repeat with shuffling turned off and on
    n_samples = 1000
    X = np.ones(n_samples)
    y = np.array([4] * int(0.10 * n_samples) +
                 [0] * int(0.89 * n_samples) +
                 [1] * int(0.01 * n_samples))

    for shuffle in (False, True):
        for train, test in StratifiedKFold(5, shuffle=shuffle).split(X, y):
            assert_almost_equal(np.sum(y[train] == 4) / len(train), 0.10, 2)
            assert_almost_equal(np.sum(y[train] == 0) / len(train), 0.89, 2)
            assert_almost_equal(np.sum(y[train] == 1) / len(train), 0.01, 2)
            assert_almost_equal(np.sum(y[test] == 4) / len(test), 0.10, 2)
            assert_almost_equal(np.sum(y[test] == 0) / len(test), 0.89, 2)
            assert_almost_equal(np.sum(y[test] == 1) / len(test), 0.01, 2)
Source file: test_split.py (project: Parallel-SGD, author: angadgill)
def test_stratifiedkfold_balance():
    # Check that KFold returns folds with balanced sizes (only when
    # stratification is possible)
    # Repeat with shuffling turned off and on
    X = np.ones(17)
    y = [0] * 3 + [1] * 14

    for shuffle in (True, False):
        cv = StratifiedKFold(3, shuffle=shuffle)
        for i in range(11, 17):
            skf = cv.split(X[:i], y[:i])
            sizes = []
            for _, test in skf:
                sizes.append(len(test))

            assert_true((np.max(sizes) - np.min(sizes)) <= 1)
            assert_equal(np.sum(sizes), i)
Source file: matrix.py (project: hidi, author: VEVO)
def transform(self, M,  **kwargs):
        """
        Takes a Takes a dataframe that has :code:`item_id` index, other
        'features' columns for prediction, and applies a Keras sequential
        model to it.

        :param M:
            a dataframe that has an :code:`item_id` index, and
            "features" columns.

        :type M: pandas.DataFrame
        :rtype: a tuple with trained Keras model and its keyword
            arguments
        """
        rows, columns = M.shape
        factors = M.merge(self.validation_matrix, left_index=True,
                          right_index=True)
        factors = factors.values

        if self.classification:
            kfold = StratifiedKFold(n_splits=self.kfold_n_splits,
                                    random_state=self.kfold_seed,
                                    shuffle=self.kfold_shuffle)
        else:
            kfold = KFold(n_splits=self.kfold_n_splits,
                          random_state=self.kfold_seed,
                          shuffle=self.kfold_shuffle)

        X = factors[:, :columns]
        Y = factors[:, columns:]
        for train_index, test_index in kfold.split(X, Y):
            self.keras_model.fit(
                X[train_index], Y[train_index],
                # validation targets must come from the held-out split, not the training split
                validation_data=(X[test_index], Y[test_index]),
                **self.keras_kwargs)

        return self.keras_model, kwargs
Source file: inbreast.py (project: deep-mil-for-whole-mammogram-classification, author: wentaozhu)
def cvsplitenhance(fold, totalfold, mydict, valfold=-1):
  '''Get the train/validation/test split of one fold.
  fold is the index of the returned fold, from 0 to totalfold-1;
  totalfold is the number of cross-validation folds;
  mydict is the dict returned by readlabel;
  the data is separated into train, validation, and test sets.'''
  skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is false, okay!
  #readdicom(mydict)
  # materialize dict views as lists so they can be indexed and split (Python 3)
  y = list(mydict.values())
  x = list(mydict.keys())
  count = 0
  if valfold == -1: 
    valfold = (fold+1) % totalfold
  print('valfold'+str(valfold))
  trainls, valls, testls = [], [], []
  for train, test in skf.split(x,y):
    print(len(train), len(test))
    if count == fold:
      #print test[:]
      testls = test[:]
    elif count == valfold:
      valls = test[:]
    else:
      for i in test:
        trainls.append(i)
    count += 1
  return trainls, valls, testls
Source file: run_model.py (project: website-fingerprinting, author: AxelGoetz)
def k_fold_validation(model, monitored_data, unmonitored_data, k, random_state=123):
    """
    Performs k fold validation on a model. During each fold, records all of the scoring in the `scoring_methods` module.

    @param model is a machine learning model that has the functions `fit(X, y)` and `predict(X)`
    @param monitored_data an array-like matrix that has the following structure `[(features, value)]`
    @param unmonitored_data is also an array-like object: [features]
    @param k is the number of folds
    @param random_state seeds the fold shuffling for reproducibility

    @return is a list of length `k` with one dict of scores per fold, structured as `[{scoring_method: score}]`
    """
    X, y = get_X_y(monitored_data, unmonitored_data)
    skf = StratifiedKFold(n_splits=k, random_state=random_state, shuffle=True)

    evaluations = []
    i = 1
    for train, test in skf.split(X, y):
        print("Starting split {}".format(i))
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]

        print("Fitting data")
        model.fit(X_train, y_train)

        print("Predicting")
        prediction = model.predict(X_test)

        evaluations.append(scoring_methods.evaluate_model(prediction, y_test))

        print(evaluations[-1])

        i += 1

    return evaluations
Source file: voxelselector.py (project: brainiak, author: brainiak)
def _cross_validation_for_one_voxel(clf, vid, num_folds, subject_data, labels):
    """Score classifier on data using cross validation."""
    # no shuffling in cv
    skf = model_selection.StratifiedKFold(n_splits=num_folds,
                                          shuffle=False)
    scores = model_selection.cross_val_score(clf, subject_data,
                                             y=labels,
                                             cv=skf, n_jobs=1)
    logger.debug(
        'cross validation for voxel %d is done' %
        vid
    )
    return (vid, scores.mean())
Source file: build_model.py (project: MLClass, author: bm2-lab)
def split_kfold_c(y):
    skf = StratifiedKFold(5)
    ilst = []
    for tri, tei in skf.split(np.zeros(len(y)), y):
        ilst.append((tri, tei))
    return ilst
Source file: methods.py (project: pydl, author: rafaeltg)
def get_cv_method(method, **kwargs):

    if method == 'kfold':
        return KFold(**kwargs)
    elif method == 'skfold':
        return StratifiedKFold(**kwargs)
    elif method == 'loo':
        return LeaveOneOut()
    elif method == 'shuffle_split':
        return ShuffleSplit(**kwargs)
    elif method == 'split':
        return TrainTestSplit(**kwargs)
    elif method == 's_shuffle_split':
        return StratifiedShuffleSplit(**kwargs)
    elif method == 'time_series':
        return TimeSeriesSplit(**kwargs)
    else:
        raise AttributeError('Invalid CV method - %s!' % method)
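
A minimal usage sketch of get_cv_method, assuming the scikit-learn splitters referenced above are imported in methods.py:

import numpy as np

X = np.random.rand(20, 3)
y = np.array([0, 1] * 10)
cv = get_cv_method('skfold', n_splits=5, shuffle=True, random_state=42)
for train_idx, test_idx in cv.split(X, y):
    print(len(train_idx), len(test_idx))  # 16 4 on every fold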
Source file: FacilitatorAgent.py (project: DistributedClassifier, author: rsboos)
def computeAccuracyForSingleModel(self,algorithm="SVM",isLocalSmall=0,execType="normal"):
        totalFeatures = self.instancesFeatures.shape[1]
        # the article caps the number of local agents at 5; integer division keeps
        # the original Python 2 semantics of totalFeatures/2
        n = min(5, totalFeatures // 2)
        numberOfFeaturesInEachModel = int(math.ceil(totalFeatures / n))
        if (isLocalSmall):
            instFeatures = dataPreparation.selectNRandomColumns(self.instancesFeatures,numberOfFeaturesInEachModel)
            #select random numberOfFeatures columns
        else:
            instFeatures = np.array(self.instancesFeatures)

        skf = StratifiedKFold(n_splits=self.kFolds)
        avgScore = 0
        avgF1Macro = 0
        avgF1Micro = 0
        avgF1Weighted = 0
        for train_index, test_index in skf.split(instFeatures, self.instancesClasses):
            resultClasses = classifier.MakeClassification(self.algorithmsIndex[algorithm],instFeatures[train_index],self.instancesClasses[train_index],instFeatures[test_index],"value")
            valF1Macro = f1_score(self.instancesClasses[test_index], resultClasses, average='macro')
            valF1Micro = f1_score(self.instancesClasses[test_index], resultClasses, average='micro')
            valF1Weighted = f1_score(self.instancesClasses[test_index], resultClasses, average='weighted')
            valScore = accuracy_score(self.instancesClasses[test_index],resultClasses)
            avgF1Macro += valF1Macro
            avgF1Micro += valF1Micro
            avgF1Weighted += valF1Weighted
            avgScore += valScore
            with open(self.fileToWrite, "a") as myfile:
                myfile.write(str(valF1Weighted)+"\t"+str(valF1Micro)+"\t"+str(valF1Macro)+"\t"+str(valScore)+"\n")
        avgScore = avgScore / self.kFolds
        avgF1Macro /= self.kFolds
        avgF1Weighted /= self.kFolds
        avgF1Micro /= self.kFolds
        return avgScore, avgF1Macro, avgF1Micro, avgF1Weighted

    # this function will call all the underlying methods in order to perform data preparation, classification in each simulated agent, and aggregation

