def train_and_calibrate_cv(model, X_tr, y_tr, cv=5):
y_pred_xval = np.zeros(len(y_tr))
skf = cross_validation.StratifiedKFold(y_tr, n_folds=cv, shuffle=True)
i = 0
for train, test in skf:
i = i+1
print("training fold {} of {}".format(i, cv))
X_train_xval = np.array(X_tr)[train,:]
X_test_xval = np.array(X_tr)[test,:]
y_train_xval = np.array(y_tr)[train]
# Clone the estimator so the model passed in stays untouched
model_copy = clone(model)
model_copy.fit(X_train_xval, y_train_xval)
y_pred_xval[test] = model_copy.predict_proba(X_test_xval)[:, 1]
print("training full model")
model_copy = clone(model)
model_copy.fit(X_tr,y_tr)
print("calibrating function")
calib_func = prob_calibration_function(y_tr, y_pred_xval)
return model_copy, calib_func
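A minimal usage sketch of the helper above, on hypothetical data; `prob_calibration_function` is assumed to be importable in the same module, and applying the returned `calib_func` directly to raw probabilities is an assumption about its call signature.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Hypothetical binary-classification data; the classifier choice is illustrative.
X_demo, y_demo = make_classification(n_samples=500, n_informative=5, random_state=0)
fitted_model, calib_func = train_and_calibrate_cv(LogisticRegression(), X_demo, y_demo, cv=5)

# Assumption: calib_func maps raw predicted probabilities to calibrated ones.
raw = fitted_model.predict_proba(X_demo)[:, 1]
calibrated = calib_func(raw)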
Example source code for the Python class StratifiedKFold()
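These snippets all use the legacy `sklearn.cross_validation` interface, where `StratifiedKFold(y, n_folds=k)` is built from the labels and iterated directly. In scikit-learn 0.18+ the class lives in `sklearn.model_selection`, takes `n_splits`, and produces folds via `.split(X, y)`; a rough modern equivalent of the loop pattern used throughout these examples (toy data shown for illustration):
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Toy data purely for illustration.
X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=100)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, test_idx in skf.split(X, y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]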
def test_homonym(H, sent, features, C=1.0):
X_0 = features(matching(sent, H[0]))
X_1 = features(matching(sent, H[1]))
y_0 = numpy.zeros(len(X_0))
y_1 = numpy.ones(len(X_1))
X = normalize(numpy.vstack([X_0, X_1]), norm='l2')
y = numpy.hstack([y_0, y_1])
classifier = LogisticRegression(C=C)
fold = StratifiedKFold(y, n_folds=10)
score = []
count = []
for tr, te in fold:
X_tr, X_te = X[tr], X[te]
y_tr, y_te = y[tr], y[te]
classifier.fit(X_tr, y_tr)
score.append(sum(classifier.predict(X_te) == y_te))
count.append(len(y_te))
score = numpy.array(score, dtype='float')
count = numpy.array(count, dtype='float')
result = {'word1_count': len(y_0),
'word2_count': len(y_1),
'majority': 1.0 * max(len(y_0),len(y_1))/len(y),
'kfold_acc': score/count }
return result
def getFolds(labels, number_folds):
"""
Provides train/test indices to split data into train/test sets.
Parameters
----------
labels: array-like of shape = [number_samples]
The target values (class labels in classification).
number_folds: int
The amount of folds for the k-fold cross-validation.
Returns
----------
folds: StratifiedKFold
the train/test indices of the split data.
"""
return StratifiedKFold(y=labels, n_folds=number_folds, shuffle=True)
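For example, the returned fold object can be iterated directly under the legacy API (the toy labels here are illustrative):
import numpy as np

labels = np.array([0, 0, 1, 1, 0, 1, 0, 1, 0, 1])
for train_idx, test_idx in getFolds(labels, number_folds=5):
    print(len(train_idx), len(test_idx))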
def _calculate(self, X, y, categorical, metafeatures, helpers):
import sklearn.lda
if len(y.shape) == 1 or y.shape[1] == 1:
kf = cross_validation.StratifiedKFold(y, n_folds=10)
else:
kf = cross_validation.KFold(y.shape[0], n_folds=10)
accuracy = 0.
try:
for train, test in kf:
lda = sklearn.lda.LDA()
if len(y.shape) == 1 or y.shape[1] == 1:
lda.fit(X[train], y[train])
else:
lda = OneVsRestClassifier(lda)
lda.fit(X[train], y[train])
predictions = lda.predict(X[test])
accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
return accuracy / 10
except LinAlgError as e:
self.logger.warning("LDA failed: %s Returned NaN instead!" % e)
return np.NaN
except ValueError as e:
self.logger.warning("LDA failed: %s Returned NaN instead!" % e)
return np.NaN
def _calculate(self, X, y, categorical, metafeatures, helpers):
import sklearn.naive_bayes
if len(y.shape) == 1 or y.shape[1] == 1:
kf = cross_validation.StratifiedKFold(y, n_folds=10)
else:
kf = cross_validation.KFold(y.shape[0], n_folds=10)
accuracy = 0.
for train, test in kf:
nb = sklearn.naive_bayes.GaussianNB()
if len(y.shape) == 1 or y.shape[1] == 1:
nb.fit(X[train], y[train])
else:
nb = OneVsRestClassifier(nb)
nb.fit(X[train], y[train])
predictions = nb.predict(X[test])
accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
return accuracy / 10
def _calculate(self, X, y, categorical, metafeatures, helpers):
import sklearn.tree
if len(y.shape) == 1 or y.shape[1] == 1:
kf = cross_validation.StratifiedKFold(y, n_folds=10)
else:
kf = cross_validation.KFold(y.shape[0], n_folds=10)
accuracy = 0.
for train, test in kf:
random_state = check_random_state(42)
tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)
if len(y.shape) == 1 or y.shape[1] == 1:
tree.fit(X[train], y[train])
else:
tree = OneVsRestClassifier(tree)
tree.fit(X[train], y[train])
predictions = tree.predict(X[test])
accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
return accuracy / 10
def _calculate(self, X, y, categorical, metafeatures, helpers):
import sklearn.tree
if len(y.shape) == 1 or y.shape[1] == 1:
kf = cross_validation.StratifiedKFold(y, n_folds=10)
else:
kf = cross_validation.KFold(y.shape[0], n_folds=10)
accuracy = 0.
for train, test in kf:
random_state = check_random_state(42)
node = sklearn.tree.DecisionTreeClassifier(
criterion="entropy", max_depth=1, random_state=random_state,
min_samples_split=1, min_samples_leaf=1, max_features=None)
if len(y.shape) == 1 or y.shape[1] == 1:
node.fit(X[train], y[train])
else:
node = OneVsRestClassifier(node)
node.fit(X[train], y[train])
predictions = node.predict(X[test])
accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
return accuracy / 10
def _calculate(self, X, y, categorical, metafeatures, helpers):
import sklearn.tree
if len(y.shape) == 1 or y.shape[1] == 1:
kf = cross_validation.StratifiedKFold(y, n_folds=10)
else:
kf = cross_validation.KFold(y.shape[0], n_folds=10)
accuracy = 0.
for train, test in kf:
random_state = check_random_state(42)
node = sklearn.tree.DecisionTreeClassifier(
criterion="entropy", max_depth=1, random_state=random_state,
min_samples_split=1, min_samples_leaf=1, max_features=1)
if len(y.shape) == 1 or y.shape[1] == 1:
node.fit(X[train], y[train])
else:
node = OneVsRestClassifier(node)
node.fit(X[train], y[train])
predictions = node.predict(X[test])
accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
return accuracy / 10
def test_stacked_classfier_extkfold(self):
bclf = LogisticRegression(random_state=1)
clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1),
RidgeClassifier(random_state=1),
]
sl = StackedClassifier(bclf,
clfs,
n_folds=3,
verbose=0,
Kfold=StratifiedKFold(self.iris.target, 3),
stack_by_proba=False,
oob_score_flag=True,
oob_metrics=log_loss)
sl.fit(self.iris.data, self.iris.target)
score = sl.score(self.iris.data, self.iris.target)
self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
def test_fwls_classfier(self):
feature_func = lambda x: np.ones(x.shape)
bclf = LogisticRegression(random_state=1)
clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1),
RidgeClassifier(random_state=1),
]
sl = FWLSClassifier(bclf,
clfs,
feature_func=feature_func,
n_folds=3,
verbose=0,
Kfold=StratifiedKFold(self.iris.target, 3),
stack_by_proba=False)
sl.fit(self.iris.data, self.iris.target)
score = sl.score(self.iris.data, self.iris.target)
self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
def validation(self,X,Y,kind):
"""
Perform 2-fold cross-validation.
"""
print('validating...')
fold_n=2
folds = list(StratifiedKFold(Y, n_folds=fold_n, random_state=0))
score=np.zeros(fold_n)
for j, (train_idx, test_idx) in enumerate(folds):
print(j + 1, '-fold')
X_train = X[train_idx]
y_train = Y[train_idx]
X_test = X[test_idx]
y_test = Y[test_idx]
res = self.fit(X_train, y_train, X_test)
cur = sum(y_test == res) * 1.0 / len(res)
score[j] = cur
print(score, score.mean())
return score.mean()
def _devset_cv(self, a_y_train, a_n_dev, a_n_folds):
"""Generate train-test split from training and development data.
Args:
a_y_train (list[int]):
list of training instances' tags
a_n_dev (int):
number of devset instances
a_n_folds (int):
number of folds
Returns:
list[tuple]: list of training/testing folds
"""
folds = []
n_train = len(a_y_train)
dev_ids = [n_train + i for i in range(a_n_dev)]
# create stratified K-folds over the training data
skf = StratifiedKFold(a_y_train, a_n_folds)
for train_ids, test_ids in skf:
folds.append((train_ids,
np.concatenate((test_ids, dev_ids))))
return folds
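The same idea in a standalone sketch on toy data (using the modern `model_selection` API purely for illustration): every test fold drawn from the training portion is extended with the fixed devset indices that sit after the training rows.
import numpy as np
from sklearn.model_selection import StratifiedKFold

y_train = np.array([0, 1] * 10)     # 20 training labels (toy data)
n_dev = 5                           # devset rows appended after the training rows
dev_ids = np.arange(len(y_train), len(y_train) + n_dev)

folds = []
for train_ids, test_ids in StratifiedKFold(n_splits=4).split(np.zeros(len(y_train)), y_train):
    folds.append((train_ids, np.concatenate((test_ids, dev_ids))))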
def run(self, X_train, y_train, X_test, y_test, profiler):
skf = StratifiedKFold(y_train, n_folds=self.n_folds,
shuffle=True, random_state=123)
fold = 1
for train_index, test_index in skf:
X_train_fold, y_train_fold = [X_train[i] for i in train_index], [y_train[i] for i in train_index]
X_test_fold, y_test_fold = [X_train[i] for i in test_index], [y_train[i] for i in test_index]
logger.info('Training on {} instances!'.format(len(train_index)))
profiler.train(X_train_fold, y_train_fold)
logger.info('Testing on fold {} with {} instances'.format(
fold, len(test_index)))
y_pred_fold = profiler.predict(X_test_fold)
print_accuracy(y_test_fold, y_pred_fold)
fold = fold + 1
if X_test:
logger.info('Training on {} instances!'.format(len(X_train)))
profiler.train(X_train, y_train)
logger.info('Testing on {} instances!'.format(len(X_test)))
y_pred = profiler.predict(X_test)
print_confusion_matrix(y_test, y_pred)
print_accuracy(y_test, y_pred)
def test_model(self, n_folds=10):
""" Evaluate the model with `stratified K-fold cross-validation (Stratified K-folds cross-validating)`.
"""
logging.debug("testing model with {}-folds CV".format(n_folds))
model = self.init_model()
X = self.data.data
y = self.data.target
cv = cross_validation.StratifiedKFold(y, n_folds=n_folds, random_state=42)
t0 = time()
y_pred = cross_validation.cross_val_predict(model, X=X, y=y, n_jobs=-1, cv=cv)
t = time() - t0
print("=" * 52)
print("time cost: {}".format(t))
print()
print("confusion matrix\n", metrics.confusion_matrix(y, y_pred))
print()
print("\t\taccuracy: {}".format(metrics.accuracy_score(y, y_pred)))
print()
print("\t\tclassification report")
print("-" * 52)
print(metrics.classification_report(y, y_pred))
def crossValidation(clf, X, Y, num=None):
'''
num: can be number of trees or nearest neighbours
'''
scores = []
cv = StratifiedKFold(Y, n_folds=5)
for train, test in cv:
X_train, y_train = X[train], Y[train]
X_test, y_test = X[test], Y[test]
clf.fit( X_train, y_train )
scores.append(clf.score( X_test, y_test ))
if num:
print("Classifier: " + str(clf) + "\t Mean(scores)= " + str(np.mean(scores)) + "\tStddev(scores)= " + str(np.std(scores)) + "\t Number of neighbours / trees= " + str(num) + "\n")
logFile("Classifier: " + str(clf) + "\t Mean(scores)= " + str(np.mean(scores)) + "\tStddev(scores)= " + str(np.std(scores)) + "\t Number of neighbours / trees= " + str(num) + "\n")
else:
print("Classifier: " + str(clf) + "\t Mean(scores)= " + str(np.mean(scores)) + "\tStddev(scores)= " + str(np.std(scores)) + "\n")
logFile("Classifier: " + str(clf) + "\t Mean(scores)= " + str(np.mean(scores)) + "\tStddev(scores)= " + str(np.std(scores)) + "\n")
def score(self, params):
print("Training with params:")
print(params)
N_boost_round=[]
Score=[]
skf = cross_validation.StratifiedKFold(self.train_y, n_folds=6, shuffle=True, random_state=25)
for train, test in skf:
X_Train, X_Test, y_Train, y_Test = self.train_X[train], self.train_X[test], self.train_y[train], self.train_y[test]
dtrain = xgb.DMatrix(X_Train, label=y_Train)
dvalid = xgb.DMatrix(X_Test, label=y_Test)
watchlist = [(dtrain, 'train'),(dvalid, 'eval')]
model = xgb.train(params, dtrain, num_boost_round=150, evals=watchlist, early_stopping_rounds=10)
predictions = model.predict(dvalid)
N = model.best_iteration
N_boost_round.append(N)
score = model.best_score
Score.append(score)
Average_best_num_boost_round = np.average(N_boost_round)
Average_best_score = np.average(Score)
print("\tAverage of best iteration {0}\n".format(Average_best_num_boost_round))
print("\tScore {0}\n\n".format(Average_best_score))
return {'loss': Average_best_score, 'status': STATUS_OK, 'Average_best_num_boost_round': Average_best_num_boost_round}
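The returned dict follows the hyperopt objective convention (`'loss'` plus `STATUS_OK`), so this method is presumably handed to `fmin`; a hedged sketch of such a driver, where `tuner` (the object holding `train_X`/`train_y`) and the search space are assumptions:
from hyperopt import fmin, tpe, hp, Trials

# Hypothetical search space; the real space depends on the xgboost parameters being tuned.
space = {
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'eta': hp.loguniform('eta', -4, -1),
    'objective': 'binary:logistic',
}
trials = Trials()
best = fmin(fn=tuner.score, space=space, algo=tpe.suggest,  # 'tuner' is a hypothetical instance exposing score()
            max_evals=50, trials=trials)
print(best)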
def create_cv_id(target, n_folds_ = 5, cv_id_name=cv_id_name, seed=407):
try:
a = StratifiedKFold(target['target'],n_folds=n_folds_, shuffle=True, random_state=seed)
cv_index = a.test_folds
print('Done StratifiedKFold')
except:
cv_index = np.empty(len(target))
a = KFold(len(target),n_folds=n_folds_, shuffle=True, random_state=seed)
for idx, i in enumerate(a):
cv_index[i[1]] = idx
cv_index = cv_index.astype(int)
print('Done KFold')
np.save(INPUT_PATH + cv_id_name, cv_index)
return
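A hedged sketch of how the saved fold ids might be consumed later; `INPUT_PATH` and `cv_id_name` are defined elsewhere in the original project, and the `.npy` suffix handling is an assumption about how the file was named:
import numpy as np

# Hypothetical consumer of the saved cv_index array (np.save appends '.npy' when missing).
cv_index = np.load(INPUT_PATH + cv_id_name + '.npy')
for fold_id in np.unique(cv_index):
    train_rows = np.where(cv_index != fold_id)[0]
    valid_rows = np.where(cv_index == fold_id)[0]
    # fit on train_rows, evaluate on valid_rows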
######### Utils #########
# feature list utilities
def test_stratified_kfold_no_shuffle():
# Manually check that StratifiedKFold preserves the data ordering as much
# as possible on toy datasets in order to avoid hiding sample dependencies
# when possible
splits = iter(cval.StratifiedKFold([1, 1, 0, 0], 2))
train, test = next(splits)
assert_array_equal(test, [0, 2])
assert_array_equal(train, [1, 3])
train, test = next(splits)
assert_array_equal(test, [1, 3])
assert_array_equal(train, [0, 2])
splits = iter(cval.StratifiedKFold([1, 1, 1, 0, 0, 0, 0], 2))
train, test = next(splits)
assert_array_equal(test, [0, 1, 3, 4])
assert_array_equal(train, [2, 5, 6])
train, test = next(splits)
assert_array_equal(test, [2, 5, 6])
assert_array_equal(train, [0, 1, 3, 4])
def test_stratified_kfold_ratios():
# Check that stratified kfold preserves label ratios in individual splits
# Repeat with shuffling turned off and on
n_samples = 1000
labels = np.array([4] * int(0.10 * n_samples) +
[0] * int(0.89 * n_samples) +
[1] * int(0.01 * n_samples))
for shuffle in [False, True]:
for train, test in cval.StratifiedKFold(labels, 5, shuffle=shuffle):
assert_almost_equal(np.sum(labels[train] == 4) / len(train), 0.10,
2)
assert_almost_equal(np.sum(labels[train] == 0) / len(train), 0.89,
2)
assert_almost_equal(np.sum(labels[train] == 1) / len(train), 0.01,
2)
assert_almost_equal(np.sum(labels[test] == 4) / len(test), 0.10, 2)
assert_almost_equal(np.sum(labels[test] == 0) / len(test), 0.89, 2)
assert_almost_equal(np.sum(labels[test] == 1) / len(test), 0.01, 2)
def test_cross_val_generator_with_indices():
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 1, 2, 2])
labels = np.array([1, 2, 3, 4])
# explicitly passing indices value is deprecated
loo = cval.LeaveOneOut(4)
lpo = cval.LeavePOut(4, 2)
kf = cval.KFold(4, 2)
skf = cval.StratifiedKFold(y, 2)
lolo = cval.LeaveOneLabelOut(labels)
lopo = cval.LeavePLabelOut(labels, 2)
ps = cval.PredefinedSplit([1, 1, 2, 2])
ss = cval.ShuffleSplit(2)
for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
for train, test in cv:
assert_not_equal(np.asarray(train).dtype.kind, 'b')
assert_not_equal(np.asarray(test).dtype.kind, 'b')
X[train], X[test]
y[train], y[test]
def test_cross_val_generator_with_default_indices():
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 1, 2, 2])
labels = np.array([1, 2, 3, 4])
loo = cval.LeaveOneOut(4)
lpo = cval.LeavePOut(4, 2)
kf = cval.KFold(4, 2)
skf = cval.StratifiedKFold(y, 2)
lolo = cval.LeaveOneLabelOut(labels)
lopo = cval.LeavePLabelOut(labels, 2)
ss = cval.ShuffleSplit(2)
ps = cval.PredefinedSplit([1, 1, 2, 2])
for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
for train, test in cv:
assert_not_equal(np.asarray(train).dtype.kind, 'b')
assert_not_equal(np.asarray(test).dtype.kind, 'b')
X[train], X[test]
y[train], y[test]
def gridSearchPipeline(pipeline, paramsGrid, Xtrain, Ytrain, **cvParams):
print("Grid Searching pipeline:")
print(pipeline)
# use 5-fold stratified cross-validation by default to maintain
# consistent class balance across training and testing
if 'cv' not in cvParams:
# print "Ytrain: ", Ytrain
# numClasses = len(np.unique(Ytrain))
# examplesPerClass = len(Ytrain) / numClasses
# nFolds = max(5, examplesPerClass / 5)
# if nFolds < 5:
# if True:
# r, c = Ytrain.shape
# print "tiny Ytrain size: (%d, %d)" % Ytrain.shape # (r, c)
# for row in Ytrain: print row
# cvParams['cv'] = StratifiedKFold(Ytrain, n_folds=nFolds)
cvParams['cv'] = StratifiedKFold(Ytrain, n_folds=5)
cv = GridSearchCV(pipeline, paramsGrid, **cvParams)
cv.fit(Xtrain, Ytrain)
return cv
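A small usage sketch, with a hypothetical pipeline and parameter grid (the dataset is illustrative):
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

X_demo = np.random.rand(60, 5)
y_demo = np.random.randint(0, 2, size=60)
pipe = Pipeline([('scale', StandardScaler()), ('clf', LogisticRegression())])
param_grid = {'clf__C': [0.1, 1.0, 10.0]}

search = gridSearchPipeline(pipe, param_grid, X_demo, y_demo)
print(search.best_params_)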
def test_grtm():
l = language(1000)
n_iter = 1000
KL_thresh = 0.3
mu = 0.
nu2 = 1.
np.random.seed(l['seed'])
H = np.random.normal(loc=mu, scale=nu2, size=(l['K'], l['K']))
zeta = pd.DataFrame([(i, j, np.dot(np.dot(l['thetas'][i], H),
l['thetas'][j]))
for i, j in product(range(l['D']), repeat=2)],
columns=('tail', 'head', 'zeta'))
zeta['y'] = (zeta.zeta >= 0).astype(int)
y = zeta[['tail', 'head', 'y']].values
skf = StratifiedKFold(y[:, 2], n_folds=100)
_, train_idx = next(iter(skf))
_K = l['K']
_alpha = l['alpha'][:_K]
_beta = np.repeat(0.01, l['V'])
_b = 1.
grtm = GRTM(_K, _alpha, _beta, mu, nu2, _b, n_iter, seed=l['seed'],
n_report_iter=l['n_report_iters'])
grtm.fit(l['doc_term_matrix'], y[train_idx])
assert_probablity_distribution(grtm.phi)
check_KL_divergence(l['topics'], grtm.phi, KL_thresh)
def grid_search(estimator, data, featTypes=('BoW',), nFolds=10, random_seed=44, param_grid=()):
labels = [x.severity for x in data]
generatePrimaryFeats(data, featTypes)
featurized = []
for d in data:
instance = {}
for featname, values in d.feats.items():
# Give each feature a unique name to avoid overwriting features.
# If e.g. a concept feature has the same name as a bow word, the old code
# would overwrite one of the features.
instance.update({"{0}-{1}".format(featname, k): v for k, v in values.items()})
featurized.append(instance)
d = DictVectorizer()
x_train = d.fit_transform(featurized)
folds = cross_validation.StratifiedKFold(labels, n_folds=nFolds, shuffle=True, random_state=random_seed)
grid = GridSearchCV(estimator, param_grid=param_grid, scoring="f1", n_jobs=-1, cv=folds)
fit_grid = grid.fit(x_train, labels)
print(fit_grid.best_params_)
return fit_grid.best_params_
def train_test_split(X, y, test_size=0.25, random_state=42, stratify=True):
if stratify:
n_folds = int(round(1 / test_size))
sss = StratifiedKFold(y, n_folds=n_folds, random_state=random_state)
else:
sss = ShuffleSplit(len(y), test_size=test_size, random_state=random_state)
train_idx, test_idx = next(iter(sss))
return X[train_idx], X[test_idx], y[train_idx], y[test_idx]
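For example, on toy data (note that with `stratify=True` the test fraction is approximated as `1 / round(1 / test_size)` folds):
import numpy as np

X_demo = np.arange(40).reshape(20, 2)
y_demo = np.array([0, 1] * 10)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.25)
print(X_tr.shape, X_te.shape)   # expect roughly a 75/25 stratified split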