python类KFold()的实例源码

kgrid.py 文件源码 项目:jamespy_py3 作者: jskDr 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def gs_Ridge_BIKE( A_list, yV, XX = None, alphas_log = (1, -1, 9), n_splits = 5, n_jobs = -1):
    """
    Grid-search the 'alpha' of a BIKE ridge model with shuffled K-fold CV.

    A_list is a list of A matrices, where each A is a similarity matrix.
    XX is the concatenated linear descriptors; it can stay None when no
    linear descriptors are used.
    alphas_log is unpacked into np.logspace to build the alpha grid.
    The fitted GridSearchCV object is returned.
    """
    estimator = binary_model.BIKE_Ridge( A_list, XX)
    alpha_grid = {'alpha': np.logspace( *alphas_log)}

    # Row count of the first matrix in A_list
    n_rows = A_list[0].shape[0]

    splitter = model_selection.KFold( n_splits = n_splits, shuffle = True)
    search = model_selection.GridSearchCV(
        estimator, alpha_grid, scoring = 'r2', cv = splitter, n_jobs = n_jobs)

    # The search is fitted on a single column holding the row indices 0..n_rows-1
    row_index_column = np.transpose( [list( range( n_rows))])
    search.fit( row_index_column, yV)

    return search
kgrid.py 文件源码 项目:jamespy_py3 作者: jskDr 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def gs_BIKE_Ridge( A_list, yV, alphas_log = (1, -1, 9), X_concat = None, n_splits = 5, n_jobs = -1):
    """
    Grid-search the 'alpha' of a BIKE ridge model with shuffled K-fold CV.

    A_list is a list of A matrices, where each A is a similarity matrix.
    X_concat is the concatenated linear descriptors; it can stay None when
    no linear descriptors are used.
    alphas_log is unpacked into np.logspace to build the alpha grid.
    The fitted GridSearchCV object is returned.
    """
    estimator = binary_model.BIKE_Ridge( A_list, X_concat)
    alpha_grid = {'alpha': np.logspace( *alphas_log)}

    # Row count of the first matrix in A_list
    n_rows = A_list[0].shape[0]

    splitter = model_selection.KFold( n_splits = n_splits, shuffle = True)
    search = model_selection.GridSearchCV(
        estimator, alpha_grid, scoring = 'r2', cv = splitter, n_jobs = n_jobs)

    # The search is fitted on a single column holding the row indices 0..n_rows-1
    row_index_column = np.transpose( [list( range( n_rows))])
    search.fit( row_index_column, yV)

    return search
kgrid.py 文件源码 项目:jamespy_py3 作者: jskDr 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def _cv_r0( method, xM, yV, alpha, n_splits = 5, n_jobs = -1, grid_std = None, graph = True):
    """
    Cross-validated prediction for all input molecules with a linear model.

    method is the name of an estimator class in linear_model, e.g. 'Ridge'
    or 'Lasso'; it is instantiated with the given alpha.
    n_splits is the number of shuffled K-fold splits.
    When graph is true, the result is shown with kutil.cv_show, to which
    grid_std is forwarded.
    Returns the cross-validated predictions.
    """
    print(xM.shape, yV.shape)

    clf = getattr( linear_model, method)( alpha = alpha)
    # The splitter built here is the one that must drive the cross-validation;
    # the previous code referenced an undefined name instead of it.
    kf_n_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    yV_pred = model_selection.cross_val_predict( clf, xM, yV, cv = kf_n_c, n_jobs = n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
kgrid.py 文件源码 项目:jamespy_py3 作者: jskDr 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def cvLOO( method, xM, yV, alpha, n_jobs = -1, grid_std = None, graph = True):
    """
    Leave-one-out style cross-validated prediction for all input molecules.

    method is the name of an estimator class in linear_model, e.g. 'Ridge'
    or 'Lasso'; it is instantiated with the given alpha.
    The number of folds equals the number of rows in xM, so every sample is
    held out exactly once.
    When graph is true, the result is shown with kutil.cv_show, to which
    grid_std is forwarded.
    Returns the cross-validated predictions.
    """
    n_splits = xM.shape[0]

    clf = getattr( linear_model, method)( alpha = alpha)
    # model_selection.KFold takes only the number of splits; passing the sample
    # count positionally as well collided with the n_splits keyword.
    kf_n = model_selection.KFold( n_splits = n_splits)
    yV_pred = model_selection.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
jgrid.py 文件源码 项目:jamespy_py3 作者: jskDr 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def gs_Lasso(xM, yV, alphas_log=(-1, 1, 9), n_folds=5, n_jobs=-1):
    """
    Grid-search the 'alpha' of a Lasso model with shuffled K-fold CV.

    alphas_log is unpacked into np.logspace to build the alpha grid.
    n_folds is the number of cross-validation splits.
    Returns the fitted GridSearchCV object (scoring is 'r2').
    """
    print(xM.shape, yV.shape)

    clf = linear_model.Lasso()
    parmas = {'alpha': np.logspace(*alphas_log)}

    # model_selection.KFold names this argument n_splits (n_folds belongs to
    # the removed cross_validation API).  The splitter itself is passed as cv
    # rather than a one-shot generator from split().
    kf5_c = model_selection.KFold(n_splits=n_folds, shuffle=True)

    gs = model_selection.GridSearchCV(
        clf, parmas, scoring='r2', cv=kf5_c, n_jobs=n_jobs)

    gs.fit(xM, yV)

    return gs
env.py 文件源码 项目:bnn-analysis 作者: myshkov 项目源码 文件源码 阅读 34 收藏 0 点赞 0 评论 0
def create_training_test_sets(self):
    """Load the data file and fill the per-fold train/test lists."""
    # Read the whole table as float32; the target is taken from the last column
    raw_table = np.loadtxt('input/data.txt')
    input_data = np.asarray(raw_table, dtype=np.float32)
    self.input_dim = input_data.shape[1] - 1
    self.output_dim = 1

    # Truncate so the row count is a multiple of batch_size * n_splits
    rows_per_group = self.batch_size * self.n_splits
    batches = input_data.shape[0] // rows_per_group
    input_data = input_data[:batches * rows_per_group]

    self.data_size = input_data.shape[0]
    print(f'Loaded input data, shape = {input_data.shape}')

    # Seeded, shuffled K-fold split
    kfold = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
    print(f'Splits: {self.n_splits}')

    for idx_train, idx_test in kfold.split(input_data):
        # Features are every column but the last; targets keep a 2-D shape
        self.train_x.append(input_data[idx_train, :-1])
        self.train_y.append(input_data[idx_train, -1:])
        self.test_x.append(input_data[idx_test, :-1])
        self.test_y.append(input_data[idx_test, -1:])

    # Default layers, each described as [number of neurons, dropout probability]
    if self.layers_description is None:
        self.layers_description = [
            [self.input_dim, 0.0],
            [100, 0.0],
            [100, 0.0],
            [self.output_dim, 0.0],
        ]
matrix.py 文件源码 项目:hidi 作者: VEVO 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def transform(self, M,  **kwargs):
    """
    Takes a dataframe that has :code:`item_id` index, other
    'features' columns for prediction, and fits a Keras sequential
    model to it, once per cross-validation fold.

    :param M:
        a dataframe that has an :code:`item_id` index, and
        "features" columns.

    :type M: pandas.DataFrame
    :rtype: a tuple with trained Keras model and its keyword
        arguments
    """
    _, columns = M.shape
    # Join features with the validation matrix on the index; the first
    # `columns` columns are the features, the remainder are the targets.
    factors = M.merge(self.validation_matrix, left_index=True,
                      right_index=True)
    factors = factors.values

    if self.classification:
        kfold = StratifiedKFold(n_splits=self.kfold_n_splits,
                                random_state=self.kfold_seed,
                                shuffle=self.kfold_shuffle)
    else:
        kfold = KFold(n_splits=self.kfold_n_splits,
                      random_state=self.kfold_seed,
                      shuffle=self.kfold_shuffle)

    X = factors[:, :columns]
    Y = factors[:, columns:]
    for train_index, test_index in kfold.split(X, Y):
        # Validation targets must come from the held-out rows so they pair
        # with the validation inputs (previously the training targets were
        # passed here).
        self.keras_model.fit(
            X[train_index], Y[train_index],
            validation_data=(X[test_index], Y[test_index]),
            **self.keras_kwargs)

    return self.keras_model, kwargs
agents.py 文件源码 项目:deeppavlov 作者: deepmipt 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def setup_data(self, path):
    """Read and iteratively yield data to agent.

    The CSV file at ``path`` is read (its first row is skipped as a header);
    each remaining row must be a ``label, text`` pair.  Unless
    ``self.datatype_strict`` is ``'test'``, only the rows of one K-fold split
    are yielded: the training part when ``self.datatype_strict`` is
    ``'train'``, otherwise the held-out part.  The fold is chosen by
    ``opt['bagging_fold_index']`` (default 0).

    Yields ``((question, [answer]), episode_done)`` tuples.
    """
    print('loading: ' + path)

    questions = []
    y = []

    # open data file with labels
    # (path will be provided to setup_data from opt['datafile'] defined above)
    with open(path) as labels_file:
        context = csv.reader(labels_file)
        next(context)

        for item in context:
            label, text = item
            questions.append(text)
            y.append([self.answer_candidates[int(label)]])

    episode_done = True

    indexes = range(len(questions))
    if self.datatype_strict != 'test':
        # Draw the split seed from the agent's own random stream without
        # disturbing the global `random` state.
        random_state = random.getstate()
        random.setstate(self.random_state)
        kf_seed = random.randrange(500000)
        kf = KFold(self.opt.get('bagging_folds_number'), shuffle=True,
                   random_state=kf_seed)
        fold_index = self.opt.get('bagging_fold_index', 0)
        # enumerate() supplies the fold counter; it was previously never
        # advanced, so any fold index above 0 always selected the last fold.
        for i, (train_index, test_index) in enumerate(kf.split(questions)):
            indexes = train_index if self.datatype_strict == 'train' else test_index
            if i >= fold_index:
                break
        self.random_state = random.getstate()
        random.setstate(random_state)

    # define iterator over all queries
    for i in indexes:
        # yield the question text with its answer list and the episode_done flag
        yield (questions[i], y[i]), episode_done
_ridge_regression.py 文件源码 项目:palladio 作者: slipguru 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def kf_worker(X_tr, Y_tr, mu_range, tr_idx, vld_idx, i, results):
    """Compute the RLS path for one fold and store it under ``results[i]``."""
    fold_record = {
        'betas': RLS_path(X_tr, Y_tr, mu_range),
        'tr_idx': tr_idx,
        'vld_idx': vld_idx,
    }
    results[i] = fold_record
test_big.py 文件源码 项目:skutil 作者: tgsmith61591 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def test_large_grid():
    """Exercise a randomized search over a scaler/PCA/random-forest pipeline.

    A RandomForest is purposely overfit to completely random data, and the
    grid-score reporting helper is checked for both failing and passing
    inputs.
    """
    # Build the CV splitter with whichever KFold signature this sklearn has
    if SK18:
        cv_splitter = KFold(n_splits=3, shuffle=True, random_state=42)
    else:
        cv_splitter = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)

    # pipeline: scaling -> PCA -> random forest
    steps = [
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ]
    pipeline = Pipeline(steps)

    # hyper-parameter distributions
    param_space = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    search = RandomizedSearchCV(pipeline, param_space, n_iter=2, scoring='accuracy',
                                n_jobs=1, cv=cv_splitter, random_state=42)

    # scoring before fitting has to fail
    assert_fails(search.score, (ValueError, AttributeError), X_train, y_train)

    search.fit(X_train, y_train)

    # score for coverage -- warnings are silenced here
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        search.score(X_train, y_train)

    # coverage:
    assert search._estimator_type == 'classifier'

    # predictions on both partitions
    tr_pred = search.predict(X_train)
    te_pred = search.predict(X_test)

    # accuracies are computed only to exercise the calls; results are discarded
    accuracy_score(y_train, tr_pred)
    accuracy_score(y_test, te_pred)

    # grid score reports: out-of-range percentiles must be rejected
    for bad_percentile in (0.0, 1.0):
        assert_fails(report_grid_score_detail, ValueError,
                     random_search=search, percentile=bad_percentile)

    # ... and so must an unknown y_axis
    assert_fails(report_grid_score_detail, ValueError,
                 random_search=search, y_axis='bad_axis')

    # a valid percentile passes
    report_grid_score_detail(search, charts=True, percentile=0.95)
dmonscilearnclassification.py 文件源码 项目:dmon-adp 作者: igabriel85 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def adaBoost(self, settings, data=None, dropna=True):
    """Cross-validate and fit an AdaBoost classifier on the loaded data.

    The last column of the loaded frame is used as the target and all
    other columns as features.  The splitter, the mean 10-fold
    cross-validation score and the score on the full data are printed.

    :param settings: not used by this method
    :param data: forwarded to the private data loader
    :param dropna: forwarded to the private data loader
    :return: always True
    """
    df = self.__loadData(data, dropna)
    features = df.columns[:-1]
    X = df[features]
    y = df.iloc[:, -1].values
    seed = 7
    num_trees = 500
    # No random_state here: without shuffle=True it never influenced the
    # (sequential) folds, and current scikit-learn rejects the combination.
    kfold = model_selection.KFold(n_splits=10)
    # print() with a single argument behaves the same on Python 2 and 3
    print(kfold)
    model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
    results = model_selection.cross_val_score(model, X, y, cv=kfold)
    model.fit(X, y)
    print(results.mean())
    print(model.score(X, y))
    return True
exp_linear.py 文件源码 项目:marseille 作者: vene 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def saga_decision_function(dataset, k, link_alpha, prop_alpha, l1_ratio):
    """Fit the linear baseline on fold ``k`` and score its validation docs.

    The ``(Y_marg, baseline)`` pair is cached with dill under a file name
    derived from the arguments; when that file already exists, its content
    is returned instead of recomputing.
    """
    cache_key = (dataset, k, link_alpha, prop_alpha, l1_ratio)
    fn = cache_fname("linear_val_df", cache_key)

    if os.path.exists(fn):
        logging.info("Loading {}".format(fn))
        with open(fn, "rb") as f:
            return dill.load(f)

    # on-disk directory name differs from the dataset name
    ds = 'ukp-essays' if dataset != 'cdcp' else 'erule'
    path = os.path.join("data", "process", ds, "folds", "{}", "{}")

    def fold_file(basename):
        # path of a per-fold file for the requested fold k
        return path.format(k, basename)

    # recover the validation documents of fold k from the K-fold split
    n_folds = 3 if dataset != 'ukp' else 5
    load, ids = get_dataset_loader(dataset, "train")
    for fold_no, (_, val_idx) in enumerate(KFold(n_folds).split(ids)):
        if fold_no == k:
            break
    val_docs = list(load(ids[val_idx]))

    X_tr_link, y_tr_link = load_csr(fold_file('train.npz'), return_y=True)
    X_te_link, y_te_link = load_csr(fold_file('val.npz'), return_y=True)

    X_tr_prop, y_tr_prop = load_csr(fold_file('prop-train.npz'), return_y=True)
    X_te_prop, y_te_prop = load_csr(fold_file('prop-val.npz'), return_y=True)

    baseline = BaselineStruct(link_alpha, prop_alpha, l1_ratio)
    baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)

    Y_marg = baseline.decision_function(X_te_link, X_te_prop, val_docs)

    result = (Y_marg, baseline)
    with open(fn, "wb") as f:
        logging.info("Saving {}".format(fn))
        dill.dump(result, f)

    return Y_marg, baseline
exp_linear.py 文件源码 项目:marseille 作者: vene 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def linear_cv_score(dataset, alpha, l1_ratio, constraints):
    """Return the per-fold scores of the linear baseline, cached on disk.

    The same ``alpha`` is used for both the link and the proposition model.
    Scores are stored with dill under a file name derived from the
    arguments and reloaded from there when the file exists.
    """
    cache_key = (dataset, alpha, l1_ratio, constraints)
    fn = cache_fname("linear_cv_score", cache_key)
    if os.path.exists(fn):
        logging.info("Loading {}".format(fn))
        with open(fn, "rb") as f:
            return dill.load(f)

    load, ids = get_dataset_loader(dataset, split="train")
    n_folds = 3 if dataset != 'ukp' else 5

    def score_fold(k, val):
        # decode the validation docs of fold k and score them
        Y_marg, bl = saga_decision_function(dataset, k, alpha, alpha, l1_ratio)
        val_docs = list(load(ids[val]))
        Y_true = [doc.label for doc in val_docs]
        Y_pred = bl.fast_decode(Y_marg, val_docs, constraints)
        return bl._score(Y_true, Y_pred)

    scores = [score_fold(k, val)
              for k, (_, val) in enumerate(KFold(n_folds).split(ids))]

    with open(fn, "wb") as f:
        logging.info("Saving {}".format(fn))
        dill.dump(scores, f)
    return scores
exp_svmstruct.py 文件源码 项目:marseille 作者: vene 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def svmstruct_cv_score(dataset, C, class_weight, constraints,
                       compat_features, second_order_features):
    """Return per-fold scores and all validation predictions, cached on disk.

    The ``(scores, all_Y_pred)`` pair is stored with dill under a file name
    derived from the arguments and reloaded from there when the file exists.
    """
    cache_key = (dataset, C, class_weight, constraints, compat_features,
                 second_order_features)
    fn = cache_fname("svmstruct_cv_score", cache_key)

    if os.path.exists(fn):
        logging.info("Cached file already exists.")
        with open(fn, "rb") as f:
            return dill.load(f)

    load, ids = get_dataset_loader(dataset, split="train")

    n_folds = 3 if dataset != 'ukp' else 5

    # which second-order feature families are switched on (logical and)
    grandparents = second_order_features and dataset == 'ukp'
    coparents = second_order_features
    siblings = second_order_features and dataset == 'cdcp'

    scores, all_Y_pred = [], []

    for tr, val in KFold(n_folds).split(ids):
        train_docs = list(load(ids[tr]))
        val_docs = list(load(ids[val]))

        clf, Y_val, Y_pred = fit_predict(
            train_docs, val_docs, dataset, C, class_weight, constraints,
            compat_features, second_order_features, grandparents, coparents,
            siblings)
        all_Y_pred.extend(Y_pred)
        scores.append(clf.model._score(Y_val, Y_pred))

    with open(fn, "wb") as f:
        dill.dump((scores, all_Y_pred), f)

    return scores, all_Y_pred
build_model.py 文件源码 项目:MLClass 作者: bm2-lab 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def split_kfold_r(y):
    """Return the list of (train indices, test indices) pairs of a 5-fold split of ``y``."""
    return [(train_idx, test_idx) for train_idx, test_idx in KFold(5).split(y)]
splitutils.py 文件源码 项目:Y8M 作者: mpekalski 项目源码 文件源码 阅读 15 收藏 0 点赞 0 评论 0
def split_fold(in_pattern, rettrain=True, fold=0, cvs=5, include_vlaidation=True, split_seed=0):
    """
    Pick the files of one cross-validation fold among those matching in_pattern
    :param in_pattern: string of tfrecord patterns
    :param rettrain: return training set (True) or leave out set (False)
    :param fold: which fold to process
    :param cvs: how many folds you want
    :param include_vlaidation: also append every selected name with 'train'
        replaced by 'validate'
    :param split_seed: when positive, the split is shuffled with this seed
    :return: subset of tfrecords
    """
    assert fold < cvs

    files = gfile.Glob(in_pattern)

    # a non-positive seed means a plain, unshuffled split
    kfold_options = {'n_splits': cvs}
    if split_seed > 0:
        kfold_options.update(shuffle=True, random_state=split_seed)
    kf = KFold(**kfold_options)

    # advance through the splits until the requested fold is reached
    for fold_no, (train, test) in enumerate(kf.split(files)):
        if fold_no == fold:
            break

    selected = train if rettrain else test
    retfiles = list(np.array(files)[selected])

    if include_vlaidation:
        retfiles += [fname.replace('train', 'validate') for fname in retfiles]

    return retfiles
facenet.py 文件源码 项目:faceNet_RealTime 作者: jack55436001 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10):
    """Return mean TPR and FPR per threshold, and the accuracy of each fold.

    Distances are squared Euclidean distances between paired embeddings.
    For every fold, the threshold with the highest training accuracy is used
    to compute that fold's test accuracy.
    """
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    splitter = KFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds, nrof_thresholds))
    fprs = np.zeros((nrof_folds, nrof_thresholds))
    accuracy = np.zeros((nrof_folds))

    dist = np.sum(np.square(np.subtract(embeddings1, embeddings2)), 1)
    pair_indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(splitter.split(pair_indices)):
        # training accuracy at every threshold picks the fold's best threshold
        acc_train = np.zeros((nrof_thresholds))
        for t_idx, t in enumerate(thresholds):
            _, _, acc_train[t_idx] = calculate_accuracy(
                t, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)

        # test-set TPR/FPR at every threshold
        for t_idx, t in enumerate(thresholds):
            fold_tpr, fold_fpr, _ = calculate_accuracy(
                t, dist[test_set], actual_issame[test_set])
            tprs[fold_idx, t_idx] = fold_tpr
            fprs[fold_idx, t_idx] = fold_fpr

        # test-set accuracy at the best training threshold
        _, _, accuracy[fold_idx] = calculate_accuracy(
            thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])

    return np.mean(tprs, 0), np.mean(fprs, 0), accuracy
facenet.py 文件源码 项目:faceNet_RealTime 作者: jack55436001 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    """Return mean VAL, std of VAL and mean FAR over the folds.

    Distances are squared Euclidean distances between paired embeddings.
    For every fold, the threshold is interpolated from the training FARs so
    that FAR = far_target; when no training FAR reaches the target, 0.0 is
    used as threshold.
    """
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    splitter = KFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    dist = np.sum(np.square(np.subtract(embeddings1, embeddings2)), 1)
    pair_indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(splitter.split(pair_indices)):
        # training FAR at every candidate threshold
        far_train = np.zeros(nrof_thresholds)
        for t_idx, t in enumerate(thresholds):
            _, far_train[t_idx] = calculate_val_far(
                t, dist[train_set], actual_issame[train_set])

        # threshold giving FAR = far_target, or 0.0 when the target is out of reach
        threshold = 0.0
        if np.max(far_train) >= far_target:
            far_to_threshold = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = far_to_threshold(far_target)

        val[fold_idx], far[fold_idx] = calculate_val_far(
            threshold, dist[test_set], actual_issame[test_set])

    return np.mean(val), np.std(val), np.mean(far)
train_and_test.py 文件源码 项目:Guess-Genre-By-Lyrics 作者: ormatt 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def test_using_kfold(X, y, clf, splits=5):
    """Fit and score ``clf`` on each shuffled K-fold split, then persist it.

    Per-fold scores and their mean are logged; the classifier (as fitted on
    the last fold) is dumped and returned.
    """
    splitter = KFold(n_splits=splits, shuffle=True)

    fold_scores = []
    for fold_no, (train, test) in enumerate(splitter.split(X, y), start=1):
        logger.info("Fitting and transforming the model on one fold")
        clf.fit(X[train], y[train])
        fold_score = clf.score(X[test], y[test])
        logger.info("[Fold {0}] score: {1:.5f}".format(fold_no, fold_score))
        fold_scores.append(fold_score)

    utils.persistence.dump(CLF_KFOLD_DUMP_NAME, clf)
    logger.info("Score: {}".format(np.mean(fold_scores)))
    return clf
test_utils.py 文件源码 项目:tf_face 作者: ZhijianChan 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, num_folds=10):
    """Calculate TPR and FPR under different threshold, accuracy under the best threshold"""
    assert (embeddings1.shape[0] == embeddings2.shape[0])
    assert (embeddings1.shape[1] == embeddings2.shape[1])
    num_pairs = min(len(actual_issame), embeddings1.shape[0])
    num_threshold = len(thresholds)
    splitter = KFold(n_splits=num_folds, shuffle=False)

    tprs = np.zeros((num_folds, num_threshold))
    fprs = np.zeros((num_folds, num_threshold))
    acc = np.zeros((num_folds))

    # squared Euclidean distance between paired embeddings
    dist = np.sum(np.square(np.subtract(embeddings1, embeddings2)), 1)
    pair_indices = np.arange(num_pairs)

    for fold_id, (train_set, test_set) in enumerate(splitter.split(pair_indices)):
        # training accuracy at every threshold picks the best one
        acc_train = np.zeros((num_threshold))
        for t_id, t in enumerate(thresholds):
            _, _, acc_train[t_id] = calculate_acc(
                t, dist[train_set], actual_issame[train_set])
        best_id = np.argmax(acc_train)

        # TPR and FPR on the test set at every threshold
        for t_id, t in enumerate(thresholds):
            fold_tpr, fold_fpr, _ = calculate_acc(
                t, dist[test_set], actual_issame[test_set])
            tprs[fold_id, t_id] = fold_tpr
            fprs[fold_id, t_id] = fold_fpr

        # test accuracy at the best training threshold
        _, _, acc[fold_id] = calculate_acc(
            thresholds[best_id], dist[test_set], actual_issame[test_set])

    # averages over folds, one value per threshold
    return np.mean(tprs, 0), np.mean(fprs, 0), acc


问题


面经


文章

微信
公众号

扫码关注公众号