def fit(self, X, y=None, groups=None):
    """Run fit on the estimator with randomly drawn parameters.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape=(n_samples,) or (n_samples, n_output), optional (default=None)
        Target relative to X for classification or regression;
        None for unsupervised learning.
    groups : array-like, shape=(n_samples,), optional (default=None)
        Group labels for the samples used while splitting the dataset into
        train/test set.
    """
    return super(RandomizedSearchCV, self).fit(X, _as_numpy(y), groups)
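# A minimal, hedged sketch of how the `groups` argument is typically used with
# RandomizedSearchCV. The synthetic data, GroupKFold splitter and SVC estimator
# below are illustrative assumptions, not taken from the class above.
import numpy as np
from scipy.stats import expon
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.svm import SVC

X_demo = np.random.rand(60, 4)
y_demo = np.random.randint(0, 2, 60)
groups_demo = np.repeat(np.arange(6), 10)   # six groups of ten samples each

search = RandomizedSearchCV(SVC(), {'C': expon(scale=10)},
                            n_iter=5, cv=GroupKFold(n_splits=3))
search.fit(X_demo, y_demo, groups=groups_demo)  # groups are forwarded to the CV splitter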
def fit(self, X, *args, **kwargs):
    # Wrap the underlying model in a grid or randomized search if one is configured.
    if self._grid_search:
        model = GridSearchCV(self._model, **self._grid_search)
    elif self._random_search:
        model = RandomizedSearchCV(self._model, **self._random_search)
    else:
        model = self._model
    if self._grid_search is not None:
        self._grid = model
    elif self._random_search is not None:
        self._rnd = model
    assert (self.target in X.columns.values), 'X must contain the target column'
    # Use every column except the target (and any explicitly excluded columns) as features.
    self._xcols = list(X.columns.values)
    self._xcols.remove(self.target)
    if len(self._columns_exclude) == 0 and len(self._columns_include) > 0:
        self._columns_exclude = list(set(self._xcols) - set(self._columns_include))
    for col in self._columns_exclude:
        self._xcols.remove(col)
    x = X[self._xcols]
    y = X[self.target]
    model.fit(x, y, **kwargs)
    return self
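# A hedged sketch of what the `_random_search` dict above might contain: its keys
# are simply keyword arguments splatted into
# RandomizedSearchCV(self._model, **self._random_search). The specific
# distributions and settings are illustrative assumptions, not taken from the source.
from scipy.stats import randint

random_search_kwargs = {
    'param_distributions': {'max_depth': randint(2, 8),
                            'n_estimators': randint(50, 200)},
    'n_iter': 20,
    'cv': 5,
    'scoring': 'accuracy',
}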
def test_RandomizedSearchCV():
    '''
    Use RandomizedSearchCV with LogisticRegression to tune C and multi_class.
    :return: None
    '''
    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                        test_size=0.25, random_state=0,
                                                        stratify=digits.target)
    tuned_parameters = {'C': scipy.stats.expon(scale=100),
                        'multi_class': ['ovr', 'multinomial']}
    clf = RandomizedSearchCV(LogisticRegression(penalty='l2', solver='lbfgs', tol=1e-6),
                             tuned_parameters, cv=10, scoring="accuracy", n_iter=100)
    clf.fit(X_train, y_train)
    print("Best parameters set found:", clf.best_params_)
    print("Randomized Grid scores:")
    for params, mean_score, scores in clf.grid_scores_:
        print("\t%0.3f (+/-%0.03f) for %s" % (mean_score, scores.std() * 2, params))
    print("Optimized Score:", clf.score(X_test, y_test))
    print("Detailed classification report:")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
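# Note: grid_scores_ was removed in scikit-learn 0.20. A hedged sketch of the
# equivalent per-candidate report using the cv_results_ dict (available since
# scikit-learn 0.18); `search` stands for a fitted RandomizedSearchCV such as
# `clf` in the test above.
def report_cv_results(search):
    for mean, std, params in zip(search.cv_results_['mean_test_score'],
                                 search.cv_results_['std_test_score'],
                                 search.cv_results_['params']):
        print("\t%0.3f (+/-%0.03f) for %s" % (mean, std * 2, params))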
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random data
    in order to assert that the test error will far exceed the train error.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1,
                              cv=custom_cv, random_state=42)

    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})

    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})

    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
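# A hedged aside on the `step__param` keys used in `hp` above: nested search
# parameters address pipeline steps by name, i.e. <step name>__<parameter name>.
# A plain-scikit-learn sketch (StandardScaler and RandomForestClassifier only;
# the Selective* transformers above are project-specific and not assumed here):
from scipy.stats import randint
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

plain_pipe = Pipeline([('scaler', StandardScaler()),
                       ('rf', RandomForestClassifier())])
plain_hp = {'rf__n_estimators': randint(5, 10),   # targets the 'rf' step's n_estimators
            'rf__max_depth': randint(5, 15)}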
def fit(self, X, Y):
    """ Train classifier.

    Parameters
    ----------
    X : np.array [n_samples, n_features]
        Training features.
    Y : np.array [n_samples]
        Training labels.
    """
    x_shuffle, y_shuffle = shuffle(X, Y, random_state=self.random_state)
    clf_cv = RFC(n_estimators=self.n_estimators, n_jobs=self.n_jobs,
                 class_weight=self.class_weight,
                 random_state=self.random_state)
    param_dist = {
        "max_depth": sp_randint(1, 101),
        "max_features": [None, 'auto', 'sqrt', 'log2'],
        "min_samples_split": sp_randint(2, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }
    random_search = RandomizedSearchCV(
        clf_cv, param_distributions=param_dist, refit=True,
        n_iter=self.n_iter_search, scoring='f1_weighted',
        random_state=self.random_state
    )
    random_search.fit(x_shuffle, y_shuffle)
    self.clf = random_search.best_estimator_
def svc_model(self, X, y):
    X, y = shuffle(X, y, random_state=1337)
    svc = SVC(kernel='rbf', cache_size=self.cache_size, verbose=True)
    clf = RandomizedSearchCV(svc, param_distributions=self.params, n_iter=self.iters,
                             n_jobs=-1, verbose=self.verbose)
    model = clf.fit(X[0:self.sample_size], y[0:self.sample_size])
    logging.info('Best Params ' + str(model.best_params_))
    logging.info('Best Score ' + str(model.best_score_))
    return model.best_estimator_
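# One plausible shape for `self.params` above (an assumption, not taken from the
# source): exponential distributions for C and gamma, the usual RBF-SVC knobs.
import scipy.stats

svc_param_distributions = {
    'C': scipy.stats.expon(scale=100),
    'gamma': scipy.stats.expon(scale=0.1),
}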
def get_algorithm(estimator,
                  scoring_metric,
                  hyperparameter_grid,
                  randomized_search,
                  number_iteration_samples=10,
                  **non_randomized_estimator_kwargs):
    """
    Given an estimator and various params, initialize an algorithm with optional randomized search.

    Args:
        estimator (sklearn.base.BaseEstimator): a scikit-learn estimator (for example: KNeighborsClassifier)
        scoring_metric (str): The scoring metric to optimize for when using random search. See
            http://scikit-learn.org/stable/modules/model_evaluation.html
        hyperparameter_grid (dict): An object containing key value pairs of the specific hyperparameter space to
            search through.
        randomized_search (bool): Whether the method should return a randomized search estimator (as opposed to a
            simple algorithm).
        number_iteration_samples (int): If performing randomized search, the number of parameter settings sampled
            from the hyperparameter space. Higher values are slower but tend to give better results, since the
            true optimum is more likely to be sampled.
        **non_randomized_estimator_kwargs: Keyword arguments that you can pass directly to the algorithm. Only used
            when randomized_search is False.

    Returns:
        sklearn.base.BaseEstimator: a scikit learn algorithm ready to `.fit()`
    """
    if randomized_search:
        algorithm = RandomizedSearchCV(estimator=estimator(),
                                       scoring=scoring_metric,
                                       param_distributions=hyperparameter_grid,
                                       n_iter=number_iteration_samples,
                                       verbose=0,
                                       n_jobs=1)
    else:
        algorithm = estimator(**non_randomized_estimator_kwargs)
    return algorithm
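# A hedged usage sketch of get_algorithm; KNeighborsClassifier and the grid
# below are illustrative choices, not taken from the surrounding project.
from sklearn.neighbors import KNeighborsClassifier

algorithm = get_algorithm(KNeighborsClassifier,
                          scoring_metric='roc_auc',
                          hyperparameter_grid={'n_neighbors': list(range(5, 26))},
                          randomized_search=True,
                          number_iteration_samples=10)
# algorithm is now a RandomizedSearchCV instance ready for .fit(X, y)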
def test_trivial_grid_scores():
    # Test search over a "grid" with only one point.
    # Non-regression test: grid_scores_ wouldn't be set by GridSearchCV.
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1]})
    grid_search.fit(X, y)
    assert_true(hasattr(grid_search, "grid_scores_"))

    random_search = RandomizedSearchCV(clf, {'foo_param': [0]}, n_iter=1)
    random_search.fit(X, y)
    assert_true(hasattr(random_search, "grid_scores_"))
def test_randomized_search_grid_scores():
    # Make a dataset with a lot of noise to get various kind of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200, n_features=100, n_informative=3,
                               random_state=0)

    # XXX: as of today (scipy 0.12) it's not possible to set the random seed
    # of scipy.stats distributions: the assertions in this test should thus
    # not depend on the randomization
    params = dict(C=expon(scale=10),
                  gamma=expon(scale=0.1))
    n_cv_iter = 3
    n_search_iter = 30
    search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter,
                                param_distributions=params, iid=False)
    search.fit(X, y)
    assert_equal(len(search.grid_scores_), n_search_iter)

    # Check consistency of the structure of each cv_score item
    for cv_score in search.grid_scores_:
        assert_equal(len(cv_score.cv_validation_scores), n_cv_iter)
        # Because we set iid to False, the mean_validation score is the
        # mean of the fold mean scores instead of the aggregate sample-wise
        # mean score
        assert_almost_equal(np.mean(cv_score.cv_validation_scores),
                            cv_score.mean_validation_score)
        assert_equal(list(sorted(cv_score.parameters.keys())),
                     list(sorted(params.keys())))

    # Check the consistency with the best_score_ and best_params_ attributes
    sorted_grid_scores = list(sorted(search.grid_scores_,
                                     key=lambda x: x.mean_validation_score))
    best_score = sorted_grid_scores[-1].mean_validation_score
    assert_equal(search.best_score_, best_score)

    tied_best_params = [s.parameters for s in sorted_grid_scores
                        if s.mean_validation_score == best_score]
    assert_true(search.best_params_ in tied_best_params,
                "best_params_={0} is not part of the"
                " tied best models: {1}".format(
                    search.best_params_, tied_best_params))
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator
    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)
    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(random_state=0)
    estimators = [DecisionTreeRegressor(random_state=0),
                  DecisionTreeClassifier(random_state=0)]

    # Test with grid search cv
    for est in estimators:
        grid_search = GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        for parameters, _, cv_validation_scores in grid_search.grid_scores_:
            est.set_params(**parameters)
            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score,
                                    cv_validation_scores[i])

    # Test with a randomized search
    for est in estimators:
        random_search = RandomizedSearchCV(est, est_parameters,
                                           cv=cv, n_iter=3)
        random_search.fit(X, y)
        for parameters, _, cv_validation_scores in random_search.grid_scores_:
            est.set_params(**parameters)
            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score,
                                    cv_validation_scores[i])
def tune_xgb_params_randomized(estimator_cls,
                               label: np.ndarray,
                               metric_sklearn: str,
                               n_jobs: int,
                               params: dict,
                               strat_folds: StratifiedKFold,
                               train: np.ndarray,
                               n_iter: int = 20,
                               verbosity_level: int = 10,
                               **kwargs):
    """
    :param estimator_cls:
        The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's RandomizedSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html
        for the options this can take - e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs:
        The number of jobs to run simultaneously.
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A StratifiedKFold object to cross validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :param n_iter:
        An optional parameter to control the number of parameter settings that are sampled.
    :param verbosity_level:
        An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :param kwargs:
        Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly between 0.5
        and 0.7 for colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
    :return:
        A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)
    param_distributions = {
        'colsample_bytree': uniform(kwargs.get('colsample_bytree_loc', 0.2), kwargs.get('colsample_bytree_scale', 0.8)),
        'gamma': uniform(kwargs.get('gamma_loc', 0), kwargs.get('gamma_scale', 0.9)),
        'max_depth': sp_randint(kwargs.get('max_depth_low', 2), kwargs.get('max_depth_high', 11)),
        'min_child_weight': sp_randint(kwargs.get('min_child_weight_low', 1), kwargs.get('min_child_weight_high', 11)),
        'reg_alpha': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        'reg_lambda': halfnorm(kwargs.get('reg_lambda_loc', 0), kwargs.get('reg_lambda_scale', 5)),
        'subsample': uniform(kwargs.get('subsample_loc', 0.2), kwargs.get('subsample_scale', 0.8))
    }
    rand_search = RandomizedSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_iter=n_iter,
        n_jobs=n_jobs,
        param_distributions=param_distributions,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    rand_search.fit(train, label)
    return rand_search.best_params_, [(rand_search.best_params_, rand_search.best_score_)]
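# A hedged usage sketch of tune_xgb_params_randomized; xgboost's XGBClassifier,
# the base params dict and the synthetic data are assumptions about the caller,
# not taken from the surrounding module.
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

X_xgb, y_xgb = make_classification(n_samples=500, n_features=20, random_state=0)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
best_params, history = tune_xgb_params_randomized(
    XGBClassifier,
    label=y_xgb, metric_sklearn='roc_auc', n_jobs=4,
    params={'objective': 'binary:logistic'},
    strat_folds=folds, train=X_xgb, n_iter=30,
    max_depth_low=3, max_depth_high=9)   # kwargs narrow the max_depth range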
def test_pickle():
    # Test that a fit search can be pickled
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True)
    grid_search.fit(X, y)
    pickle.dumps(grid_search)  # smoke test

    random_search = RandomizedSearchCV(clf, {'foo_param': [1, 2, 3]},
                                       refit=True, n_iter=3)
    random_search.fit(X, y)
    pickle.dumps(random_search)  # smoke test