def fit(self, X, y=None, groups=None):
    """Run fit on the estimator with randomly drawn parameters.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape=(n_samples,) or (n_samples, n_output), optional (default=None)
        Target relative to X for classification or regression;
        None for unsupervised learning.
    groups : array-like, shape=(n_samples,), optional (default=None)
        Group labels for the samples used while splitting the dataset into
        train/test set.
    """
    return super(RandomizedSearchCV, self).fit(X, _as_numpy(y), groups)
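# A minimal, hedged sketch of how the `groups` argument is typically used with
# RandomizedSearchCV. The synthetic data, GroupKFold splitter and SVC estimator
# below are illustrative assumptions, not taken from the class above.
import numpy as np
from scipy.stats import expon
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.svm import SVC

X_demo = np.random.rand(60, 4)
y_demo = np.random.randint(0, 2, 60)
groups_demo = np.repeat(np.arange(6), 10)   # six groups of ten samples each

search = RandomizedSearchCV(SVC(), {'C': expon(scale=10)},
                            n_iter=5, cv=GroupKFold(n_splits=3))
search.fit(X_demo, y_demo, groups=groups_demo)  # groups are forwarded to the CV splitter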
def fit(self, X, *args, **kwargs):
    # Wrap the underlying model in a grid or randomized search if one is configured.
    if self._grid_search:
        model = GridSearchCV(self._model, **self._grid_search)
    elif self._random_search:
        model = RandomizedSearchCV(self._model, **self._random_search)
    else:
        model = self._model
    if self._grid_search is not None:
        self._grid = model
    elif self._random_search is not None:
        self._rnd = model
    assert (self.target in X.columns.values), 'X must contain the target column'
    # Use every column except the target (and any explicitly excluded columns) as features.
    self._xcols = list(X.columns.values)
    self._xcols.remove(self.target)
    if len(self._columns_exclude) == 0 and len(self._columns_include) > 0:
        self._columns_exclude = list(set(self._xcols) - set(self._columns_include))
    for col in self._columns_exclude:
        self._xcols.remove(col)
    x = X[self._xcols]
    y = X[self.target]
    model.fit(x, y, **kwargs)
    return self
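# A hedged sketch of what the `_random_search` dict above might contain: its keys
# are simply keyword arguments splatted into
# RandomizedSearchCV(self._model, **self._random_search). The specific
# distributions and settings are illustrative assumptions, not taken from the source.
from scipy.stats import randint

random_search_kwargs = {
    'param_distributions': {'max_depth': randint(2, 8),
                            'n_estimators': randint(50, 200)},
    'n_iter': 20,
    'cv': 5,
    'scoring': 'accuracy',
}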
def test_RandomizedSearchCV():
    '''
    Use RandomizedSearchCV with LogisticRegression to tune C and multi_class.
    :return: None
    '''
    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                        test_size=0.25, random_state=0,
                                                        stratify=digits.target)
    tuned_parameters = {'C': scipy.stats.expon(scale=100),
                        'multi_class': ['ovr', 'multinomial']}
    clf = RandomizedSearchCV(LogisticRegression(penalty='l2', solver='lbfgs', tol=1e-6),
                             tuned_parameters, cv=10, scoring="accuracy", n_iter=100)
    clf.fit(X_train, y_train)
    print("Best parameters set found:", clf.best_params_)
    print("Randomized Grid scores:")
    for params, mean_score, scores in clf.grid_scores_:
        print("\t%0.3f (+/-%0.03f) for %s" % (mean_score, scores.std() * 2, params))
    print("Optimized Score:", clf.score(X_test, y_test))
    print("Detailed classification report:")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
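# Note: grid_scores_ was removed in scikit-learn 0.20. A hedged sketch of the
# equivalent per-candidate report using the cv_results_ dict (available since
# scikit-learn 0.18); `search` stands for a fitted RandomizedSearchCV such as
# `clf` in the test above.
def report_cv_results(search):
    for mean, std, params in zip(search.cv_results_['mean_test_score'],
                                 search.cv_results_['std_test_score'],
                                 search.cv_results_['params']):
        print("\t%0.3f (+/-%0.03f) for %s" % (mean, std * 2, params))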
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random data
    in order to assert that the test error will far exceed the train error.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1,
                              cv=custom_cv, random_state=42)

    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})

    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})

    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
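# A hedged aside on the `step__param` keys used in `hp` above: nested search
# parameters address pipeline steps by name, i.e. <step name>__<parameter name>.
# A plain-scikit-learn sketch (StandardScaler and RandomForestClassifier only;
# the Selective* transformers above are project-specific and not assumed here):
from scipy.stats import randint
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

plain_pipe = Pipeline([('scaler', StandardScaler()),
                       ('rf', RandomForestClassifier())])
plain_hp = {'rf__n_estimators': randint(5, 10),   # targets the 'rf' step's n_estimators
            'rf__max_depth': randint(5, 15)}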
def fit(self, X, Y):
    """ Train classifier.

    Parameters
    ----------
    X : np.array [n_samples, n_features]
        Training features.
    Y : np.array [n_samples]
        Training labels.
    """
    x_shuffle, y_shuffle = shuffle(X, Y, random_state=self.random_state)
    clf_cv = RFC(n_estimators=self.n_estimators, n_jobs=self.n_jobs,
                 class_weight=self.class_weight,
                 random_state=self.random_state)
    param_dist = {
        "max_depth": sp_randint(1, 101),
        "max_features": [None, 'auto', 'sqrt', 'log2'],
        "min_samples_split": sp_randint(2, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }
    random_search = RandomizedSearchCV(
        clf_cv, param_distributions=param_dist, refit=True,
        n_iter=self.n_iter_search, scoring='f1_weighted',
        random_state=self.random_state
    )
    random_search.fit(x_shuffle, y_shuffle)
    self.clf = random_search.best_estimator_
def svc_model(self, X, y):
    X, y = shuffle(X, y, random_state=1337)
    svc = SVC(kernel='rbf', cache_size=self.cache_size, verbose=True)
    clf = RandomizedSearchCV(svc, param_distributions=self.params, n_iter=self.iters,
                             n_jobs=-1, verbose=self.verbose)
    model = clf.fit(X[0:self.sample_size], y[0:self.sample_size])
    logging.info('Best Params ' + str(model.best_params_))
    logging.info('Best Score ' + str(model.best_score_))
    return model.best_estimator_
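# One plausible shape for `self.params` above (an assumption, not taken from the
# source): exponential distributions for C and gamma, the usual RBF-SVC knobs.
import scipy.stats

svc_param_distributions = {
    'C': scipy.stats.expon(scale=100),
    'gamma': scipy.stats.expon(scale=0.1),
}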
def get_algorithm(estimator,
                  scoring_metric,
                  hyperparameter_grid,
                  randomized_search,
                  number_iteration_samples=10,
                  **non_randomized_estimator_kwargs):
    """
    Given an estimator and various params, initialize an algorithm with optional randomized search.

    Args:
        estimator (sklearn.base.BaseEstimator): a scikit-learn estimator (for example: KNeighborsClassifier)
        scoring_metric (str): The scoring metric to optimize for when using random search. See
            http://scikit-learn.org/stable/modules/model_evaluation.html
        hyperparameter_grid (dict): An object containing key value pairs of the specific hyperparameter space to
            search through.
        randomized_search (bool): Whether the method should return a randomized search estimator (as opposed to a
            simple algorithm).
        number_iteration_samples (int): If performing randomized search, the number of parameter settings sampled
            from the hyperparameter space. Higher values are slower but tend to give better results, since the
            true optimum is more likely to be sampled.
        **non_randomized_estimator_kwargs: Keyword arguments that you can pass directly to the algorithm. Only used
            when randomized_search is False.

    Returns:
        sklearn.base.BaseEstimator: a scikit learn algorithm ready to `.fit()`
    """
    if randomized_search:
        algorithm = RandomizedSearchCV(estimator=estimator(),
                                       scoring=scoring_metric,
                                       param_distributions=hyperparameter_grid,
                                       n_iter=number_iteration_samples,
                                       verbose=0,
                                       n_jobs=1)
    else:
        algorithm = estimator(**non_randomized_estimator_kwargs)
    return algorithm
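# A hedged usage sketch of get_algorithm; KNeighborsClassifier and the grid
# below are illustrative choices, not taken from the surrounding project.
from sklearn.neighbors import KNeighborsClassifier

algorithm = get_algorithm(KNeighborsClassifier,
                          scoring_metric='roc_auc',
                          hyperparameter_grid={'n_neighbors': list(range(5, 26))},
                          randomized_search=True,
                          number_iteration_samples=10)
# algorithm is now a RandomizedSearchCV instance ready for .fit(X, y)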
def test_trivial_grid_scores():
    # Test search over a "grid" with only one point.
    # Non-regression test: grid_scores_ wouldn't be set by GridSearchCV.
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1]})
    grid_search.fit(X, y)
    assert_true(hasattr(grid_search, "grid_scores_"))

    random_search = RandomizedSearchCV(clf, {'foo_param': [0]}, n_iter=1)
    random_search.fit(X, y)
    assert_true(hasattr(random_search, "grid_scores_"))
def test_randomized_search_grid_scores():
    # Make a dataset with a lot of noise to get various kind of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200, n_features=100, n_informative=3,
                               random_state=0)

    # XXX: as of today (scipy 0.12) it's not possible to set the random seed
    # of scipy.stats distributions: the assertions in this test should thus
    # not depend on the randomization
    params = dict(C=expon(scale=10),
                  gamma=expon(scale=0.1))
    n_cv_iter = 3
    n_search_iter = 30
    search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter,
                                param_distributions=params, iid=False)
    search.fit(X, y)
    assert_equal(len(search.grid_scores_), n_search_iter)

    # Check consistency of the structure of each cv_score item
    for cv_score in search.grid_scores_:
        assert_equal(len(cv_score.cv_validation_scores), n_cv_iter)
        # Because we set iid to False, the mean_validation score is the
        # mean of the fold mean scores instead of the aggregate sample-wise
        # mean score
        assert_almost_equal(np.mean(cv_score.cv_validation_scores),
                            cv_score.mean_validation_score)
        assert_equal(list(sorted(cv_score.parameters.keys())),
                     list(sorted(params.keys())))

    # Check the consistency with the best_score_ and best_params_ attributes
    sorted_grid_scores = list(sorted(search.grid_scores_,
                                     key=lambda x: x.mean_validation_score))
    best_score = sorted_grid_scores[-1].mean_validation_score
    assert_equal(search.best_score_, best_score)

    tied_best_params = [s.parameters for s in sorted_grid_scores
                        if s.mean_validation_score == best_score]
    assert_true(search.best_params_ in tied_best_params,
                "best_params_={0} is not part of the"
                " tied best models: {1}".format(
                    search.best_params_, tied_best_params))
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator
    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)
    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(random_state=0)
    estimators = [DecisionTreeRegressor(random_state=0),
                  DecisionTreeClassifier(random_state=0)]

    # Test with grid search cv
    for est in estimators:
        grid_search = GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        for parameters, _, cv_validation_scores in grid_search.grid_scores_:
            est.set_params(**parameters)
            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score,
                                    cv_validation_scores[i])

    # Test with a randomized search
    for est in estimators:
        random_search = RandomizedSearchCV(est, est_parameters,
                                           cv=cv, n_iter=3)
        random_search.fit(X, y)
        for parameters, _, cv_validation_scores in random_search.grid_scores_:
            est.set_params(**parameters)
            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score,
                                    cv_validation_scores[i])
def tune_xgb_params_randomized(estimator_cls,
                               label: np.ndarray,
                               metric_sklearn: str,
                               n_jobs: int,
                               params: dict,
                               strat_folds: StratifiedKFold,
                               train: np.ndarray,
                               n_iter: int = 20,
                               verbosity_level: int = 10,
                               **kwargs):
    """
    :param estimator_cls:
        The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's RandomizedSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html
        for the options this can take - e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs:
        The number of jobs to run simultaneously.
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A StratifiedKFold object to cross validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :param n_iter:
        An optional parameter to control the number of parameter settings that are sampled.
    :param verbosity_level:
        An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :param kwargs:
        Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly between 0.5
        and 0.7 for colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
    :return:
        A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)
    param_distributions = {
        'colsample_bytree': uniform(kwargs.get('colsample_bytree_loc', 0.2), kwargs.get('colsample_bytree_scale', 0.8)),
        'gamma': uniform(kwargs.get('gamma_loc', 0), kwargs.get('gamma_scale', 0.9)),
        'max_depth': sp_randint(kwargs.get('max_depth_low', 2), kwargs.get('max_depth_high', 11)),
        'min_child_weight': sp_randint(kwargs.get('min_child_weight_low', 1), kwargs.get('min_child_weight_high', 11)),
        'reg_alpha': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        'reg_lambda': halfnorm(kwargs.get('reg_lambda_loc', 0), kwargs.get('reg_lambda_scale', 5)),
        'subsample': uniform(kwargs.get('subsample_loc', 0.2), kwargs.get('subsample_scale', 0.8))
    }
    rand_search = RandomizedSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_iter=n_iter,
        n_jobs=n_jobs,
        param_distributions=param_distributions,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    rand_search.fit(train, label)
    return rand_search.best_params_, [(rand_search.best_params_, rand_search.best_score_)]
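# A hedged usage sketch of tune_xgb_params_randomized; xgboost's XGBClassifier,
# the base params dict and the synthetic data are assumptions about the caller,
# not taken from the surrounding module.
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

X_xgb, y_xgb = make_classification(n_samples=500, n_features=20, random_state=0)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
best_params, history = tune_xgb_params_randomized(
    XGBClassifier,
    label=y_xgb, metric_sklearn='roc_auc', n_jobs=4,
    params={'objective': 'binary:logistic'},
    strat_folds=folds, train=X_xgb, n_iter=30,
    max_depth_low=3, max_depth_high=9)   # kwargs narrow the max_depth range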
def test_pickle():
    # Test that a fit search can be pickled
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True)
    grid_search.fit(X, y)
    pickle.dumps(grid_search)  # smoke test

    random_search = RandomizedSearchCV(clf, {'foo_param': [1, 2, 3]},
                                       refit=True, n_iter=3)
    random_search.fit(X, y)
    pickle.dumps(random_search)  # smoke test