def test_w_prep_fit():
"""[Model Selection] Test run with preprocessing, single step."""
evl = Evaluator(mape_scorer, cv=5, shuffle=False, random_state=100,
verbose=True)
with open(os.devnull, 'w') as f, redirect_stdout(f):
evl.fit(X, y,
estimators=[OLS()],
param_dicts={'ols': {'offset': randint(1, 10)}},
preprocessing={'pr': [Scale()], 'no': []},
n_iter=3)
np.testing.assert_approx_equal(
evl.results['test_score-m']['no.ols'],
-24.903229451043195)
np.testing.assert_approx_equal(
evl.results['test_score-m']['pr.ols'],
-26.510708862278072, 1)
assert evl.results['params']['no.ols']['offset'] == 4
assert evl.results['params']['pr.ols']['offset'] == 4
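# The tests in this module pass randint(1, 10) as the distribution for the
# 'offset' parameter. A minimal sketch of the object being passed, assuming
# randint here is scipy.stats.randint (a frozen discrete-uniform distribution):
def _randint_distribution_sketch():
    """Hedged usage sketch; not part of the original test suite."""
    from scipy.stats import randint

    dist = randint(1, 10)                       # integers 1..9 inclusive
    samples = dist.rvs(size=5, random_state=0)  # what the searcher draws internally
    assert all(1 <= s < 10 for s in samples)
    return samples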
def get_uniform_paramgrid(hyperparameters, fixed_parameters):
param_grid = dict()
for param_name, hyperparameter in hyperparameters.items():
if fixed_parameters is not None and param_name in fixed_parameters.keys():
continue
if isinstance(hyperparameter, CategoricalHyperparameter):
all_values = hyperparameter.choices
            if all(item in ['True', 'False'] for item in all_values):
                # bool('False') evaluates to True, so map the strings explicitly
                all_values = [item == 'True' for item in all_values]
param_grid[param_name] = all_values
elif isinstance(hyperparameter, UniformFloatHyperparameter):
if hyperparameter.log:
param_grid[param_name] = loguniform(base=2, low=hyperparameter.lower, high=hyperparameter.upper)
else:
param_grid[param_name] = uniform(loc=hyperparameter.lower, scale=hyperparameter.upper-hyperparameter.lower)
elif isinstance(hyperparameter, UniformIntegerHyperparameter):
if hyperparameter.log:
param_grid[param_name] = loguniform_int(base=2, low=hyperparameter.lower, high=hyperparameter.upper)
else:
param_grid[param_name] = randint(low=hyperparameter.lower, high=hyperparameter.upper+1)
else:
            raise ValueError('Unsupported hyperparameter type: %s' % type(hyperparameter))
return param_grid
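# Hedged usage sketch for get_uniform_paramgrid: build a few ConfigSpace
# hyperparameters, convert them to list/scipy distributions, and hand the result
# to RandomizedSearchCV. The hyperparameter names and values below are
# illustrative assumptions; log-scaled hyperparameters are avoided because
# loguniform/loguniform_int above are helpers from the surrounding module.
def _paramgrid_usage_sketch():
    """Hedged usage sketch; not part of the original module."""
    from ConfigSpace.hyperparameters import (CategoricalHyperparameter,
                                             UniformFloatHyperparameter,
                                             UniformIntegerHyperparameter)
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

    hyperparameters = {
        'max_depth': UniformIntegerHyperparameter('max_depth', 2, 10),
        'max_features': UniformFloatHyperparameter('max_features', 0.1, 0.9),
        'bootstrap': CategoricalHyperparameter('bootstrap', ['True', 'False']),
    }
    param_grid = get_uniform_paramgrid(hyperparameters, fixed_parameters=None)
    return RandomizedSearchCV(RandomForestClassifier(), param_grid, n_iter=5)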
def test_large_grid():
"""In this test, we purposely overfit a RandomForest to completely random data
    in order to assert that the test error will far exceed the train error.
"""
if not SK18:
custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
else:
custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)
# define the pipe
pipe = Pipeline([
('scaler', SelectiveScaler()),
('pca', SelectivePCA(weight=True)),
('rf', RandomForestClassifier(random_state=42))
])
# define hyper parameters
hp = {
'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
'pca__whiten': [True, False],
'pca__weight': [True, False],
'pca__n_components': uniform(0.75, 0.15),
'rf__n_estimators': randint(5, 10),
'rf__max_depth': randint(5, 15)
}
# define the grid
grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42)
# this will fail because we haven't fit yet
assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)
# fit the grid
grid.fit(X_train, y_train)
# score for coverage -- this might warn...
with warnings.catch_warnings():
warnings.simplefilter("ignore")
grid.score(X_train, y_train)
# coverage:
assert grid._estimator_type == 'classifier'
# get predictions
tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)
# evaluate score (SHOULD be better than random...)
accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)
# grid score reports:
# assert fails for bad percentile
assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})
# assert fails for bad y_axis
assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})
# assert passes otherwise
report_grid_score_detail(grid, charts=True, percentile=0.95) # just ensure percentile works
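# The hyperparameter dict above mixes plain lists (sampled by uniform choice)
# with scipy distributions such as uniform() and randint() (sampled via .rvs).
# A minimal sketch of that sampling mechanism using scikit-learn's
# ParameterSampler, independent of the skutil pipeline used in the test:
def _parameter_sampler_sketch():
    """Hedged sketch of RandomizedSearchCV-style sampling; not an original test."""
    from scipy.stats import randint, uniform
    from sklearn.model_selection import ParameterSampler

    hp = {
        'pca__whiten': [True, False],              # list -> uniform choice
        'pca__n_components': uniform(0.75, 0.15),  # distribution -> .rvs()
        'rf__n_estimators': randint(5, 10),
    }
    return list(ParameterSampler(hp, n_iter=3, random_state=42))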
def fit(self, X, Y):
""" Train classifier.
Parameters
----------
X : np.array [n_samples, n_features]
Training features.
Y : np.array [n_samples]
Training labels
"""
x_shuffle, y_shuffle = shuffle(X, Y, random_state=self.random_state)
clf_cv = RFC(n_estimators=self.n_estimators, n_jobs=self.n_jobs,
class_weight=self.class_weight,
random_state=self.random_state)
param_dist = {
"max_depth": sp_randint(1, 101),
"max_features": [None, 'auto', 'sqrt', 'log2'],
"min_samples_split": sp_randint(2, 11),
"min_samples_leaf": sp_randint(1, 11),
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]
}
random_search = RandomizedSearchCV(
clf_cv, param_distributions=param_dist, refit=True,
n_iter=self.n_iter_search, scoring='f1_weighted',
random_state=self.random_state
)
random_search.fit(x_shuffle, y_shuffle)
self.clf = random_search.best_estimator_
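# A self-contained sketch of the search performed in fit() above: the same
# sp_randint-based parameter distributions applied to a RandomForest on
# synthetic data. The dataset and n_iter below are illustrative assumptions.
def _randomized_rf_search_sketch():
    """Hedged usage sketch mirroring the pattern in fit(); not the original class."""
    from scipy.stats import randint as sp_randint
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    param_dist = {
        'max_depth': sp_randint(1, 101),
        'min_samples_split': sp_randint(2, 11),
        'min_samples_leaf': sp_randint(1, 11),
        'bootstrap': [True, False],
    }
    search = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                                param_distributions=param_dist,
                                n_iter=5, scoring='f1_weighted', random_state=0)
    search.fit(X, y)
    return search.best_estimator_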
def test_params():
"""[Model Selection] Test raises on bad params."""
evl = Evaluator(mape_scorer, verbose=2)
np.testing.assert_raises(ValueError,
evl.fit, X, y,
estimators=[OLS()],
param_dicts={'bad.ols':
{'offset': randint(1, 10)}},
preprocessing={'prep': [Scale()]})
def test_raises():
"""[Model Selection] Test raises on error."""
evl = Evaluator(bad_scorer, verbose=1)
with open(os.devnull, 'w') as f, redirect_stdout(f):
np.testing.assert_raises(
ValueError, evl.fit, X, y, estimators=[OLS()],
param_dicts={'ols': {'offset': randint(1, 10)}}, n_iter=1)
def test_passes():
"""[Model Selection] Test sets error score on failed scoring."""
evl = Evaluator(bad_scorer, error_score=0, n_jobs=1, verbose=5)
with open(os.devnull, 'w') as f, redirect_stdout(f):
evl = np.testing.assert_warns(FitFailedWarning,
evl.fit, X, y,
estimators=[OLS()],
param_dicts={'ols':
{'offset': randint(1, 10)}},
n_iter=1)
assert evl.results['test_score-m']['ols'] == 0
def test_no_prep():
"""[Model Selection] Test run without preprocessing."""
evl = Evaluator(mape_scorer, cv=5, shuffle=False,
random_state=100, verbose=12)
with open(os.devnull, 'w') as f, redirect_stdout(f):
evl.fit(X, y,
estimators=[OLS()],
param_dicts={'ols': {'offset': randint(1, 10)}},
n_iter=3)
np.testing.assert_approx_equal(
evl.results['test_score-m']['ols'],
-24.903229451043195)
assert evl.results['params']['ols']['offset'] == 4
def test_w_prep_set_params():
"""[Model Selection] Test run with preprocessing, sep param dists."""
evl = Evaluator(mape_scorer, cv=5, shuffle=False, random_state=100,
verbose=2)
params = {'no.ols': {'offset': randint(3, 6)},
'pr.ols': {'offset': randint(1, 3)},
}
with open(os.devnull, 'w') as f, redirect_stdout(f):
evl.fit(X, y,
estimators={'pr': [OLS()], 'no': [OLS()]},
param_dicts=params,
preprocessing={'pr': [Scale()], 'no': []},
n_iter=10)
np.testing.assert_approx_equal(
evl.results['test_score-m']['no.ols'],
-18.684229451043198)
np.testing.assert_approx_equal(
evl.results['test_score-m']['pr.ols'],
-7.2594502123869491)
assert evl.results['params']['no.ols']['offset'] == 3
assert evl.results['params']['pr.ols']['offset'] == 1
def test_random_grid():
# build a pipeline
pipe = Pipeline([
('retainer', FeatureRetainer()), # will retain all
('dropper', FeatureDropper()), # won't drop any
('mapper', FunctionMapper()), # pass through
('encoder', OneHotCategoricalEncoder()), # no object dtypes, so will pass through
('collinearity', MulticollinearityFilterer(threshold=0.85)),
('imputer', SelectiveImputer()), # pass through
('scaler', SelectiveScaler()),
('boxcox', BoxCoxTransformer()),
('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
('pca', SelectivePCA(n_components=0.9)),
('model', RandomForestClassifier(n_jobs=1))
])
# let's define a set of hyper-parameters over which to search
hp = {
'collinearity__threshold': uniform(loc=.8, scale=.15),
'collinearity__method': ['pearson', 'kendall', 'spearman'],
'scaler__scaler': [StandardScaler(), RobustScaler()],
'pca__n_components': uniform(loc=.75, scale=.2),
'pca__whiten': [True, False],
'model__n_estimators': randint(5, 10),
'model__max_depth': randint(2, 5),
'model__min_samples_leaf': randint(1, 5),
'model__max_features': uniform(loc=.5, scale=.5),
'model__max_leaf_nodes': randint(10, 15)
}
# define the gridsearch
search = RandomizedSearchCV(pipe, hp,
n_iter=2, # just to test it even works
scoring='accuracy',
cv=2,
random_state=42)
# fit the search
search.fit(X_train, y_train)
# test the report
report_grid_score_detail(search, charts=False)
def tune_xgb_params_randomized(estimator_cls,
label: np.ndarray,
metric_sklearn: str,
n_jobs: int,
params: dict,
strat_folds: StratifiedKFold,
train: np.ndarray,
n_iter: int = 20,
verbosity_level: int = 10,
**kwargs):
"""
:param estimator_cls:
The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
:param label:
An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's RandomizedSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html
        for the options this can take - e.g. 'neg_mean_squared_error' for (negative) mean squared error.
:param n_jobs:
The number of jobs to run simultaneously.
:param params:
A dictionary of XGB parameters.
:param strat_folds:
A StratifiedKFold object to cross validate the parameters.
:param train:
An array-like containing the training input samples.
:param n_iter:
An optional parameter to control the number of parameter settings that are sampled.
:param verbosity_level:
An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
:param kwargs:
Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly between 0.5 and 0.7 for
colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
:return:
A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
"""
params_copy = clean_params_for_sk(params)
param_distributions = {
'colsample_bytree': uniform(kwargs.get('colsample_bytree_loc', 0.2), kwargs.get('colsample_bytree_scale', 0.8)),
'gamma': uniform(kwargs.get('gamma_loc', 0), kwargs.get('gamma_scale', 0.9)),
'max_depth': sp_randint(kwargs.get('max_depth_low', 2), kwargs.get('max_depth_high', 11)),
'min_child_weight': sp_randint(kwargs.get('min_child_weight_low', 1), kwargs.get('min_child_weight_high', 11)),
'reg_alpha': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        'reg_lambda': halfnorm(kwargs.get('reg_lambda_loc', 0), kwargs.get('reg_lambda_scale', 5)),
'subsample': uniform(kwargs.get('subsample_loc', 0.2), kwargs.get('subsample_scale', 0.8))
}
rand_search = RandomizedSearchCV(
cv=strat_folds.split(train, label),
estimator=estimator_cls(**params_copy),
n_iter=n_iter,
n_jobs=n_jobs,
param_distributions=param_distributions,
scoring=metric_sklearn,
verbose=verbosity_level
)
rand_search.fit(train, label)
return rand_search.best_params_, [(rand_search.best_params_, rand_search.best_score_)]
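# Hedged usage sketch for tune_xgb_params_randomized: tune an XGBClassifier on
# synthetic data. The dataset, the starting params and n_iter below are
# illustrative assumptions; clean_params_for_sk is the module helper used above.
def _tune_xgb_sketch():
    """Hedged usage sketch; not part of the original module."""
    from sklearn.datasets import make_classification
    from sklearn.model_selection import StratifiedKFold
    from xgboost import XGBClassifier

    train, label = make_classification(n_samples=300, n_features=12, random_state=0)
    strat_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    best_params, history = tune_xgb_params_randomized(
        XGBClassifier,
        label=label,
        metric_sklearn='neg_log_loss',
        n_jobs=1,
        params={'objective': 'binary:logistic'},
        strat_folds=strat_folds,
        train=train,
        n_iter=5)
    return best_params, history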