def test_regression():
# Check regression for various parameter settings.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
boston.target[:50],
random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0],
"max_features": [0.5, 1.0],
"bootstrap": [True, False],
"bootstrap_features": [True, False]})
for base_estimator in [None,
DummyRegressor(),
DecisionTreeRegressor(),
KNeighborsRegressor(),
SVR()]:
for params in grid:
BaggingRegressor(base_estimator=base_estimator,
random_state=rng,
**params).fit(X_train, y_train).predict(X_test)
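For reference, ParameterGrid expands a dict of value lists into every combination of settings. A minimal sketch using a subset of the grid above (iteration order may vary across scikit-learn versions):

from sklearn.model_selection import ParameterGrid

# Prints the four combinations of the two keys.
for p in ParameterGrid({"bootstrap": [True, False], "max_samples": [0.5, 1.0]}):
    print(p)
# {'bootstrap': True, 'max_samples': 0.5}
# {'bootstrap': True, 'max_samples': 1.0}
# {'bootstrap': False, 'max_samples': 0.5}
# {'bootstrap': False, 'max_samples': 1.0}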
def run(self, grid_config):
for classpath, parameter_config in grid_config.items():
try:
module_name, class_name = classpath.rsplit(".", 1)
module = importlib.import_module(module_name)
cls = getattr(module, class_name)
for parameters in ParameterGrid(parameter_config):
try:
cls(**parameters)
except Exception as e:
raise ValueError(dedent('''Section: grid_config -
Unable to instantiate classifier {} with parameters {}, error thrown: {}
'''.format(classpath, parameters, e)))
except Exception as e:
raise ValueError(dedent('''Section: grid_config -
Unable to import classifier {}, error thrown: {}
'''.format(classpath, e)))
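For context, the grid_config consumed by run() maps an importable class path to a ParameterGrid-style dict of parameter lists. A hypothetical configuration it would accept (class paths and values chosen purely for illustration):

# Hypothetical grid_config; each key is an importable class path, each value a parameter grid.
grid_config = {
    'sklearn.tree.DecisionTreeClassifier': {'max_depth': [2, 5], 'criterion': ['gini', 'entropy']},
    'sklearn.ensemble.RandomForestClassifier': {'n_estimators': [10, 100]},
}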
def val_tune_rf(estimator, x_train, y_train, x_val, y_val, params):
    # Try every parameter combination on the held-out validation set and
    # rank the settings by ROC AUC.
    params_list = list(ParameterGrid(params))
    print(params_list)
    print(y_val)
    results = []
    for param in params_list:
        print('========= ', param)
        estimator.set_params(**param)
        estimator.fit(x_train, y_train)
        preds_prob = estimator.predict_proba(x_val)
        # print(preds_prob[:, 1])
        result = roc_auc_score(y_val, preds_prob[:, 1])
        print('roc_auc_score : %f' % result)
        results.append((param, result))
    results.sort(key=lambda k: k[1])
    print(results)
    print(results[-1])  # best (param, score) pair
def grid_search():
param_grid = {}
param_grid["embeddings"] = [
("data/pol/orth", "w2v_allwiki_nkjp300_300"),
("data/pol/lemma", "w2v_allwiki_nkjp300_300"),
("resources/pol/fasttext", "wiki.pl")
]
param_grid["optim"] = ["adam", "adagrad"]
param_grid['reweight'] = [True, False]
grid = ParameterGrid(param_grid)
filename = "results/{date:%Y%m%d_%H%M}_results.csv".format(date=datetime.now())
print('Starting a grid search through {n} parameter combinations'.format(
n=len(grid)))
for params in grid:
print(params)
with open(filename, "a") as results_file:
results_file.write(str(params) + ", ")
max_dev_epoch, max_dev, _ = train.main(params)
results_file.write('Epoch {epoch}, accuracy {acc:.4f}\n'.format(
epoch=max_dev_epoch,
acc=max_dev
))
def test_classification():
# Check classification for various parameter settings.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0],
"max_features": [1, 2, 4],
"bootstrap": [True, False],
"bootstrap_features": [True, False]})
for base_estimator in [None,
DummyClassifier(),
Perceptron(),
DecisionTreeClassifier(),
KNeighborsClassifier(),
SVC()]:
for params in grid:
BaggingClassifier(base_estimator=base_estimator,
random_state=rng,
**params).fit(X_train, y_train).predict(X_test)
def _create_batches(self):
param_iter = ParameterGrid(self.param_grid)
# divide work into batches equal to the communicator's size
work_batches = [[] for _ in range(comm_size)]
i = 0
for fold_id, (train_index, test_index) in enumerate(self.cv_iter):
for parameters in param_iter:
work_batches[i % comm_size].append((fold_id + 1, train_index,
test_index, parameters))
i += 1
return work_batches
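The modulo indexing above hands out (fold, parameters) work items round-robin across the MPI ranks. A standalone sketch of the same pattern, with comm_size assumed to be 3 and placeholder work items:

# Round-robin split as in _create_batches; comm_size and items are illustrative.
comm_size = 3
work_items = ['job0', 'job1', 'job2', 'job3', 'job4']
work_batches = [[] for _ in range(comm_size)]
for i, item in enumerate(work_items):
    work_batches[i % comm_size].append(item)
# work_batches == [['job0', 'job3'], ['job1', 'job4'], ['job2']]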
def _generate_model_configs(self, grid_config):
"""Flattens a model/parameter grid configuration into individually
trainable model/parameter pairs
Yields: (tuple) classpath and parameters
"""
for class_path, parameter_config in grid_config.items():
for parameters in ParameterGrid(parameter_config):
yield class_path, parameters
def fit(self, frame):
"""Fit the grid search.
Parameters
----------
frame : H2OFrame, shape=(n_samples, n_features)
The training frame on which to fit.
"""
return self._fit(frame, ParameterGrid(self.param_grid))
def fit(self, X, y=None):
"""Run fit with all sets of parameters.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Training vector, where n_samples is the number of samples and
n_features is the number of features.
y : array-like, shape = [n_samples] or [n_samples, n_output], optional
Target relative to X for classification or regression;
None for unsupervised learning.
"""
return self._fit(X, y, ParameterGrid(self.param_grid))
def __init__(self, experiment, args, job_module_config):
    super(self.__class__, self).__init__(experiment, args, job_module_config)
    # Pre-format the experiment dict: sklearn's ParameterGrid requires every
    # parameter value to be wrapped in a list.
    for param in experiment['params']:
        if not isinstance(experiment['params'][param], list):
            experiment['params'][param] = [experiment['params'][param]]
    self.searcher = ParameterGrid(experiment['params'])
def create_parameter_grid(param_dict):
from sklearn.model_selection import ParameterGrid
return ParameterGrid(param_dict)
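A possible use of the thin wrapper above (the parameter dict is illustrative); iterating the returned grid yields plain dicts:

# Hypothetical call to create_parameter_grid.
grid = create_parameter_grid({'n_estimators': [50, 100], 'max_depth': [3, None]})
print(len(grid))        # 4 combinations
for params in grid:
    print(params)       # e.g. {'max_depth': 3, 'n_estimators': 50}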
def _get_param_iterator(self):
"""Return ParameterGrid instance for the given param_grid"""
return model_selection.ParameterGrid(self.param_grid)
def fit_binarized(self, X_featurized, Y_binarized, validation_data=None, **kwargs):
klass = get_class_from_module_path(self.classifier)
if validation_data is None: # use 0.2 for validation data
X_train, X_validation, Y_train, Y_validation = train_test_split(X_featurized, Y_binarized, test_size=self.validation_size)
logger.info('Using {} of training data ({} instances) for validation.'.format(self.validation_size, Y_validation.shape[0]))
else:
X_train, X_validation, Y_train, Y_validation = X_featurized, validation_data[0], Y_binarized, validation_data[1]
#end if
best_score, best_param = 0.0, None
if self.n_jobs > 1: logger.info('Performing hyperparameter gridsearch in parallel using {} jobs.'.format(self.n_jobs))
else: logger.debug('Performing hyperparameter gridsearch in parallel using {} jobs.'.format(self.n_jobs))
param_scores = Parallel(n_jobs=self.n_jobs)(delayed(_fit_classifier)(klass, self.classifier_args, param, self.metric, X_train, Y_train, X_validation, Y_validation) for param in ParameterGrid(self.param_grid))
best_param, best_score = max(param_scores, key=lambda x: x[1])
logger.info('Best scoring param is {} with score {}.'.format(best_param, best_score))
classifier_args = {}
classifier_args.update(self.classifier_args)
classifier_args.update(best_param)
self.classifier_ = klass(**classifier_args)
logger.info('Fitting final model <{}> on full data with param {}.'.format(self.classifier_, best_param))
self.classifier_.fit(X_featurized, Y_binarized)
return self
#end def
#end class
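A stripped-down sketch of the joblib pattern used in fit_binarized: score every grid point in parallel, then keep the best. The scoring function below is a placeholder, not the module's _fit_classifier:

from joblib import Parallel, delayed
from sklearn.model_selection import ParameterGrid

def score_param(param):
    # Placeholder: a real version would fit a classifier and return a validation metric.
    return param, sum(param.values())

param_scores = Parallel(n_jobs=2)(
    delayed(score_param)(param)
    for param in ParameterGrid({'C': [1, 10], 'tol': [1e-3, 1e-4]}))
best_param, best_score = max(param_scores, key=lambda x: x[1])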
def test_parameter_grid():
# Test basic properties of ParameterGrid.
params1 = {"foo": [1, 2, 3]}
grid1 = ParameterGrid(params1)
assert_true(isinstance(grid1, Iterable))
assert_true(isinstance(grid1, Sized))
assert_equal(len(grid1), 3)
assert_grid_iter_equals_getitem(grid1)
params2 = {"foo": [4, 2],
"bar": ["ham", "spam", "eggs"]}
grid2 = ParameterGrid(params2)
assert_equal(len(grid2), 6)
# loop to assert we can iterate over the grid multiple times
    for i in range(2):
# tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2)
points = set(tuple(chain(*(sorted(p.items())))) for p in grid2)
assert_equal(points,
set(("bar", x, "foo", y)
for x, y in product(params2["bar"], params2["foo"])))
assert_grid_iter_equals_getitem(grid2)
# Special case: empty grid (useful to get default estimator settings)
empty = ParameterGrid({})
assert_equal(len(empty), 1)
assert_equal(list(empty), [{}])
assert_grid_iter_equals_getitem(empty)
assert_raises(IndexError, lambda: empty[1])
has_empty = ParameterGrid([{'C': [1, 10]}, {}, {'C': [.5]}])
assert_equal(len(has_empty), 4)
assert_equal(list(has_empty), [{'C': 1}, {'C': 10}, {}, {'C': .5}])
assert_grid_iter_equals_getitem(has_empty)
def test_parameters_sampler_replacement():
# raise error if n_iter too large
params = {'first': [0, 1], 'second': ['a', 'b', 'c']}
sampler = ParameterSampler(params, n_iter=7)
assert_raises(ValueError, list, sampler)
# degenerates to GridSearchCV if n_iter the same as grid_size
sampler = ParameterSampler(params, n_iter=6)
samples = list(sampler)
assert_equal(len(samples), 6)
for values in ParameterGrid(params):
assert_true(values in samples)
# test sampling without replacement in a large grid
params = {'a': range(10), 'b': range(10), 'c': range(10)}
sampler = ParameterSampler(params, n_iter=99, random_state=42)
samples = list(sampler)
assert_equal(len(samples), 99)
hashable_samples = ["a%db%dc%d" % (p['a'], p['b'], p['c'])
for p in samples]
assert_equal(len(set(hashable_samples)), 99)
# doesn't go into infinite loops
params_distribution = {'first': bernoulli(.5), 'second': ['a', 'b', 'c']}
sampler = ParameterSampler(params_distribution, n_iter=7)
samples = list(sampler)
assert_equal(len(samples), 7)
def test_spectral_coclustering():
# Test Dhillon's Spectral CoClustering on a simple problem.
param_grid = {'svd_method': ['randomized', 'arpack'],
'n_svd_vecs': [None, 20],
'mini_batch': [False, True],
'init': ['k-means++'],
'n_init': [10],
'n_jobs': [1]}
random_state = 0
S, rows, cols = make_biclusters((30, 30), 3, noise=0.5,
random_state=random_state)
S -= S.min() # needs to be nonnegative before making it sparse
S = np.where(S < 1, 0, S) # threshold some values
for mat in (S, csr_matrix(S)):
for kwargs in ParameterGrid(param_grid):
model = SpectralCoclustering(n_clusters=3,
random_state=random_state,
**kwargs)
model.fit(mat)
assert_equal(model.rows_.shape, (3, 30))
assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
assert_equal(consensus_score(model.biclusters_,
(rows, cols)), 1)
_test_shape_indices(model)
def _to_param_meta(param_grid, control):
'''Acquire parameter metadata such as bounds that are useful for sampling'''
choice_params = {k: v for k, v in param_grid.items()
if not hasattr(v, 'rvs')}
distributions = {k: v for k, v in param_grid.items()
if k not in choice_params}
pg_list = list(ParameterGrid(choice_params))
choices, low, high, param_order, is_int = [], [], [], [], []
is_continuous = lambda v: isinstance(v, numbers.Real)
while len(pg_list):
pg2 = pg_list.pop(0)
for k, v in pg2.items():
if k in param_order:
idx = param_order.index(k)
else:
idx = len(param_order)
param_order.append(k)
low.append(v)
high.append(v)
choices.append([v])
is_int.append(not is_continuous(v))
continue
if v not in choices[idx]:
choices[idx].append(v)
if is_continuous(v):
is_int[idx] = False
if v < low[idx]:
low[idx] = v
if v > high[idx]:
high[idx] = v
else:
is_int[idx] = True
low[idx] = high[idx] = v
for k, v in distributions.items():
choices.append(v)
low.append(None)
high.append(None)
is_int.append(False)
param_order.append(k)
param_meta = dict(control=control, high=high, low=low,
choices=choices, is_int=is_int,
param_order=param_order)
return param_meta
def clf_loop(self, X_train, X_test, y_train, y_test, individuals, setting):
'''
    Run each model listed in models_to_run with up to iterations_max parameter
    settings from its grid in params; when the grid is larger than iterations_max,
    settings are drawn from it at random.
'''
N = 0
self.prepare_report()
for index, clf in enumerate([self.clfs[x] for x in self.models_to_run]):
iteration = 0
print('Running {}.'.format(self.models_to_run[index]))
parameter_values = self.params[self.models_to_run[index]]
grid = ParameterGrid(parameter_values)
while iteration < self.iterations_max and iteration < len(grid):
print(' Running Iteration {} of {}...'.format(iteration + 1, self.iterations_max))
if len(grid) > self.iterations_max:
p = random.choice(list(grid))
else:
p = list(grid)[iteration]
try:
m = Model(clf, X_train, y_train, X_test, y_test, p, N,
self.models_to_run[index], iteration,
self.output_dir, thresholds = self.thresholds,
ks = self.ks, report = self.report, label='label',
individuals=individuals, setting=setting)
m.run()
print(' Printing to file...')
if not self.roc:
m.performance_to_file()
else:
m.performance_to_file(roc='{}ROC_{}_{}-{}.png'.format(
self.output_dir, self.models_to_run[index], N,
iteration))
except IndexError as e:
print(p)
print(N)
print('IndexError: {}'.format(e))
print(traceback.format_exc())
continue
except RuntimeError as e:
print(p)
print(N)
print('RuntimeError: {}'.format(e))
print(traceback.format_exc())
continue
except AttributeError as e:
print(p)
print(N)
print('AttributeError: {}'.format(e))
print(traceback.format_exc())
continue
iteration += 1
N += 1