def test_feature_union_fit_failure():
X, y = make_classification(n_samples=100, n_features=10, random_state=0)
pipe = Pipeline([('union', FeatureUnion([('good', MockClassifier()),
('bad', FailingClassifier())],
transformer_weights={'bad': 0.5})),
('clf', MockClassifier())])
grid = {'union__bad__parameter': [0, 1, 2]}
gs = dcv.GridSearchCV(pipe, grid, refit=False, scoring=None)
# Check that failure raises if error_score is `'raise'`
with pytest.raises(ValueError):
gs.fit(X, y)
# Check that grid scores were set to error_score on failure
gs.error_score = float('nan')
with pytest.warns(FitFailedWarning):
gs.fit(X, y)
check_scores_all_nan(gs, 'union__bad__parameter')
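# NOTE: check_scores_all_nan is defined elsewhere in this test suite and is not
# shown in these snippets. Below is a minimal sketch of what it presumably
# asserts; FailingClassifier.FAILING_PARAMETER and the exact cv_results_ keys
# are assumptions, not the suite's actual code.
import numpy as np

def check_scores_all_nan_sketch(gs, bad_param, score_key='score'):
    # For every candidate whose failing parameter value was selected, all
    # per-split test scores should have been replaced by error_score (NaN).
    param_values = gs.cv_results_['param_' + bad_param]
    for i, value in enumerate(param_values):
        if value != FailingClassifier.FAILING_PARAMETER:  # assumed class attribute
            continue
        for split in range(gs.n_splits_):
            key = 'split%d_test_%s' % (split, score_key)
            assert np.isnan(gs.cv_results_[key][i])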
Python make_classification() example source code
def test_improvement(self):
np.random.seed(4)
data, target = make_classification(n_samples=100,
n_features=45,
n_informative=15,
n_redundant=5,
class_sep=1,
n_clusters_per_class=4,
flip_y=0.4)
model = RandomForestClassifier(max_depth=5)
model.fit(data, target)
start_score = clf_score(target, model.predict(data))
p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
hyperopt = HyperoptOptimizer(model, [p1], clf_score)
best_params, best_model = hyperopt.fit(X_train=data, y_train=target, n_iters=10)
best_model.fit(data, target)
final_score = clf_score(target, best_model.predict(data))
self.assertTrue(final_score>start_score)
for status in hyperopt.trials.statuses():
self.assertEqual(status, 'ok')
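# NOTE: clf_score is imported from the project under test and is not shown in
# these snippets. Any "higher is better" classification metric with a
# (y_true, y_pred) signature would fit; a plausible stand-in:
from sklearn.metrics import accuracy_score

def clf_score_example(y_true, y_pred):
    # Hypothetical stand-in for the clf_score used by the optimizer tests.
    return accuracy_score(y_true, y_pred)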
def test_improvement(self):
np.random.seed(4)
data, target = make_classification(n_samples=100,
n_features=45,
n_informative=15,
n_redundant=5,
class_sep=1,
n_clusters_per_class=4,
flip_y=0.4)
model = RandomForestClassifier(max_depth=5)
model.fit(data, target)
start_score = clf_score(target, model.predict(data))
p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
grid_sizes = {'max_depth': 5}
grid_search = GridSearchOptimizer(model, [p1], clf_score, grid_sizes)
best_params, best_model = grid_search.fit(X_train=data, y_train=target)
best_model.fit(data, target)
final_score = clf_score(target, best_model.predict(data))
self.assertTrue(final_score>start_score)
def test_objective_function(self):
np.random.seed(4)
data, target = make_classification(n_samples=100,
n_features=10,
n_informative=10,
n_redundant=0,
class_sep=100,
n_clusters_per_class=1,
flip_y=0.0)
model = RandomForestClassifier(max_depth=5)
model.fit(data, target)
fun = partial(objective, model,
'sklearn',
clf_score,
data, target, data, target)
# model should fit the data perfectly
final_score = fun(model.get_params())[0]
self.assertEqual(final_score,1)
def test_improvement(self):
np.random.seed(4)
data, target = make_classification(n_samples=100,
n_features=45,
n_informative=15,
n_redundant=5,
class_sep=1,
n_clusters_per_class=4,
flip_y=0.4)
model = RandomForestClassifier(max_depth=5)
model.fit(data, target)
start_score = clf_score(target, model.predict(data))
p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
n_init_samples = 4
mutation_noise = {'max_depth': 0.4, 'learning_rate': 0.05,
'reg_lambda':0.5}
geneticOpt = GeneticOptimizer(model, [p1], clf_score, n_init_samples,
'RouletteWheel', mutation_noise)
best_params, best_model = geneticOpt.fit(X_train=data, y_train=target, n_iters=30)
best_model.fit(data, target)
final_score = clf_score(target, best_model.predict(data))
self.assertTrue(final_score>start_score)
def test_expected_improvement_tractable(self):
np.random.seed(5)
data, target = make_classification(n_samples=100,
n_features=45,
n_informative=15,
n_redundant=5,
class_sep=1,
n_clusters_per_class=4,
flip_y=0.4)
model = RandomForestClassifier(max_depth=5)
model.fit(data, target)
start_score = clf_score(target, model.predict(data))
p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
bayesOpt = BayesianOptimizer(model, [p1], clf_score, method='expected_improvement')
best_params, best_model = bayesOpt.fit(X_train=data, y_train=target, n_iters=10)
self.assertTrue(bayesOpt.success)
best_model.fit(data, target)
final_score = clf_score(target, best_model.predict(data))
self.assertTrue(final_score>start_score)
def test_upper_confidence_bound_tractable(self):
np.random.seed(5)
data, target = make_classification(n_samples=100,
n_features=45,
n_informative=15,
n_redundant=5,
class_sep=1,
n_clusters_per_class=4,
flip_y=0.4)
model = RandomForestClassifier(max_depth=5)
model.fit(data, target)
start_score = clf_score(target, model.predict(data))
p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
bayesOpt = BayesianOptimizer(model, [p1], clf_score, method='upper_confidence_bound')
best_params, best_model = bayesOpt.fit(X_train=data, y_train=target, n_iters=10)
self.assertTrue(bayesOpt.success)
best_model.fit(data, target)
final_score = clf_score(target, best_model.predict(data))
self.assertTrue(final_score>start_score)
def test_improvement(self):
np.random.seed(4)
data, target = make_classification(n_samples=100,
n_features=45,
n_informative=15,
n_redundant=5,
class_sep=1,
n_clusters_per_class=4,
flip_y=0.4)
model = RandomForestClassifier(max_depth=5)
model.fit(data, target)
start_score = clf_score(target, model.predict(data))
p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
rand_search = RandomSearchOptimizer(model, [p1], clf_score)
best_params, best_model = rand_search.fit(X_train=data, y_train=target, n_iters=10)
best_model.fit(data, target)
final_score = clf_score(target, best_model.predict(data))
self.assertTrue(final_score>start_score)
def setUp(self):
os.putenv("KMP_DUPLICATE_LIB_OK", "TRUE")
self.X_class, self.y_class = datasets.make_classification(random_state=42)
self.X_reg, self.y_reg = datasets.make_regression(random_state=42)
self.classification_optimizers = [XGBoostOptimizer, RandomForestOptimizer]
self.regression_optimizers = [XGBoostOptimizer, RandomForestOptimizer]
self.class_scorer = Scorer("auc_error", lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
self.reg_scorer = Scorer("mse", metrics.mean_squared_error)
self.classification_task_split = \
Task("class_split", self.X_class, self.y_class, "classification", test_size=0.1, random_state=42)
        self.regression_task_split = \
            Task("reg_split", self.X_reg, self.y_reg, "regression", test_size=0.1, random_state=42)
        self.classification_task_cv = \
            Task("class_cv", self.X_class, self.y_class, "classification", cv=5, random_state=42)
self.regression_task_cv = \
Task("reg_cv", self.X_reg, self.y_reg, "regression", cv=5, random_state=42)
def case2():
    from sklearn.datasets import make_classification
    x, y = make_classification(n_samples=1000, n_features=2, n_redundant=0,
                               n_informative=1, n_clusters_per_class=1)
    print(len(x))
    print(len(y))
    print(x)
    print(y)
    for i in range(len(x)):
        print(x[i], y[i])
    # first 800 samples for training, last 200 for testing
    x_data_train = x[:800, :]
    x_data_test = x[800:, :]
    y_data_train = y[:800]
    y_data_test = y[800:]
    print('*' * 20)
    print(x_data_train)
    print(x_data_test)
    print(y_data_train)
    print(y_data_test)
    print(x[0, 0])
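# The manual 800/200 slicing in case2() can also be expressed with scikit-learn's
# train_test_split; a small equivalent sketch (shuffle=False keeps the
# first-800/last-200 ordering, since train_test_split shuffles by default):
def case2_split_sketch():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    x, y = make_classification(n_samples=1000, n_features=2, n_redundant=0,
                               n_informative=1, n_clusters_per_class=1)
    return train_test_split(x, y, test_size=0.2, shuffle=False)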
def test_visualize():
pytest.importorskip('graphviz')
X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2,
random_state=0)
clf = SVC(random_state=0)
grid = {'C': [.1, .5, .9]}
gs = dcv.GridSearchCV(clf, grid).fit(X, y)
assert hasattr(gs, 'dask_graph_')
with tmpdir() as d:
gs.visualize(filename=os.path.join(d, 'mydask'))
assert os.path.exists(os.path.join(d, 'mydask.png'))
# Doesn't work if not fitted
gs = dcv.GridSearchCV(clf, grid)
with pytest.raises(NotFittedError):
gs.visualize()
def test_feature_union_fit_failure_multiple_metrics():
scoring = {"score_1": _passthrough_scorer, "score_2": _passthrough_scorer}
X, y = make_classification(n_samples=100, n_features=10, random_state=0)
pipe = Pipeline([('union', FeatureUnion([('good', MockClassifier()),
('bad', FailingClassifier())],
transformer_weights={'bad': 0.5})),
('clf', MockClassifier())])
grid = {'union__bad__parameter': [0, 1, 2]}
gs = dcv.GridSearchCV(pipe, grid, refit=False, scoring=scoring)
# Check that failure raises if error_score is `'raise'`
with pytest.raises(ValueError):
gs.fit(X, y)
# Check that grid scores were set to error_score on failure
gs.error_score = float('nan')
with pytest.warns(FitFailedWarning):
gs.fit(X, y)
for key in scoring:
check_scores_all_nan(gs, 'union__bad__parameter', score_key=key)
def test_pipeline_fit_failure():
X, y = make_classification(n_samples=100, n_features=10, random_state=0)
pipe = Pipeline([('bad', FailingClassifier()),
('good1', MockClassifier()),
('good2', MockClassifier())])
grid = {'bad__parameter': [0, 1, 2]}
gs = dcv.GridSearchCV(pipe, grid, refit=False)
# Check that failure raises if error_score is `'raise'`
with pytest.raises(ValueError):
gs.fit(X, y)
# Check that grid scores were set to error_score on failure
gs.error_score = float('nan')
with pytest.warns(FitFailedWarning):
gs.fit(X, y)
check_scores_all_nan(gs, 'bad__parameter')
def test_feature_union_raises():
X, y = make_classification(n_samples=100, n_features=10, random_state=0)
union = FeatureUnion([('tr0', MockClassifier()),
('tr1', MockClassifier())])
pipe = Pipeline([('union', union), ('est', MockClassifier())])
grid = {'union__tr2__parameter': [0, 1, 2]}
gs = dcv.GridSearchCV(pipe, grid, refit=False)
with pytest.raises(ValueError):
gs.fit(X, y)
grid = {'union__transformer_list': [[('one', MockClassifier())]]}
gs = dcv.GridSearchCV(pipe, grid, refit=False)
with pytest.raises(NotImplementedError):
gs.fit(X, y)
def dataset_generator():
"""
generate dataset for binary classification
:return:
"""
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
datasets = [make_moons(noise=0.3, random_state=0),
make_circles(noise=0.2, factor=0.5, random_state=1),
linearly_separable
]
X, y = datasets[0]
y[y == 0] = -1
X = StandardScaler().fit_transform(X)
return X, y
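# Usage sketch for dataset_generator(): the returned labels are remapped to
# {-1, +1}, which is what margin-based learners (e.g. a from-scratch perceptron
# or SVM) typically expect.
def _dataset_generator_usage_sketch():
    import numpy as np
    X, y = dataset_generator()
    assert X.shape[1] == 2                    # 2-D points from make_moons
    assert set(np.unique(y)) == {-1, 1}       # labels remapped for margin-based learners
    return X, y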
def classification():
# Generate a random binary classification problem.
X, y = make_classification(n_samples=350, n_features=15, n_informative=10,
random_state=1111, n_classes=2,
class_sep=1., n_redundant=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
random_state=1111)
model = GradientBoostingClassifier(n_estimators=50, max_depth=4,
max_features=8, learning_rate=0.1)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(predictions)
print(predictions.min())
print(predictions.max())
print('classification, roc auc score: %s'
% roc_auc_score(y_test, predictions))
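# ROC AUC is more informative when computed from scores or probabilities rather
# than hard labels. A sketch using scikit-learn's GradientBoostingClassifier
# (not necessarily the implementation under test above):
def classification_proba_auc_sketch(X_train, X_test, y_train, y_test):
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.metrics import roc_auc_score
    model = GradientBoostingClassifier(n_estimators=50, max_depth=4,
                                       learning_rate=0.1, random_state=1111)
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)[:, 1]
    print('classification, roc auc score (probabilities): %s'
          % roc_auc_score(y_test, proba))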
def test_importances_gini_equal_mse():
# Check that gini is equivalent to mse for binary output variable
X, y = datasets.make_classification(n_samples=2000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
shuffle=False,
random_state=0)
    # The gini index and the mean squared error (variance) might differ due
    # to numerical instability. Since those instabilities mainly occur at
    # high tree depth, we restrict the maximal depth here.
clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
random_state=0).fit(X, y)
reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
random_state=0).fit(X, y)
assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
assert_array_equal(clf.tree_.feature, reg.tree_.feature)
assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)
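# NOTE: in scikit-learn >= 1.0 the regression criterion "mse" is deprecated
# (and removed in 1.2) in favour of "squared_error"; on recent versions the
# regressor above would be constructed as:
#     reg = DecisionTreeRegressor(criterion="squared_error", max_depth=5,
#                                 random_state=0).fit(X, y)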
def test_importances():
# Check variable importances.
X, y = datasets.make_classification(n_samples=2000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
shuffle=False,
random_state=1)
for alg in ['SAMME', 'SAMME.R']:
clf = AdaBoostClassifier(algorithm=alg)
clf.fit(X, y)
importances = clf.feature_importances_
assert_equal(importances.shape[0], 10)
assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(),
True)
def test_grid_search_labels():
    # Check that a ValueError (raised when labels is None) propagates to
    # GridSearchCV, and that labels is correctly passed on to the cv object.
rng = np.random.RandomState(0)
X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
labels = rng.randint(0, 3, 15)
clf = LinearSVC(random_state=0)
grid = {'C': [1]}
label_cvs = [LeaveOneLabelOut(), LeavePLabelOut(2), LabelKFold(),
LabelShuffleSplit()]
for cv in label_cvs:
gs = GridSearchCV(clf, grid, cv=cv)
assert_raise_message(ValueError,
"The labels parameter should not be None",
gs.fit, X, y)
gs.fit(X, y, labels)
non_label_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
for cv in non_label_cvs:
gs = GridSearchCV(clf, grid, cv=cv)
# Should not raise an error
gs.fit(X, y)
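# NOTE: in current scikit-learn the label-based splitters used above are the
# Group* classes (LeaveOneGroupOut, LeavePGroupsOut, GroupKFold,
# GroupShuffleSplit), and the third fit argument is passed by keyword:
#     gs.fit(X, y, groups=labels)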
def test_grid_search_sparse():
# Test that grid search works with both dense and sparse matrices
X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
clf = LinearSVC()
cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
cv.fit(X_[:180], y_[:180])
y_pred = cv.predict(X_[180:])
C = cv.best_estimator_.C
X_ = sp.csr_matrix(X_)
clf = LinearSVC()
cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
cv.fit(X_[:180].tocoo(), y_[:180])
y_pred2 = cv.predict(X_[180:])
C2 = cv.best_estimator_.C
assert_true(np.mean(y_pred == y_pred2) >= .9)
assert_equal(C, C2)
def test_learning_curve():
X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
n_redundant=0, n_classes=2,
n_clusters_per_class=1, random_state=0)
estimator = MockImprovingEstimator(20)
with warnings.catch_warnings(record=True) as w:
train_sizes, train_scores, test_scores = learning_curve(
estimator, X, y, cv=3, train_sizes=np.linspace(0.1, 1.0, 10))
if len(w) > 0:
raise RuntimeError("Unexpected warning: %r" % w[0].message)
assert_equal(train_scores.shape, (10, 3))
assert_equal(test_scores.shape, (10, 3))
assert_array_equal(train_sizes, np.linspace(2, 20, 10))
assert_array_almost_equal(train_scores.mean(axis=1),
np.linspace(1.9, 1.0, 10))
assert_array_almost_equal(test_scores.mean(axis=1),
np.linspace(0.1, 1.0, 10))
def test_learning_curve_verbose():
X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
n_redundant=0, n_classes=2,
n_clusters_per_class=1, random_state=0)
estimator = MockImprovingEstimator(20)
old_stdout = sys.stdout
sys.stdout = StringIO()
try:
train_sizes, train_scores, test_scores = \
learning_curve(estimator, X, y, cv=3, verbose=1)
finally:
out = sys.stdout.getvalue()
sys.stdout.close()
sys.stdout = old_stdout
assert("[learning_curve]" in out)
def test_learning_curve_batch_and_incremental_learning_are_equal():
X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
n_redundant=0, n_classes=2,
n_clusters_per_class=1, random_state=0)
train_sizes = np.linspace(0.2, 1.0, 5)
estimator = PassiveAggressiveClassifier(n_iter=1, shuffle=False)
train_sizes_inc, train_scores_inc, test_scores_inc = \
learning_curve(
estimator, X, y, train_sizes=train_sizes,
cv=3, exploit_incremental_learning=True)
train_sizes_batch, train_scores_batch, test_scores_batch = \
learning_curve(
estimator, X, y, cv=3, train_sizes=train_sizes,
exploit_incremental_learning=False)
assert_array_equal(train_sizes_inc, train_sizes_batch)
assert_array_almost_equal(train_scores_inc.mean(axis=1),
train_scores_batch.mean(axis=1))
assert_array_almost_equal(test_scores_inc.mean(axis=1),
test_scores_batch.mean(axis=1))
def test_l1_ratio():
# Test if l1 ratio extremes match L1 and L2 penalty settings.
X, y = datasets.make_classification(n_samples=1000,
n_features=100, n_informative=20,
random_state=1234)
# test if elasticnet with l1_ratio near 1 gives same result as pure l1
est_en = SGDClassifier(alpha=0.001, penalty='elasticnet',
l1_ratio=0.9999999999, random_state=42).fit(X, y)
est_l1 = SGDClassifier(alpha=0.001, penalty='l1', random_state=42).fit(X, y)
assert_array_almost_equal(est_en.coef_, est_l1.coef_)
# test if elasticnet with l1_ratio near 0 gives same result as pure l2
est_en = SGDClassifier(alpha=0.001, penalty='elasticnet',
l1_ratio=0.0000000001, random_state=42).fit(X, y)
est_l2 = SGDClassifier(alpha=0.001, penalty='l2', random_state=42).fit(X, y)
assert_array_almost_equal(est_en.coef_, est_l2.coef_)
def test_liblinear_dual_random_state():
# random_state is relevant for liblinear solver only if dual=True
X, y = make_classification(n_samples=20)
lr1 = LogisticRegression(random_state=0, dual=True, max_iter=1, tol=1e-15)
lr1.fit(X, y)
lr2 = LogisticRegression(random_state=0, dual=True, max_iter=1, tol=1e-15)
lr2.fit(X, y)
lr3 = LogisticRegression(random_state=8, dual=True, max_iter=1, tol=1e-15)
lr3.fit(X, y)
# same result for same random state
assert_array_almost_equal(lr1.coef_, lr2.coef_)
# different results for different random states
msg = "Arrays are not almost equal to 6 decimals"
assert_raise_message(AssertionError, msg,
assert_array_almost_equal, lr1.coef_, lr3.coef_)
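# NOTE: since scikit-learn 0.22 the default LogisticRegression solver is
# 'lbfgs', which does not support dual=True; on recent versions the estimators
# above need the solver spelled out explicitly, e.g.:
#     lr1 = LogisticRegression(solver='liblinear', random_state=0, dual=True,
#                              max_iter=1, tol=1e-15)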
def test_logistic_regression_solvers():
X, y = make_classification(n_features=10, n_informative=5, random_state=0)
ncg = LogisticRegression(solver='newton-cg', fit_intercept=False)
lbf = LogisticRegression(solver='lbfgs', fit_intercept=False)
lib = LogisticRegression(fit_intercept=False)
sag = LogisticRegression(solver='sag', fit_intercept=False,
random_state=42)
ncg.fit(X, y)
lbf.fit(X, y)
sag.fit(X, y)
lib.fit(X, y)
assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=3)
assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=3)
assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=3)
assert_array_almost_equal(sag.coef_, lib.coef_, decimal=3)
assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=3)
assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=3)
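# Newer scikit-learn versions also ship the 'saga' solver, which minimizes the
# same objective; a hedged sketch extending the closeness check above (the
# decimal=3 tolerance mirrors the sag comparison and is an assumption here):
def test_logistic_regression_saga_sketch():
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from numpy.testing import assert_array_almost_equal
    X, y = make_classification(n_features=10, n_informative=5, random_state=0)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False).fit(X, y)
    saga = LogisticRegression(solver='saga', fit_intercept=False,
                              random_state=42, max_iter=5000).fit(X, y)
    assert_array_almost_equal(saga.coef_, lbf.coef_, decimal=3)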