def test_logistic_regression_solvers_multiclass():
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)
    tol = 1e-6
    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False, tol=tol)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False, tol=tol)
    lib = LogisticRegression(fit_intercept=False, tol=tol)
    sag = LogisticRegression(solver='sag', fit_intercept=False, tol=tol,
                             max_iter=1000, random_state=42)
    ncg.fit(X, y)
    lbf.fit(X, y)
    sag.fit(X, y)
    lib.fit(X, y)
    assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=4)
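# A minimal, self-contained sketch of the pattern the test above exercises:
# fit the same multiclass problem with two LogisticRegression solvers and
# confirm the coefficients agree to within a few decimals. Only public
# scikit-learn APIs are used; the sample size and tolerance here are
# illustrative assumptions, not values from the original test suite.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=20, n_informative=10,
                           n_classes=3, random_state=0)
coef_lbfgs = LogisticRegression(solver='lbfgs', tol=1e-6,
                                max_iter=1000).fit(X, y).coef_
coef_ncg = LogisticRegression(solver='newton-cg', tol=1e-6,
                              max_iter=1000).fit(X, y).coef_
# Both solvers minimize the same penalized objective, so the fitted
# coefficients should be nearly identical.
np.testing.assert_allclose(coef_lbfgs, coef_ncg, atol=1e-3)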
def test_logreg_predict_proba_multinomial():
    X, y = make_classification(n_samples=10, n_features=20, random_state=0,
                               n_classes=3, n_informative=10)
    # Predicted probabilities using the true-entropy loss should give a
    # smaller loss than those using the ovr method.
    clf_multi = LogisticRegression(multi_class="multinomial", solver="lbfgs")
    clf_multi.fit(X, y)
    clf_multi_loss = log_loss(y, clf_multi.predict_proba(X))
    clf_ovr = LogisticRegression(multi_class="ovr", solver="lbfgs")
    clf_ovr.fit(X, y)
    clf_ovr_loss = log_loss(y, clf_ovr.predict_proba(X))
    assert_greater(clf_ovr_loss, clf_multi_loss)
    # Predicted probabilities using the softmax function should give a
    # smaller loss than those using the logistic function.
    clf_multi_loss = log_loss(y, clf_multi.predict_proba(X))
    clf_wrong_loss = log_loss(y, clf_multi._predict_proba_lr(X))
    assert_greater(clf_wrong_loss, clf_multi_loss)
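# The same comparison using only public APIs: a sketch, assuming a recent
# scikit-learn where assert_greater is unavailable and the private
# _predict_proba_lr should be avoided. A plain assert and
# sklearn.metrics.log_loss stand in for the test utilities above.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

X, y = make_classification(n_samples=100, n_features=20, n_informative=10,
                           n_classes=3, random_state=0)
multi = LogisticRegression(multi_class="multinomial", solver="lbfgs").fit(X, y)
ovr = LogisticRegression(multi_class="ovr", solver="lbfgs").fit(X, y)
# The multinomial model optimizes the joint cross-entropy directly, so its
# training log-loss should be smaller than the one-vs-rest model's.
assert log_loss(y, multi.predict_proba(X)) < log_loss(y, ovr.predict_proba(X))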
def test_mean_variance_illegal_axis():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_csr = sp.csr_matrix(X)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-3)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=2)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-1)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-3,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=2,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-1,
                  last_mean=None, last_var=None, last_n=None)
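# For contrast, a valid call: mean_variance_axis accepts axis 0 or 1 on a
# CSR/CSC matrix and returns per-axis means and (population) variances.
# A minimal sketch using the public scikit-learn utility; the toy matrix
# is an illustrative assumption.
import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

X = sp.csr_matrix(np.array([[1.0, 0.0], [3.0, 4.0]]))
means, variances = mean_variance_axis(X, axis=0)  # column-wise statistics
np.testing.assert_allclose(means, [2.0, 2.0])
np.testing.assert_allclose(variances, [1.0, 4.0])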
def test_model_assessment():
    X, y = make_classification(n_samples=40, n_features=100, n_informative=2,
                               n_classes=2, n_redundant=0)
    pipe = Pipeline([('enet', ElasticNetFeatureSelection()),
                     ('ridge', RidgeClassifier())])
    ma = ModelAssessment(GridSearchCV(pipe, {'enet__l1_ratio': [2]})).fit(X, y)
    # l1_ratio=2 is outside the valid [0, 1] range, so every inner fit fails
    # and no cross-validation results are collected.
    assert len(ma.cv_results_) == 0
def test_db_logger(self):
    X, y = datasets.make_classification(random_state=42)
    task = Task("class_split", X, y, "classification", test_size=0.1, random_state=42)
    scorer = Scorer("auc_error", lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
    logger = DBLogger(task, self.engine)
    optimizer = XGBoostOptimizer(task, scorer, logger)
    optimizer.start_optimization(max_evals=10)
    self.assertEqual(len(list(logger.load_all_results())), 10)
def test_file_logger(self):
    X, y = datasets.make_classification(random_state=42)
    task = Task("class_split", X, y, "classification", test_size=0.1, random_state=42)
    scorer = Scorer("auc_error", lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
    logger = FileLogger(task)
    optimizer = XGBoostOptimizer(task, scorer, logger)
    optimizer.start_optimization(max_evals=10)
    self.assertEqual(len(list(logger.load_all_results())), 10)
    os.remove(task.name + ".log")
def sk_generate_random_classification_set(self, samples, features, classes, informative, rds, dbs, debug=False):
    record = {
        "Test": {
            "X": {},
            "Y": {}
        },
        "Train": {
            "X": {},
            "Y": {}
        }
    }
    results = self.build_def_hash("Display Error", "Not Run", record)
    try:
        from sklearn.datasets import make_classification
        self.lg("Processing ROC", 6)
        X, Y = make_classification(n_samples=samples,
                                   n_features=features,
                                   n_classes=classes,
                                   n_informative=informative)
        # The hard-coded index assumes samples > 9000: rows past 9000 become
        # the test set, the rest the training set.
        record["Test"]["X"] = X[9000:]
        record["Test"]["Y"] = Y[9000:]
        record["Train"]["X"] = X[:9000]
        record["Train"]["Y"] = Y[:9000]
        results = self.build_def_hash("SUCCESS", "", record)
    except Exception as k:
        err_msg = "Unable to Generate Random Classification set with Ex(" + str(k) + ")"
        self.lg("ERROR: " + str(err_msg), 0)
        results = self.build_def_hash("Display Error", err_msg, {})
    # end of try/ex
    return results
# end of sk_generate_random_classification_set
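# The fixed split index above only works when samples exceeds 9000. A
# proportional split via scikit-learn's train_test_split avoids that
# assumption; this is a standalone sketch, not part of the original class.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, Y = make_classification(n_samples=10000, n_features=20, n_classes=2,
                           n_informative=10)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1,
                                                    random_state=42)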
def generate_multiclass_dataset(n_samples=100, n_features=10,
                                n_informative=5, n_redundant=3, n_repeated=2,
                                n_classes=2, n_clusters_per_class=2,
                                weights=None, flip_y=0.01, class_sep=1.0,
                                hypercube=True, shift=0.0, scale=1.0,
                                shuffle=True, random_state=None, hot_encoded=True,
                                partitions_proportions=None, negative_labels=-1.):
    X, y = sk_dt.make_classification(n_samples=n_samples, n_features=n_features,
                                     n_informative=n_informative, n_redundant=n_redundant,
                                     n_repeated=n_repeated, n_classes=n_classes,
                                     n_clusters_per_class=n_clusters_per_class,
                                     weights=weights, flip_y=flip_y, class_sep=class_sep,
                                     hypercube=hypercube, shift=shift, scale=scale,
                                     shuffle=True, random_state=random_state)
    if hot_encoded:
        y = to_one_hot_enc(y)
    else:
        y[y == 0] = negative_labels
    res = Dataset(data=np.array(X, dtype=np.float32), target=np.array(y, dtype=np.float32),
                  info={'n_informative': n_informative, 'n_redundant': n_redundant,
                        'n_repeated': n_repeated,
                        'n_classes': n_classes, 'n_clusters_per_class': n_clusters_per_class,
                        'weights': weights, 'flip_y': flip_y, 'class_sep': class_sep,
                        'hypercube': hypercube, 'shift': shift, 'scale': scale,
                        'shuffle': True, 'random_state': random_state})
    np.random.seed(random_state)
    if partitions_proportions:
        res = redivide_data([res], shuffle=shuffle, partition_proportions=partitions_proportions)
        res = Datasets.from_list(res)
    return res
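# A usage sketch. Dataset, Datasets, to_one_hot_enc, and redivide_data are
# helpers local to this module (assumptions here, not scikit-learn APIs);
# the attribute names on the returned object follow the Dataset constructor
# call above.
ds = generate_multiclass_dataset(n_samples=200, n_features=10, n_classes=3,
                                 n_informative=5, hot_encoded=True,
                                 random_state=0)
print(ds.data.shape)    # (200, 10), float32 features
print(ds.target.shape)  # (200, 3), one-hot encoded targets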
def test_grid_search_dask_inputs():
    # Numpy versions
    np_X, np_y = make_classification(n_samples=15, n_classes=2, random_state=0)
    np_groups = np.random.RandomState(0).randint(0, 3, 15)
    # Dask array versions
    da_X = da.from_array(np_X, chunks=5)
    da_y = da.from_array(np_y, chunks=5)
    da_groups = da.from_array(np_groups, chunks=5)
    # Delayed versions
    del_X = delayed(np_X)
    del_y = delayed(np_y)
    del_groups = delayed(np_groups)
    cv = GroupKFold()
    clf = SVC(random_state=0)
    grid = {'C': [1]}
    sol = SVC(C=1, random_state=0).fit(np_X, np_y).support_vectors_
    for X, y, groups in product([np_X, da_X, del_X],
                                [np_y, da_y, del_y],
                                [np_groups, da_groups, del_groups]):
        gs = dcv.GridSearchCV(clf, grid, cv=cv)
        with pytest.raises(ValueError) as exc:
            gs.fit(X, y)
        assert "parameter should not be None" in str(exc.value)
        gs.fit(X, y, groups=groups)
        np.testing.assert_allclose(sol, gs.best_estimator_.support_vectors_)
def test_bad_error_score():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                          error_score='badparam')
    with pytest.raises(ValueError):
        gs.fit(X, y)
def test_cache_cv():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    X2 = X.view(CountTakes)
    gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                          cv=3, cache_cv=False, scheduler='sync')
    gs.fit(X2, y)
    assert X2.count == 2 * 3 * 3  # (1 train + 1 test) * n_params * n_splits
    X2 = X.view(CountTakes)
    assert X2.count == 0
    gs.cache_cv = True
    gs.fit(X2, y)
    assert X2.count == 2 * 3  # (1 train + 1 test) * n_splits
def test_scheduler_param(scheduler, n_jobs, get):
    if scheduler == 'multiprocessing':
        mp = pytest.importorskip('dask.multiprocessing')
        get = mp.get
    assert _normalize_scheduler(scheduler, n_jobs) is get

    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]}, cv=3,
                          scheduler=scheduler, n_jobs=n_jobs)
    gs.fit(X, y)
def test_scheduler_param_distributed(loop):
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop, set_as_default=False) as client:
            gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                                  cv=3, scheduler=client)
            gs.fit(X, y)
def test_cv_multiplemetrics_requires_refit_metric():
    X, y = make_classification(random_state=0)
    param_grid = {'max_depth': [1, 5]}
    a = dcv.GridSearchCV(RandomForestClassifier(), param_grid, refit=True,
                         scoring={'score1': 'accuracy', 'score2': 'accuracy'})
    with pytest.raises(ValueError):
        a.fit(X, y)
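# With a dict of scoring metrics, refit=True is ambiguous, which is why the
# test above expects a ValueError. Naming one metric key resolves it. A
# sketch of the passing variant, following scikit-learn's multi-metric refit
# rule and assuming dcv is dask_searchcv imported as in the surrounding tests.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(random_state=0)
ok = dcv.GridSearchCV(RandomForestClassifier(), {'max_depth': [1, 5]},
                      refit='score1',
                      scoring={'score1': 'accuracy', 'score2': 'accuracy'})
ok.fit(X, y)  # best_estimator_ is refit using the 'score1' metric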
def test_cv_multiplemetrics_no_refit():
    X, y = make_classification(random_state=0)
    param_grid = {'max_depth': [1, 5]}
    a = dcv.GridSearchCV(RandomForestClassifier(), param_grid, refit=False,
                         scoring={'score1': 'accuracy', 'score2': 'accuracy'})
    b = GridSearchCV(RandomForestClassifier(), param_grid, refit=False,
                     scoring={'score1': 'accuracy', 'score2': 'accuracy'})
    assert hasattr(a, 'best_index_') is hasattr(b, 'best_index_')
    assert hasattr(a, 'best_estimator_') is hasattr(b, 'best_estimator_')
    assert hasattr(a, 'best_score_') is hasattr(b, 'best_score_')
def make_test_data():
    from sklearn.datasets import make_classification
    import pandas as pd
    # make_classification returns (X, y); only the feature matrix is kept.
    data = make_classification(n_samples=3, n_features=4)
    data = data[0]
    df = pd.DataFrame(data, columns=list("ABCD"))
    prepare_path(test_data_file)
    df.to_csv(test_data_file, sep='\t', index=False)
def test_partial_fit_equivalence():
    X, y = make_regression(random_state=0, n_samples=100)
    mtr = MondrianTreeRegressor(random_state=0)
    mtr.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mtr, 0, X, y)

    X, y = make_classification(random_state=0, n_samples=100)
    mtc = MondrianTreeClassifier(random_state=0)
    mtc.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mtc, 0, X, y, is_clf=True)
def test_partial_fit_equivalence():
    X, y = make_regression(random_state=0, n_samples=100)
    mfr = MondrianForestRegressor(random_state=0)
    mfr.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mfr, 0, X, y)

    X, y = make_classification(random_state=0, n_samples=100)
    mfc = MondrianForestClassifier(random_state=0)
    mfc.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mfc, 0, X, y, is_clf=True)
def get_sample_dataset(dataset_properties):
    """Returns a sample dataset.

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties
            of the dataset used to verify the estimator and metric generators.

    Returns:
        X (array-like): Features array
        y (array-like): Labels array
        splits (iterator): This is an iterator that returns train test splits
            for cross-validation purposes on ``X`` and ``y``.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'boston':
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits
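# A usage sketch. The exceptions module and the expected property keys are
# local to this project (assumptions); all remaining kwargs are forwarded
# straight to sklearn.datasets.make_classification.
props = {'type': 'multiclass', 'n_samples': 100, 'n_classes': 3,
         'n_informative': 3}
X, y, splits = get_sample_dataset(props)
for train_idx, test_idx in splits:
    print(X[train_idx].shape, X[test_idx].shape)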