def __init__(self, l1_ratios=(0.4, 0.8, 0.95), num_alphas=30,
eps=1e-5, random_state=None, strategies=None, target_score=0.01,
n_tail=5, decision="min", max_complexity=50,
exponents=[1, 2], operators={}, n_jobs=1, rational=True, **kw):
self.l1_ratios = l1_ratios
self.num_alphas = num_alphas
self.eps = eps
self.random_state = check_random_state(random_state)
self.strategies = strategies
self.target_score = target_score
self.n_tail = n_tail
self.exponents = exponents
self.operators = operators
self.kw = kw
self.decision = decision
self.max_complexity = max_complexity
self.n_jobs = n_jobs
self.rational = rational
python类check_random_state()的实例源码
def create(cls, random_state=None):
"""
Each gene is picked with a uniform distribution from all allowed inputs or functions.
:param random_state: an instance of np.random.RandomState, a seed integer or None
:return: A random new class instance.
"""
random_state = check_random_state(random_state)
n_in = len(cls.pset.terminals)
operator_keys = list(range(n_in, max(cls.pset.mapping) + 1))
code = []
for i in range(cls.n_columns):
column = []
for j in range(cls.n_rows):
index = _code_index(n_in, cls.n_rows, i, j)
in_ = cls._valid_inputs[index]
gene = [random_state.choice(operator_keys)] + [random_state.choice(in_) for _ in range(cls.pset.max_arity)]
column.append(gene)
code.append(column)
outputs = [random_state.choice(cls._valid_inputs[_out_index(cls.n_rows, cls.n_columns, n_in, o)])
for o in range(cls.n_out)]
return cls(code, outputs)
def test_KNeighborsRegressor_multioutput_uniform_weight():
# Test k-neighbors in multi-output regression with uniform weight
rng = check_random_state(0)
n_features = 5
n_samples = 40
n_output = 4
X = rng.rand(n_samples, n_features)
y = rng.rand(n_samples, n_output)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
for algorithm, weights in product(ALGORITHMS, [None, 'uniform']):
knn = neighbors.KNeighborsRegressor(weights=weights,
algorithm=algorithm)
knn.fit(X_train, y_train)
neigh_idx = knn.kneighbors(X_test, return_distance=False)
y_pred_idx = np.array([np.mean(y_train[idx], axis=0)
for idx in neigh_idx])
y_pred = knn.predict(X_test)
assert_equal(y_pred.shape, y_test.shape)
assert_equal(y_pred_idx.shape, y_test.shape)
assert_array_almost_equal(y_pred, y_pred_idx)
def test_parallel_train():
rng = check_random_state(12321)
n_samples, n_features = 80, 30
X_train = rng.randn(n_samples, n_features)
y_train = rng.randint(0, 2, n_samples)
clfs = [
RandomForestClassifier(n_estimators=20, n_jobs=n_jobs,
random_state=12345).fit(X_train, y_train)
for n_jobs in [1, 2, 3, 8, 16, 32]
]
X_test = rng.randn(n_samples, n_features)
probas = [clf.predict_proba(X_test) for clf in clfs]
for proba1, proba2 in zip(probas, probas[1:]):
assert_array_almost_equal(proba1, proba2)
def test_multioutput_regression_invariance_to_dimension_shuffling():
# test invariance to dimension shuffling
random_state = check_random_state(0)
y_true = random_state.uniform(0, 2, size=(20, 5))
y_pred = random_state.uniform(0, 2, size=(20, 5))
for name in MULTIOUTPUT_METRICS:
metric = ALL_METRICS[name]
error = metric(y_true, y_pred)
for _ in range(3):
perm = random_state.permutation(y_true.shape[1])
assert_almost_equal(metric(y_true[:, perm], y_pred[:, perm]),
error,
err_msg="%s is not dimension shuffling "
"invariant" % name)
def check_zero_or_all_relevant_labels(lrap_score):
random_state = check_random_state(0)
for n_labels in range(2, 5):
y_score = random_state.uniform(size=(1, n_labels))
y_score_ties = np.zeros_like(y_score)
# No relevant labels
y_true = np.zeros((1, n_labels))
assert_equal(lrap_score(y_true, y_score), 1.)
assert_equal(lrap_score(y_true, y_score_ties), 1.)
# Only relevant labels
y_true = np.ones((1, n_labels))
assert_equal(lrap_score(y_true, y_score), 1.)
assert_equal(lrap_score(y_true, y_score_ties), 1.)
# Degenerate case: only one label
assert_almost_equal(lrap_score([[1], [0], [1], [0]],
[[0.5], [0.5], [0.5], [0.5]]), 1.)
def check_alternative_lrap_implementation(lrap_score, n_classes=5,
n_samples=20, random_state=0):
_, y_true = make_multilabel_classification(n_features=1,
allow_unlabeled=False,
random_state=random_state,
n_classes=n_classes,
n_samples=n_samples)
# Score with ties
y_score = sparse_random_matrix(n_components=y_true.shape[0],
n_features=y_true.shape[1],
random_state=random_state)
if hasattr(y_score, "toarray"):
y_score = y_score.toarray()
score_lrap = label_ranking_average_precision_score(y_true, y_score)
score_my_lrap = _my_lrap(y_true, y_score)
assert_almost_equal(score_lrap, score_my_lrap)
# Uniform score
random_state = check_random_state(random_state)
y_score = random_state.uniform(size=(n_samples, n_classes))
score_lrap = label_ranking_average_precision_score(y_true, y_score)
score_my_lrap = _my_lrap(y_true, y_score)
assert_almost_equal(score_lrap, score_my_lrap)
def _check_inputs(self, X, accept_sparse_negative=False):
if isinstance(X, (pd.DataFrame, dd.DataFrame)):
X = X.values
if isinstance(X, np.ndarray):
C = len(X) // min(multiprocessing.cpu_count(), 2)
X = da.from_array(X, chunks=C)
rng = check_random_state(self.random_state)
# TODO: non-float dtypes?
# TODO: sparse arrays?
# TODO: mix of sparse, dense?
sample = rng.uniform(size=(5, X.shape[1])).astype(X.dtype)
super(QuantileTransformer, self)._check_inputs(
sample, accept_sparse_negative=accept_sparse_negative)
return X
def __init__(self, configuration, task, random_state=None):
self.configuration = configuration
self.task = task
self._output_dtype = np.float32
if random_state is None:
self.random_state = check_random_state(1)
else:
self.random_state = check_random_state(random_state)
def setUp(self):
iris = datasets.load_iris()
rng = check_random_state(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
self.iris = iris
def setUp(self):
iris = datasets.load_iris()
rng = check_random_state(0)
iris.data = iris.data
iris.target = iris.target
self.iris = iris
for csv_file in glob.glob("*.csv"):
os.remove(csv_file)
def __init__(self, operators=None, n_const=0, n_rows=1, n_columns=3, n_back=1, max_iter=1000,
max_nfev=10000, lambda_=4, f_tol=0, seed=None, random_state=None, n_jobs=1, metric=mean_squared_error):
"""
:param operators: list of primitive excluding terminals
:param n_const: number of symbolic constants
:param n_rows: number of rows in the code block
:param n_columns: number of columns in the code block
:param n_back: number of rows to look back for connections
:param metric: what to optimize for
:param fun: `callable(individual)`, function to be optimized
:param random_state: an instance of np.random.RandomState, a seed integer or None
:param cls: The base class for individuals
:type cls: (optional) instance of cartesian.cgp.Cartesian
:param seed: (optional) can be passed instead of cls.
:param lambda_: number of offspring per generation
:param max_iter: maximum number of generations
:param max_nfev: maximum number of function evaluations. Important, if fun is another optimizer
:param f_tol: threshold for precision
:param n_jobs: number of jobs for joblib embarrassingly easy parallel
"""
self.operators = DEFAULT_PRIMITIVES or operators
self.constants = [Constant("c_{}".format(i)) for i in range(n_const)]
self.n_rows = n_rows
self.n_back = n_back
self.n_columns = n_columns
self.n_out = None
self.pset = None
self.res = None
self.model = None
# parameters for algorithm
self.max_nfev = max_nfev
self.max_iter = max_iter
self.lambda_ = lambda_
self.f_tol = f_tol
self.metric = metric
self.random_state = check_random_state(random_state)
self.n_jobs = n_jobs
self.seed = seed
def point_mutation(individual, random_state=None):
"""
Randomly pick a gene in individual and mutate it.
The mutation is either rewiring, i.e. changing the inputs, or changing the operator (head of gene)
:param individual: instance of Base
:type individual: instance of Cartesian
:param random_state: an instance of np.random.RandomState, a seed integer or None
:return: new instance of Base
"""
random_state = check_random_state(random_state)
n_terminals = len(individual.pset.terminals)
i = random_state.randint(n_terminals, len(individual) - 1)
el, c, r, l = individual.mapping[i]
gene = l[r]
if isinstance(gene, list):
new_gene = gene[:]
j = random_state.randint(0, len(gene))
if j == 0: # function
new_j = individual.pset.imapping[random_state.choice(individual.pset.operators)]
else: # input
new_j = random_state.choice(individual._valid_inputs[i])
new_gene[j] = new_j
else: # output gene
new_gene = random_state.randint(0, len(individual) - individual.n_out - 1)
new_individual = copy.copy(individual)
new_individual[i] = new_gene
return new_individual
def test_algorithm_success(individual):
cls = type(individual)
fun = lambda x: 0
rng = check_random_state(0)
res = oneplus(fun, random_state=rng, lambda_=4, max_iter=2, f_tol=-1, cls=cls)
assert res.success == False
res = oneplus(fun, random_state=rng, lambda_=4, max_iter=0, f_tol=0, cls=cls)
assert res.success == True
res = oneplus(fun, random_state=rng, lambda_=4, max_nfev=1, f_tol=-1, cls=cls)
assert res.success == False
def test_only_constant_features():
random_state = check_random_state(0)
X = np.zeros((10, 20))
y = random_state.randint(0, 2, (10, ))
for name, TreeEstimator in ALL_TREES.items():
est = TreeEstimator(random_state=0)
est.fit(X, y)
assert_equal(est.tree_.max_depth, 0)
def test_RadiusNeighborsClassifier_multioutput():
# Test k-NN classifier on multioutput data
rng = check_random_state(0)
n_features = 2
n_samples = 40
n_output = 3
X = rng.rand(n_samples, n_features)
y = rng.randint(0, 3, (n_samples, n_output))
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
weights = [None, 'uniform', 'distance', _weight_func]
for algorithm, weights in product(ALGORITHMS, weights):
# Stack single output prediction
y_pred_so = []
for o in range(n_output):
rnn = neighbors.RadiusNeighborsClassifier(weights=weights,
algorithm=algorithm)
rnn.fit(X_train, y_train[:, o])
y_pred_so.append(rnn.predict(X_test))
y_pred_so = np.vstack(y_pred_so).T
assert_equal(y_pred_so.shape, y_test.shape)
# Multioutput prediction
rnn_mo = neighbors.RadiusNeighborsClassifier(weights=weights,
algorithm=algorithm)
rnn_mo.fit(X_train, y_train)
y_pred_mo = rnn_mo.predict(X_test)
assert_equal(y_pred_mo.shape, y_test.shape)
assert_array_almost_equal(y_pred_mo, y_pred_so)
def test_RadiusNeighborsRegressor_multioutput_with_uniform_weight():
# Test radius neighbors in multi-output regression (uniform weight)
rng = check_random_state(0)
n_features = 5
n_samples = 40
n_output = 4
X = rng.rand(n_samples, n_features)
y = rng.rand(n_samples, n_output)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
for algorithm, weights in product(ALGORITHMS, [None, 'uniform']):
rnn = neighbors. RadiusNeighborsRegressor(weights=weights,
algorithm=algorithm)
rnn.fit(X_train, y_train)
neigh_idx = rnn.radius_neighbors(X_test, return_distance=False)
y_pred_idx = np.array([np.mean(y_train[idx], axis=0)
for idx in neigh_idx])
y_pred_idx = np.array(y_pred_idx)
y_pred = rnn.predict(X_test)
assert_equal(y_pred_idx.shape, y_test.shape)
assert_equal(y_pred.shape, y_test.shape)
assert_array_almost_equal(y_pred, y_pred_idx)
def test_average_precision_score_score_non_binary_class():
# Test that average_precision_score function returns an error when trying
# to compute average_precision_score for multiclass task.
rng = check_random_state(404)
y_pred = rng.rand(10)
# y_true contains three different class values
y_true = rng.randint(0, 3, size=10)
assert_raise_message(ValueError, "multiclass format is not supported",
average_precision_score, y_true, y_pred)
def test_symmetry():
# Test the symmetry of score and loss functions
random_state = check_random_state(0)
y_true = random_state.randint(0, 2, size=(20, ))
y_pred = random_state.randint(0, 2, size=(20, ))
# We shouldn't forget any metrics
assert_equal(set(SYMMETRIC_METRICS).union(
NOT_SYMMETRIC_METRICS, THRESHOLDED_METRICS,
METRIC_UNDEFINED_BINARY_MULTICLASS), set(ALL_METRICS))
assert_equal(
set(SYMMETRIC_METRICS).intersection(set(NOT_SYMMETRIC_METRICS)),
set([]))
# Symmetric metric
for name in SYMMETRIC_METRICS:
metric = ALL_METRICS[name]
assert_almost_equal(metric(y_true, y_pred),
metric(y_pred, y_true),
err_msg="%s is not symmetric" % name)
# Not symmetric metrics
for name in NOT_SYMMETRIC_METRICS:
metric = ALL_METRICS[name]
assert_true(np.any(metric(y_true, y_pred) != metric(y_pred, y_true)),
msg="%s seems to be symmetric" % name)
def test_sample_order_invariance_multilabel_and_multioutput():
random_state = check_random_state(0)
# Generate some data
y_true = random_state.randint(0, 2, size=(20, 25))
y_pred = random_state.randint(0, 2, size=(20, 25))
y_score = random_state.normal(size=y_true.shape)
y_true_shuffle, y_pred_shuffle, y_score_shuffle = shuffle(y_true,
y_pred,
y_score,
random_state=0)
for name in MULTILABELS_METRICS:
metric = ALL_METRICS[name]
assert_almost_equal(metric(y_true, y_pred),
metric(y_true_shuffle, y_pred_shuffle),
err_msg="%s is not sample order invariant"
% name)
for name in THRESHOLDED_MULTILABEL_METRICS:
metric = ALL_METRICS[name]
assert_almost_equal(metric(y_true, y_score),
metric(y_true_shuffle, y_score_shuffle),
err_msg="%s is not sample order invariant"
% name)
for name in MULTIOUTPUT_METRICS:
metric = ALL_METRICS[name]
assert_almost_equal(metric(y_true, y_score),
metric(y_true_shuffle, y_score_shuffle),
err_msg="%s is not sample order invariant"
% name)
assert_almost_equal(metric(y_true, y_pred),
metric(y_true_shuffle, y_pred_shuffle),
err_msg="%s is not sample order invariant"
% name)
def test_normalize_option_binary_classification(n_samples=20):
# Test in the binary case
random_state = check_random_state(0)
y_true = random_state.randint(0, 2, size=(n_samples, ))
y_pred = random_state.randint(0, 2, size=(n_samples, ))
for name in METRICS_WITH_NORMALIZE_OPTION:
metrics = ALL_METRICS[name]
measure = metrics(y_true, y_pred, normalize=True)
assert_greater(measure, 0,
msg="We failed to test correctly the normalize option")
assert_almost_equal(metrics(y_true, y_pred, normalize=False)
/ n_samples, measure)
def test_normalize_option_multiclasss_classification():
# Test in the multiclass case
random_state = check_random_state(0)
y_true = random_state.randint(0, 4, size=(20, ))
y_pred = random_state.randint(0, 4, size=(20, ))
n_samples = y_true.shape[0]
for name in METRICS_WITH_NORMALIZE_OPTION:
metrics = ALL_METRICS[name]
measure = metrics(y_true, y_pred, normalize=True)
assert_greater(measure, 0,
msg="We failed to test correctly the normalize option")
assert_almost_equal(metrics(y_true, y_pred, normalize=False)
/ n_samples, measure)
def test_averaging_multiclass(n_samples=50, n_classes=3):
random_state = check_random_state(0)
y_true = random_state.randint(0, n_classes, size=(n_samples, ))
y_pred = random_state.randint(0, n_classes, size=(n_samples, ))
y_score = random_state.uniform(size=(n_samples, n_classes))
lb = LabelBinarizer().fit(y_true)
y_true_binarize = lb.transform(y_true)
y_pred_binarize = lb.transform(y_pred)
for name in METRICS_WITH_AVERAGING:
yield (check_averaging, name, y_true, y_true_binarize, y_pred,
y_pred_binarize, y_score)
def test_auc_score_non_binary_class():
# Test that roc_auc_score function returns an error when trying
# to compute AUC for non-binary class values.
rng = check_random_state(404)
y_pred = rng.rand(10)
# y_true contains only one class value
y_true = np.zeros(10, dtype="int")
assert_raise_message(ValueError, "ROC AUC score is not defined",
roc_auc_score, y_true, y_pred)
y_true = np.ones(10, dtype="int")
assert_raise_message(ValueError, "ROC AUC score is not defined",
roc_auc_score, y_true, y_pred)
y_true = -np.ones(10, dtype="int")
assert_raise_message(ValueError, "ROC AUC score is not defined",
roc_auc_score, y_true, y_pred)
# y_true contains three different class values
y_true = rng.randint(0, 3, size=10)
assert_raise_message(ValueError, "multiclass format is not supported",
roc_auc_score, y_true, y_pred)
clean_warning_registry()
with warnings.catch_warnings(record=True):
rng = check_random_state(404)
y_pred = rng.rand(10)
# y_true contains only one class value
y_true = np.zeros(10, dtype="int")
assert_raise_message(ValueError, "ROC AUC score is not defined",
roc_auc_score, y_true, y_pred)
y_true = np.ones(10, dtype="int")
assert_raise_message(ValueError, "ROC AUC score is not defined",
roc_auc_score, y_true, y_pred)
y_true = -np.ones(10, dtype="int")
assert_raise_message(ValueError, "ROC AUC score is not defined",
roc_auc_score, y_true, y_pred)
# y_true contains three different class values
y_true = rng.randint(0, 3, size=10)
assert_raise_message(ValueError, "multiclass format is not supported",
roc_auc_score, y_true, y_pred)
def oneplus(fun, random_state=None, cls=None, lambda_=4, max_iter=100,
max_nfev=None, f_tol=0, n_jobs=1, seed=None):
"""
1 + lambda algorithm.
In each generation, create lambda offspring and compare their fitness to the parent individual.
The fittest individual carries over to the next generation. In case of a draw, the offspring is prefered.
:param fun: `callable(individual)`, function to be optimized
:param random_state: an instance of np.random.RandomState, a seed integer or None
:param cls: The base class for individuals
:type cls: (optional) instance of cartesian.cgp.Cartesian
:param seed: (optional) can be passed instead of cls.
:param lambda_: number of offspring per generation
:param max_iter: maximum number of generations
:param max_nfev: maximum number of function evaluations. Important, if fun is another optimizer
:param f_tol: threshold for precision
:param n_jobs: number of jobs for joblib embarrassingly easy parallel
:return: scipy.optimize.OptimizeResult with non-standard attributes
res.x = values for constants
res.expr = expression
res.fun = best value for the function
"""
max_iter = max_nfev if max_nfev else max_iter
max_nfev = max_nfev or math.inf
random_state = check_random_state(random_state)
best = seed or cls.create(random_state=random_state)
best_res = return_opt_result(fun, best)
nfev = best_res.nfev
res = OptimizeResult(expr=best, x=best_res.x, fun=best_res.fun, nit=0, nfev=nfev, success=False)
if best_res.fun <= f_tol:
res["success"] = True
return res
for i in range(1, max_iter):
offspring = [point_mutation(best, random_state=random_state) for _ in range(lambda_)]
# with Parallel(n_jobs=n_jobs) as parallel:
# offspring_fitness = parallel(delayed(return_opt_result)(fun, o) for o in offspring)
offspring_fitness = [return_opt_result(fun, o) for o in offspring]
best, best_res = min(zip(offspring + [best], offspring_fitness + [best_res]), key=lambda x: x[1].fun)
nfev += sum(of.nfev for of in offspring_fitness)
res = OptimizeResult(expr=best, x=best_res.x, fun=best_res.fun, nit=i, nfev=nfev, success=False)
if res.fun <= f_tol:
res["success"] = True
return res
elif res.nfev >= max_nfev:
return res
return res
def test_KNeighborsClassifier_multioutput():
# Test k-NN classifier on multioutput data
rng = check_random_state(0)
n_features = 5
n_samples = 50
n_output = 3
X = rng.rand(n_samples, n_features)
y = rng.randint(0, 3, (n_samples, n_output))
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
weights = [None, 'uniform', 'distance', _weight_func]
for algorithm, weights in product(ALGORITHMS, weights):
# Stack single output prediction
y_pred_so = []
y_pred_proba_so = []
for o in range(n_output):
knn = neighbors.KNeighborsClassifier(weights=weights,
algorithm=algorithm)
knn.fit(X_train, y_train[:, o])
y_pred_so.append(knn.predict(X_test))
y_pred_proba_so.append(knn.predict_proba(X_test))
y_pred_so = np.vstack(y_pred_so).T
assert_equal(y_pred_so.shape, y_test.shape)
assert_equal(len(y_pred_proba_so), n_output)
# Multioutput prediction
knn_mo = neighbors.KNeighborsClassifier(weights=weights,
algorithm=algorithm)
knn_mo.fit(X_train, y_train)
y_pred_mo = knn_mo.predict(X_test)
assert_equal(y_pred_mo.shape, y_test.shape)
assert_array_almost_equal(y_pred_mo, y_pred_so)
# Check proba
y_pred_proba_mo = knn_mo.predict_proba(X_test)
assert_equal(len(y_pred_proba_mo), n_output)
for proba_mo, proba_so in zip(y_pred_proba_mo, y_pred_proba_so):
assert_array_almost_equal(proba_mo, proba_so)
def test_distribution():
rng = check_random_state(12321)
# Single variable with 4 values
X = rng.randint(0, 4, size=(1000, 1))
y = rng.rand(1000)
n_trees = 500
clf = ExtraTreesRegressor(n_estimators=n_trees, random_state=42).fit(X, y)
uniques = defaultdict(int)
for tree in clf.estimators_:
tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-")
for f, t in zip(tree.tree_.feature,
tree.tree_.threshold))
uniques[tree] += 1
uniques = sorted([(1. * count / n_trees, tree)
for tree, count in uniques.items()])
# On a single variable problem where X_0 has 4 equiprobable values, there
# are 5 ways to build a random tree. The more compact (0,1/0,0/--0,2/--) of
# them has probability 1/3 while the 4 others have probability 1/6.
assert_equal(len(uniques), 5)
assert_greater(0.20, uniques[0][0]) # Rough approximation of 1/6.
assert_greater(0.20, uniques[1][0])
assert_greater(0.20, uniques[2][0])
assert_greater(0.20, uniques[3][0])
assert_greater(uniques[4][0], 0.3)
assert_equal(uniques[4][1], "0,1/0,0/--0,2/--")
# Two variables, one with 2 values, one with 3 values
X = np.empty((1000, 2))
X[:, 0] = np.random.randint(0, 2, 1000)
X[:, 1] = np.random.randint(0, 3, 1000)
y = rng.rand(1000)
clf = ExtraTreesRegressor(n_estimators=100, max_features=1,
random_state=1).fit(X, y)
uniques = defaultdict(int)
for tree in clf.estimators_:
tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-")
for f, t in zip(tree.tree_.feature,
tree.tree_.threshold))
uniques[tree] += 1
uniques = [(count, tree) for tree, count in uniques.items()]
assert_equal(len(uniques), 8)
def make_prediction(dataset=None, binary=False):
"""Make some classification predictions on a toy dataset using a SVC
If binary is True restrict to a binary classification problem instead of a
multiclass classification problem
"""
if dataset is None:
# import some data to play with
dataset = datasets.load_iris()
X = dataset.data
y = dataset.target
if binary:
# restrict to a binary classification task
X, y = X[y < 2], y[y < 2]
n_samples, n_features = X.shape
p = np.arange(n_samples)
rng = check_random_state(37)
rng.shuffle(p)
X, y = X[p], y[p]
half = int(n_samples / 2)
# add noisy features to make the problem harder and avoid perfect results
rng = np.random.RandomState(0)
X = np.c_[X, rng.randn(n_samples, 200 * n_features)]
# run classifier, get class probabilities and label predictions
clf = svm.SVC(kernel='linear', probability=True, random_state=0)
probas_pred = clf.fit(X[:half], y[:half]).predict_proba(X[half:])
if binary:
# only interested in probabilities of the positive case
# XXX: do we really want a special API for the binary case?
probas_pred = probas_pred[:, 1]
y_pred = clf.predict(X[half:])
y_true = y[half:]
return y_true, y_pred, probas_pred
###############################################################################
# Tests
def test_sample_weight_invariance(n_samples=50):
random_state = check_random_state(0)
# binary
random_state = check_random_state(0)
y_true = random_state.randint(0, 2, size=(n_samples, ))
y_pred = random_state.randint(0, 2, size=(n_samples, ))
y_score = random_state.random_sample(size=(n_samples,))
for name in ALL_METRICS:
if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or
name in METRIC_UNDEFINED_BINARY):
continue
metric = ALL_METRICS[name]
if name in THRESHOLDED_METRICS:
yield check_sample_weight_invariance, name, metric, y_true, y_score
else:
yield check_sample_weight_invariance, name, metric, y_true, y_pred
# multiclass
random_state = check_random_state(0)
y_true = random_state.randint(0, 5, size=(n_samples, ))
y_pred = random_state.randint(0, 5, size=(n_samples, ))
y_score = random_state.random_sample(size=(n_samples, 5))
for name in ALL_METRICS:
if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or
name in METRIC_UNDEFINED_BINARY_MULTICLASS):
continue
metric = ALL_METRICS[name]
if name in THRESHOLDED_METRICS:
yield check_sample_weight_invariance, name, metric, y_true, y_score
else:
yield check_sample_weight_invariance, name, metric, y_true, y_pred
# multilabel indicator
_, ya = make_multilabel_classification(n_features=1, n_classes=20,
random_state=0, n_samples=100,
allow_unlabeled=False)
_, yb = make_multilabel_classification(n_features=1, n_classes=20,
random_state=1, n_samples=100,
allow_unlabeled=False)
y_true = np.vstack([ya, yb])
y_pred = np.vstack([ya, ya])
y_score = random_state.randint(1, 4, size=y_true.shape)
for name in (MULTILABELS_METRICS + THRESHOLDED_MULTILABEL_METRICS +
MULTIOUTPUT_METRICS):
if name in METRICS_WITHOUT_SAMPLE_WEIGHT:
continue
metric = ALL_METRICS[name]
if name in THRESHOLDED_METRICS:
yield (check_sample_weight_invariance, name, metric, y_true,
y_score)
else:
yield (check_sample_weight_invariance, name, metric, y_true,
y_pred)
def make_prediction(dataset=None, binary=False):
"""Make some classification predictions on a toy dataset using a SVC
If binary is True restrict to a binary classification problem instead of a
multiclass classification problem
"""
if dataset is None:
# import some data to play with
dataset = datasets.load_iris()
X = dataset.data
y = dataset.target
if binary:
# restrict to a binary classification task
X, y = X[y < 2], y[y < 2]
n_samples, n_features = X.shape
p = np.arange(n_samples)
rng = check_random_state(37)
rng.shuffle(p)
X, y = X[p], y[p]
half = int(n_samples / 2)
# add noisy features to make the problem harder and avoid perfect results
rng = np.random.RandomState(0)
X = np.c_[X, rng.randn(n_samples, 200 * n_features)]
# run classifier, get class probabilities and label predictions
clf = svm.SVC(kernel='linear', probability=True, random_state=0)
probas_pred = clf.fit(X[:half], y[:half]).predict_proba(X[half:])
if binary:
# only interested in probabilities of the positive case
# XXX: do we really want a special API for the binary case?
probas_pred = probas_pred[:, 1]
y_pred = clf.predict(X[half:])
y_true = y[half:]
return y_true, y_pred, probas_pred
###############################################################################
# Tests