import numpy as np
from sklearn.datasets import make_regression


def make_regression_with_outliers(n_samples=50, n_features=20):
    rng = np.random.RandomState(0)
    X, y = make_regression(
        n_samples=n_samples, n_features=n_features,
        random_state=0, noise=0.05)
    # Replace 10% of the samples with Gaussian noise to create outliers.
    num_noise = int(0.1 * n_samples)
    random_samples = rng.randint(0, n_samples, num_noise)
    X[random_samples, :] = 2.0 * rng.normal(0, 1, (num_noise, X.shape[1]))
    return X, y
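A minimal usage sketch (the choice of HuberRegressor as the downstream estimator is an assumption, not part of the original snippet): the corrupted rows are exactly what robust estimators are meant to absorb.

from sklearn.linear_model import HuberRegressor

X, y = make_regression_with_outliers(n_samples=50, n_features=20)
huber = HuberRegressor().fit(X, y)  # robust loss down-weights the 10% corrupted rows
print(huber.score(X, y))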
import numpy as np
from numpy.testing import (assert_almost_equal, assert_array_equal,
                           assert_equal)
from sklearn.datasets import make_regression


def test_make_regression_multitarget():
    X, y, c = make_regression(n_samples=100, n_features=10, n_informative=3,
                              n_targets=3, coef=True, noise=1., random_state=0)
    assert_equal(X.shape, (100, 10), "X shape mismatch")
    assert_equal(y.shape, (100, 3), "y shape mismatch")
    assert_equal(c.shape, (10, 3), "coef shape mismatch")
    assert_array_equal(sum(c != 0.0), 3,
                       "Unexpected number of informative features")
    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0)
    assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1)
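As a quick companion sketch, an ordinary LinearRegression recovers the ground-truth coefficients that coef=True returns; the tolerance below is an assumption, sized to the noise level.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X, y, c = make_regression(n_samples=100, n_features=10, n_informative=3,
                          n_targets=3, coef=True, noise=1., random_state=0)
lr = LinearRegression().fit(X, y)
# lr.coef_ is (n_targets, n_features); the generator returns c as (n_features, n_targets).
print(np.allclose(lr.coef_.T, c, atol=0.5))  # assumed tolerance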
from sklearn.datasets import make_regression
from sklearn.gaussian_process import GaussianProcess  # legacy API, removed in scikit-learn 0.20
from sklearn.utils.testing import assert_greater


def test_mse_solving():
    # Test that the MSE estimate is sane.
    # Non-regression test for ignoring off-diagonals of the feature
    # covariance: use a nugget that renders the covariance useless, so only
    # the mean function matters, with low effective rank of the data.
    gp = GaussianProcess(corr='absolute_exponential', theta0=1e-4,
                         thetaL=1e-12, thetaU=1e-2, nugget=1e-2,
                         optimizer='Welch', regr="linear", random_state=0)
    X, y = make_regression(n_informative=3, n_features=60, noise=50,
                           random_state=0, effective_rank=1)
    gp.fit(X, y)
    assert_greater(1000, gp.predict(X, eval_MSE=True)[1].mean())
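GaussianProcess and its eval_MSE flag belong to the legacy scikit-learn API. A rough modern analogue, sketched here as an assumption rather than a drop-in replacement, reads the predictive uncertainty from GaussianProcessRegressor:

from sklearn.datasets import make_regression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

X, y = make_regression(n_informative=3, n_features=60, noise=50,
                       random_state=0, effective_rank=1)
# WhiteKernel plays roughly the role of the old `nugget` regularization.
gpr = GaussianProcessRegressor(kernel=RBF() + WhiteKernel(), random_state=0)
gpr.fit(X, y)
mean, std = gpr.predict(X, return_std=True)  # std**2 corresponds to the old eval_MSE output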
def test_insight_regression(self):
    candidates = 4
    X, y = make_regression(
        n_samples=1000, n_features=15, n_informative=candidates,
        n_targets=1)
    df = pd.DataFrame(np.hstack((X, y.reshape([-1, 1]))),
                      columns=["c_{}".format(i) for i in range(X.shape[1])] + ["target"])
    dfe = DataFrameExtension(df, numericals=["target"], target="target")
    insight = ModelSelectionInsight()
    insight.adopt(dfe)
    self.assertTrue(insight.score > 0)
    print(insight.score)
def test_insight_regression(self):
    candidates = 4
    X, y = make_regression(
        n_samples=1000, n_features=15, n_informative=candidates,
        n_targets=1)
    df = pd.DataFrame(np.hstack((X, y.reshape([-1, 1]))),
                      columns=["c_{}".format(i) for i in range(X.shape[1])] + ["target"])
    dfe = DataFrameExtension(df, numericals=["target"], target="target")
    insight = FeatureSelectionInsight()
    insight.adopt(dfe)
    print("selected regressor features {}".format(dfe.ftypes.keys()))
    self.assertTrue(candidates <= len(dfe.ftypes) - 1 < candidates * 2)  # -1 is target ftype
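The same kind of sanity check can be reproduced with scikit-learn alone; a minimal sketch using SelectKBest with f_regression (an analogous technique, not the FeatureSelectionInsight implementation):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression

candidates = 4
X, y = make_regression(n_samples=1000, n_features=15,
                       n_informative=candidates, n_targets=1)
selector = SelectKBest(f_regression, k=candidates).fit(X, y)
print("selected feature indices:", np.flatnonzero(selector.get_support()))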
import numpy as np
from sklearn.datasets import make_regression
from sklearn.utils import check_random_state

# _checkParam and _fillVariableSpace are helpers defined elsewhere in the source module.


def genRegressionData(n_samples: int = 100, n_features: int = 2, n_redundant: int = 0, strRel: int = 1,
                      n_repeated: int = 0, noise: float = 1, random_state: object = None,
                      partition=None) -> object:
    """Generate synthetic regression data

    Parameters
    ----------
    n_samples : int, optional
        Number of samples
    n_features : int, optional
        Number of features
    n_redundant : int, optional
        Number of features which are part of redundant subsets (weakly relevant)
    strRel : int, optional
        Number of features which are mandatory for the underlying model (strongly relevant)
    n_repeated : int, optional
        Number of features which are clones of existing ones.
    noise : float, optional
        Noise of the created samples around the ground truth.
    random_state : object, optional
        Random state object used for generation.
    partition : list of int, optional
        Sizes of the weakly relevant subsets; defaults to subsets of size 2.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The generated samples.
    y : array of shape [n_samples]
        The output values (target).

    Raises
    ------
    ValueError
        Wrong parameters for the specified amount of features/samples.
    """
    _checkParam(**locals())
    random_state = check_random_state(random_state)
    X = np.zeros((int(n_samples), int(n_features)))
    # Find partitions which define the weakly relevant subsets
    if partition is None:
        # Legacy behaviour yielding subsets of size 2
        partition = int(n_redundant / 2) * [2]
    part_size = len(partition)
    X_informative, Y = make_regression(n_features=int(strRel + part_size),
                                       n_samples=int(n_samples),
                                       noise=noise,
                                       n_informative=int(strRel),
                                       random_state=random_state,
                                       shuffle=False)
    X = _fillVariableSpace(**locals())
    return X, Y
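A hedged usage sketch; this assumes _checkParam and _fillVariableSpace from the same source module are available, and that leftover columns are filled with irrelevant features:

X, y = genRegressionData(n_samples=200, n_features=8, n_redundant=4,
                         strRel=2, noise=0.5, random_state=0)
# Expected layout (assumption): 2 strongly relevant features, 4 weakly
# relevant ones in two subsets of size 2, and 2 irrelevant filler columns.
print(X.shape, y.shape)  # (200, 8) (200,)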
from numpy.testing import assert_array_equal
from scipy.sparse import (coo_matrix, csc_matrix, csr_matrix, dok_matrix,
                          lil_matrix)
from sklearn import datasets
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR


def test_sparse_regression():
    # Check regression with sparse input.
    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""
        def fit(self, X, y, sample_weight=None):
            """Modification on fit that carries the data type for later verification."""
            super(CustomSVR, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(n_samples=15, n_features=50, n_targets=1,
                                    random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        # Trained on sparse format
        sparse_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train_sparse, y_train)
        # Trained on dense format
        dense_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train, y_train)
        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)
        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)
        types = [i.data_type_ for i in sparse_classifier.estimators_]
        assert all([(t == csc_matrix or t == csr_matrix)
                    for t in types])
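For a standalone illustration of the dense/sparse equivalence the test relies on, a minimal sketch with the default tree base estimator (the estimator and format choices here are assumptions):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.datasets import make_regression
from sklearn.ensemble import AdaBoostRegressor

X, y = make_regression(n_samples=15, n_features=50, random_state=42)
dense_pred = AdaBoostRegressor(random_state=1).fit(X, y).predict(X)
sparse_pred = AdaBoostRegressor(random_state=1).fit(csr_matrix(X), y).predict(csr_matrix(X))
print(np.allclose(dense_pred, sparse_pred))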