def setUp(self):
os.putenv("KMP_DUPLICATE_LIB_OK", "TRUE")
self.X_class, self.y_class = datasets.make_classification(random_state=42)
self.X_reg, self.y_reg = datasets.make_regression(random_state=42)
self.classification_optimizers = [XGBoostOptimizer, RandomForestOptimizer]
self.regression_optimizers = [XGBoostOptimizer, RandomForestOptimizer]
self.class_scorer = Scorer("auc_error", lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
self.reg_scorer = Scorer("mse", metrics.mean_squared_error)
self.classification_task_split = \
Task("class_split", self.X_class, self.y_class, "classification", test_size=0.1, random_state=42)
    self.regression_task_split = \
        Task("reg_split", self.X_reg, self.y_reg, "regression", test_size=0.1, random_state=42)
    self.classification_task_cv = \
        Task("class_cv", self.X_class, self.y_class, "classification", cv=5, random_state=42)
self.regression_task_cv = \
Task("reg_cv", self.X_reg, self.y_reg, "regression", cv=5, random_state=42)
# Python make_regression() usage examples
def test_cv():
"""Simple CV check."""
# XXX: don't use scikit-learn for tests.
X, y = make_regression()
cv = KFold(X.shape[0], 5)
glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)
# check that it returns 5 scores
scores = cross_val_score(glm_normal, X, y, cv=cv)
assert_equal(len(scores), 5)
param_grid = [{'alpha': np.linspace(0.01, 0.99, 2)},
{'reg_lambda': np.logspace(np.log(0.5), np.log(0.01),
10, base=np.exp(1))}]
glmcv = GridSearchCV(glm_normal, param_grid, cv=cv)
glmcv.fit(X, y)
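# A minimal, self-contained sketch of the same check using only scikit-learn
# (an assumption: a current sklearn release, where KFold takes n_splits rather
# than the legacy (n, n_folds) signature used above, and Ridge stands in for GLM).
def test_cv_sklearn_sketch():
    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import KFold, GridSearchCV, cross_val_score

    X, y = make_regression(random_state=0)
    cv = KFold(n_splits=5)
    # cross_val_score returns one score per fold
    scores = cross_val_score(Ridge(alpha=0.1), X, y, cv=cv)
    assert len(scores) == 5
    # grid search over the regularization strength, mirroring the param_grid above
    search = GridSearchCV(Ridge(), {"alpha": np.logspace(-2, 0, 5)}, cv=cv)
    search.fit(X, y)
    assert "alpha" in search.best_params_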
def test_min_samples_split():
X_c, y_c = load_digits(return_X_y=True)
X_r, y_r = make_regression(n_samples=10000, random_state=0)
for mss in [2, 4, 10, 20]:
mtr = MondrianTreeRegressor(random_state=0, min_samples_split=mss)
mtr.partial_fit(X_r[: X_r.shape[0] // 2], y_r[: X_r.shape[0] // 2])
mtr.partial_fit(X_r[X_r.shape[0] // 2:], y_r[X_r.shape[0] // 2:])
n_node_samples = mtr.tree_.n_node_samples[mtr.tree_.children_left != -1]
assert_greater(np.min(n_node_samples) + 1, mss)
mtc = MondrianTreeClassifier(random_state=0, min_samples_split=mss)
mtc.partial_fit(X_c[: X_c.shape[0] // 2], y_c[: X_c.shape[0] // 2])
mtc.partial_fit(X_c[X_c.shape[0] // 2:], y_c[X_c.shape[0] // 2:])
n_node_samples = mtc.tree_.n_node_samples[mtc.tree_.children_left != -1]
assert_greater(np.min(n_node_samples) + 1, mss)
def test_min_samples_split():
X_c, y_c = load_digits(return_X_y=True)
X_r, y_r = make_regression(n_samples=10000, random_state=0)
for mss in [2, 4, 10, 20]:
mfr = MondrianForestRegressor(random_state=0, min_samples_split=mss)
mfr.partial_fit(X_r[: X_r.shape[0] // 2], y_r[: X_r.shape[0] // 2])
mfr.partial_fit(X_r[X_r.shape[0] // 2:], y_r[X_r.shape[0] // 2:])
for est in mfr.estimators_:
n_node_samples = est.tree_.n_node_samples[est.tree_.children_left != -1]
assert_greater(np.min(n_node_samples) + 1, mss)
mfc = MondrianForestClassifier(random_state=0, min_samples_split=mss)
mfc.partial_fit(X_c[: X_c.shape[0] // 2], y_c[: X_c.shape[0] // 2])
mfc.partial_fit(X_c[X_c.shape[0] // 2:], y_c[X_c.shape[0] // 2:])
for est in mfc.estimators_:
n_node_samples = est.tree_.n_node_samples[est.tree_.children_left != -1]
assert_greater(np.min(n_node_samples) + 1, mss)
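# The invariant checked above (no internal node is split on fewer than
# min_samples_split samples) also holds for scikit-learn's batch-fitted trees;
# a self-contained sketch of the same assertion, assuming only scikit-learn:
def check_min_samples_split_sklearn_sketch():
    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.tree import DecisionTreeRegressor

    X, y = make_regression(n_samples=1000, random_state=0)
    for mss in [2, 4, 10, 20]:
        tree = DecisionTreeRegressor(random_state=0, min_samples_split=mss).fit(X, y)
        internal = tree.tree_.children_left != -1   # leaf nodes have children_left == -1
        n_node_samples = tree.tree_.n_node_samples[internal]
        assert np.min(n_node_samples) >= mss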
def regression():
# Generate a random regression problem
X, y = make_regression(n_samples=5000, n_features=25, n_informative=25,
n_targets=1, random_state=100, noise=0.05)
y *= 0.01
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
random_state=1111)
model = NeuralNet(
layers=[
Dense(64, Parameters(init='normal')),
Activation('linear'),
Dense(32, Parameters(init='normal')),
Activation('linear'),
Dense(1),
],
loss='mse',
optimizer=Adam(),
metric='mse',
batch_size=256,
max_epochs=15,
)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print("regression mse", mean_squared_error(y_test, predictions.flatten()))
def test_get_errors_param(self):
"""
Test known models we can get the cv errors for alpha selection
"""
# Test original CV models
for model in (RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV):
try:
model = AlphaSelection(model())
X, y = make_regression()
model.fit(X, y)
errors = model._find_errors_param()
self.assertTrue(len(errors) > 0)
except YellowbrickValueError:
self.fail("could not find errors on {}".format(model.name))
def test_cross_val_score_with_score_func_regression():
X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
random_state=0)
reg = Ridge()
# Default score of the Ridge regression estimator
scores = cross_val_score(reg, X, y, cv=5)
assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
# R2 score (aka. determination coefficient) - should be the
# same as the default estimator score
r2_scores = cross_val_score(reg, X, y, scoring="r2", cv=5)
assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
# Mean squared error; this is a loss function, so "scores" are negative
mse_scores = cross_val_score(reg, X, y, cv=5, scoring="mean_squared_error")
expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
assert_array_almost_equal(mse_scores, expected_mse, 2)
# Explained variance
scoring = make_scorer(explained_variance_score)
ev_scores = cross_val_score(reg, X, y, cv=5, scoring=scoring)
assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
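# Note: the "mean_squared_error" scoring string above comes from an older scikit-learn;
# current releases spell it "neg_mean_squared_error", and the scores stay negative
# because greater must always mean better for a scorer. A small sketch:
def neg_mse_scoring_sketch():
    from sklearn.datasets import make_regression
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import cross_val_score

    X, y = make_regression(n_samples=30, n_features=20, n_informative=5, random_state=0)
    neg_mse = cross_val_score(Ridge(), X, y, cv=5, scoring="neg_mean_squared_error")
    mse = -neg_mse                  # flip the sign to recover the ordinary per-fold MSE
    assert (mse >= 0).all()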
def test_cross_val_score_with_score_func_regression():
X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
random_state=0)
reg = Ridge()
# Default score of the Ridge regression estimator
scores = cval.cross_val_score(reg, X, y, cv=5)
assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
# R2 score (aka. determination coefficient) - should be the
# same as the default estimator score
r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
# Mean squared error; this is a loss function, so "scores" are negative
mse_scores = cval.cross_val_score(reg, X, y, cv=5,
scoring="mean_squared_error")
expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
assert_array_almost_equal(mse_scores, expected_mse, 2)
# Explained variance
scoring = make_scorer(explained_variance_score)
ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=scoring)
assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
def test_multi_target_regression():
X, y = datasets.make_regression(n_targets=3)
X_train, y_train = X[:50], y[:50]
X_test, y_test = X[50:], y[50:]
references = np.zeros_like(y_test)
for n in range(3):
rgr = GradientBoostingRegressor(random_state=0)
rgr.fit(X_train, y_train[:, n])
references[:,n] = rgr.predict(X_test)
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
rgr.fit(X_train, y_train)
y_pred = rgr.predict(X_test)
assert_almost_equal(references, y_pred)
def test_ridge_fit_intercept_sparse():
X, y = make_regression(n_samples=1000, n_features=2, n_informative=2,
bias=10., random_state=42)
X_csr = sp.csr_matrix(X)
dense = Ridge(alpha=1., tol=1.e-15, solver='sag', fit_intercept=True)
sparse = Ridge(alpha=1., tol=1.e-15, solver='sag', fit_intercept=True)
dense.fit(X, y)
sparse.fit(X_csr, y)
assert_almost_equal(dense.intercept_, sparse.intercept_)
assert_array_almost_equal(dense.coef_, sparse.coef_)
# test the solver switch and the corresponding warning
sparse = Ridge(alpha=1., tol=1.e-15, solver='lsqr', fit_intercept=True)
assert_warns(UserWarning, sparse.fit, X_csr, y)
assert_almost_equal(dense.intercept_, sparse.intercept_)
assert_array_almost_equal(dense.coef_, sparse.coef_)
def test_make_regression():
X, y, c = make_regression(n_samples=100, n_features=10, n_informative=3,
effective_rank=5, coef=True, bias=0.0,
noise=1.0, random_state=0)
assert_equal(X.shape, (100, 10), "X shape mismatch")
assert_equal(y.shape, (100,), "y shape mismatch")
assert_equal(c.shape, (10,), "coef shape mismatch")
assert_equal(sum(c != 0.0), 3, "Unexpected number of informative features")
# Test that y ~= np.dot(X, c) + bias + N(0, 1.0).
assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1)
# Test with small number of features.
X, y = make_regression(n_samples=100, n_features=1) # n_informative=3
assert_equal(X.shape, (100, 1))
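# The assertion above relies on make_regression's documented generative model,
# y = X @ coef + bias + noise * N(0, 1); a quick self-contained check of the same
# relationship (parameters below are illustrative):
def make_regression_generative_model_sketch():
    import numpy as np
    from sklearn.datasets import make_regression

    X, y, coef = make_regression(n_samples=10000, n_features=5, n_informative=3,
                                 bias=2.0, noise=1.0, coef=True, random_state=0)
    residual = y - (X @ coef + 2.0)
    assert abs(residual.std() - 1.0) < 0.05    # residual spread matches the noise level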
def lesson_5():
# db = datasets.load_boston()
# print db.data.shape
# data_X=db.data
# data_y=db.target
# model = LinearRegression()
# model.fit(data_X,data_y)
# print model.predict(data_X[:8])
# print data_y[:8]
    X, y = datasets.make_regression(n_samples=100, n_features=1, n_targets=1, noise=10)
    plt.scatter(X, y)
plt.show()
# Source: figure.classification.vs.regression.py (project: microbiome-summer-school-2017, author: aldro61)
def make_regression_example(axis, random_state):
X, y = make_regression(n_samples=100, n_features=1, noise=30.0, random_state=random_state)
axis.scatter(X[:, 0], y, color="blue", s=10, label="Patients")
clf = LinearSVR().fit(X, y)
axis.plot(X[:, 0], clf.predict(X), color="black", label="Model")
    axis.tick_params(labelbottom=False, labelleft=False)
    axis.set_xlabel("Gene 1")
    axis.set_ylabel("Survived (years)")
    axis.legend()
def main():
X, y = make_regression(n_samples=100, n_features=1, noise=20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
n_samples, n_features = np.shape(X)
model = LinearRegression(n_iterations=100)
model.fit(X_train, y_train)
# Training error plot
n = len(model.training_errors)
training, = plt.plot(range(n), model.training_errors, label="Training Error")
plt.legend(handles=[training])
plt.title("Error Plot")
plt.ylabel('Mean Squared Error')
plt.xlabel('Iterations')
plt.show()
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print ("Mean squared error: %s" % (mse))
y_pred_line = model.predict(X)
# Color map
cmap = plt.get_cmap('viridis')
# Plot the results
m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10)
m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10)
plt.plot(366 * X, y_pred_line, color='black', linewidth=2, label="Prediction")
plt.suptitle("Linear Regression")
plt.title("MSE: %.2f" % mse, fontsize=10)
plt.xlabel('Day')
    plt.ylabel('Temperature in Celsius')
plt.legend((m1, m2), ("Training data", "Test data"), loc='lower right')
plt.show()
def test_tau():
"""
Test time of split for the root.
"""
X, y = make_regression(random_state=0, n_features=10)
y = np.round(y)
rate = np.sum(np.max(X, axis=0) - np.min(X, axis=0))
for est in estimators:
est = est.set_params(max_depth=1)
taus = []
for random_state in np.arange(100):
est.set_params(random_state=random_state).fit(X, y)
taus.append(est.tree_.tau[0])
assert_almost_equal(np.mean(taus), 1.0 / rate, 2)
def test_mondrian_tree_n_node_samples():
for r in range(1000):
X, y = make_regression(n_samples=2, random_state=r)
mtr = MondrianTreeRegressor(random_state=0)
mtr.partial_fit(X, y)
assert_array_equal(mtr.tree_.n_node_samples, [1, 1, 2])
def test_partial_fit_equivalence():
X, y = make_regression(random_state=0, n_samples=100)
mtr = MondrianTreeRegressor(random_state=0)
mtr.partial_fit(X, y)
for batch_size in [10, 20, 25, 50, 90]:
check_partial_fit_equivalence(batch_size, mtr, 0, X, y)
X, y = make_classification(random_state=0, n_samples=100)
mtc = MondrianTreeClassifier(random_state=0)
mtc.partial_fit(X, y)
for batch_size in [10, 20, 25, 50, 90]:
check_partial_fit_equivalence(batch_size, mtc, 0, X, y, is_clf=True)
def test_partial_fit_n_samples_1000():
mtc = MondrianTreeClassifier(random_state=0)
X, y = load_digits(return_X_y=True)
check_online_fit(mtc, X, y, 20)
mtc = MondrianTreeClassifier(random_state=0)
check_online_fit(mtc, X, y, 100)
X, y = make_regression(random_state=0, n_samples=10000)
mtr = MondrianTreeRegressor(random_state=0)
check_online_fit(mtr, X, y, 100, is_clf=False)
mtr = MondrianTreeRegressor(random_state=0)
check_online_fit(mtr, X, y, 20, is_clf=False)
def test_multioutput_regression():
"""Test whether multi-output regression works as expected."""
X, y = make_regression(n_samples=200, n_targets=5,
random_state=random_state)
for activation in ACTIVATION_TYPES:
elm = ELMRegressor(n_hidden=300, activation=activation,
random_state=random_state)
elm.fit(X, y)
assert_greater(elm.score(X, y), 0.95)
def test_known_values(self):
from sklearn.datasets import make_regression
X,y, coef = make_regression(200, 15, 15, coef=True)
np.testing.assert_equal(relevant_features(X, y),
coef != 0.0)
def regression():
# Generate a random regression problem
X, y = make_regression(n_samples=500, n_features=5,
n_informative=5, n_targets=1,
noise=0.05, random_state=1111, bias=0.5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
random_state=1111)
model = knn.KNNRegressor(k=5, distance_func=distance.euclidean)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print('regression mse', mean_squared_error(y_test, predictions))
def regression():
# Generate a random regression problem
X, y = make_regression(n_samples=500, n_features=5, n_informative=5,
n_targets=1, noise=0.05, random_state=1111,
bias=0.5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
random_state=1111)
model = GradientBoostingRegressor(n_estimators=25, max_depth=5,
max_features=3, )
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print('regression, mse: %s'
% mean_squared_error(y_test.flatten(), predictions.flatten()))
def regression():
# Generate a random regression problem
X, y = make_regression(n_samples=10000, n_features=100,
n_informative=75, n_targets=1, noise=0.05,
random_state=1111, bias=0.5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
random_state=1111)
model = LinearRegression(lr=0.01, max_iters=2000, penalty='l2', C=0.03)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print('regression mse', mean_squared_error(y_test, predictions))
def regression():
# Generate a random regression problem
X, y = make_regression(n_samples=500, n_features=5, n_informative=5,
n_targets=1, noise=0.05, random_state=1111,
bias=0.5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
random_state=1111)
model = RandomForestRegressor(n_estimators=50, max_depth=10, max_features=3, )
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print('regression, mse: %s'
% mean_squared_error(y_test.flatten(), predictions.flatten()))
def test_get_alphas_param_lassolars(self):
"""
Assert that we can get alphas from lasso lars.
"""
X, y = make_regression()
model = AlphaSelection(LassoLarsCV())
model.fit(X, y)
try:
malphas = model._find_alphas_param()
self.assertTrue(len(malphas) > 0)
except YellowbrickValueError:
self.fail("could not find alphas on {}".format(model.name))
def create_regression_dataset(n_samples, n_features, n_informative, effective_rank, tail_strength,
noise, random_state=None):
"""
Creates a regression dataset
:param n_samples: number of observations
:param n_features: number of features
:param n_informative: number of informative features
:param n_targets: The number of regression targets, i.e., the dimension of the y output vector associated with a sample. By default, the output is a scalar.
:param effective_rank: approximate number of singular vectors required to explain data
:param tail_strength: relative importance of the fat noisy tail of the singular values profile
:param noise: standard deviation of the gaussian noise applied to the output
:param random_state: the numpy RandomState
:return: the requested dataframe
"""
random_state = get_random_state(random_state)
X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_informative,
n_targets=1, effective_rank=effective_rank, tail_strength=tail_strength,
noise=noise, random_state=random_state)
# cast to a data frame
df = pd.DataFrame(X)
# rename X columns
df = rename_columns(df)
# and add the Y
df['y'] = y
return df
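# get_random_state and rename_columns are project helpers; a self-contained sketch
# of the same idea using only pandas and scikit-learn (column names are illustrative):
def create_regression_dataset_sketch():
    import pandas as pd
    from sklearn.datasets import make_regression

    X, y = make_regression(n_samples=500, n_features=8, n_informative=4,
                           effective_rank=3, tail_strength=0.5,
                           noise=0.1, random_state=42)
    df = pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[1])])
    df["y"] = y
    return df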
def test_Symbolic_fit(n_out):
x, y = make_regression(n_features=2, n_informative=1, n_targets=n_out)
est = Symbolic(max_nfev=1, lambda_=1).fit(x, y)
yhat = est.predict(x)
assert yhat.shape == y.shape
def test_Symbolic_joblib():
x, y = make_regression(n_features=2, n_informative=1, n_targets=1)
yhat = Symbolic(n_jobs=-1, max_nfev=1, lambda_=1).fit(x, y).predict(x)
assert yhat.shape == y.shape
def data(self):
X, y = make_regression(
1000, 20, n_informative=10, bias=0, random_state=0)
X, y = X.astype(np.float32), y.astype(np.float32).reshape(-1, 1)
Xt = StandardScaler().fit_transform(X)
yt = StandardScaler().fit_transform(y)
return Xt, yt
def test_multi_target_regression_one_target():
# Test multi target regression raises
X, y = datasets.make_regression(n_targets=1)
X_train, y_train = X[:50], y[:50]
X_test, y_test = X[50:], y[50:]
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
assert_raises(ValueError, rgr.fit, X_train, y_train)