def test_regression():
# Check regression for various parameter settings.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
boston.target[:50],
random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0],
"max_features": [0.5, 1.0],
"bootstrap": [True, False],
"bootstrap_features": [True, False]})
for base_estimator in [None,
DummyRegressor(),
DecisionTreeRegressor(),
KNeighborsRegressor(),
SVR()]:
for params in grid:
BaggingRegressor(base_estimator=base_estimator,
random_state=rng,
**params).fit(X_train, y_train).predict(X_test)
python类BaggingRegressor()的实例源码
def get_feature_importance(list_of_features):
n_estimators=10000
random_state=0
n_jobs=4
x_train=data_frame[list_of_features]
y_train=data_frame.iloc[:,-1]
feat_labels= data_frame.columns[1:]
forest = BaggingRegressor(n_estimators=n_estimators,random_state=random_state,n_jobs=n_jobs)
forest.fit(x_train,y_train)
importances=forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(x_train.shape[1]):
print("%2d) %-*s %f" % (f+1,30,feat_labels[indices[f]],
importances[indices[f]]))
plt.title("Feature Importance")
plt.bar(range(x_train.shape[1]),importances[indices],color='lightblue',align='center')
plt.xticks(range(x_train.shape[1]),feat_labels[indices],rotation=90)
plt.xlim([-1,x_train.shape[1]])
plt.tight_layout()
plt.show()
def test_bootstrap_features():
# Test that bootstrapping features may generate duplicate features.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(boston.data,
boston.target,
random_state=rng)
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
max_features=1.0,
bootstrap_features=False,
random_state=rng).fit(X_train, y_train)
for features in ensemble.estimators_features_:
assert_equal(boston.data.shape[1], np.unique(features).shape[0])
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
max_features=1.0,
bootstrap_features=True,
random_state=rng).fit(X_train, y_train)
for features in ensemble.estimators_features_:
assert_greater(boston.data.shape[1], np.unique(features).shape[0])
def test_parallel_regression():
# Check parallel regression.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(boston.data,
boston.target,
random_state=rng)
ensemble = BaggingRegressor(DecisionTreeRegressor(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
ensemble.set_params(n_jobs=1)
y1 = ensemble.predict(X_test)
ensemble.set_params(n_jobs=2)
y2 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y2)
ensemble = BaggingRegressor(DecisionTreeRegressor(),
n_jobs=1,
random_state=0).fit(X_train, y_train)
y3 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y3)
def __init__(self, info, verbose=True, debug_mode=False):
self.label_num=info['label_num']
self.target_num=info['target_num']
self.task = info['task']
self.metric = info['metric']
self.postprocessor = None
#self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True) # To calibrate proba
self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False) # To calibrate proba
if debug_mode>=2:
self.name = "RandomPredictor"
self.model = RandomPredictor(self.target_num)
self.predict_method = self.model.predict_proba
return
if info['task']=='regression':
if info['is_sparse']==True:
self.name = "BaggingRidgeRegressor"
self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
else:
self.name = "GradientBoostingRegressor"
self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start = True)
self.predict_method = self.model.predict # Always predict probabilities
else:
if info['has_categorical']: # Out of lazziness, we do not convert categorical variables...
self.name = "RandomForestClassifier"
self.model = RandomForestClassifier(n_estimators=1, verbose=verbose) # unfortunately, no warm start...
elif info['is_sparse']:
self.name = "BaggingNBClassifier"
self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
else:
self.name = "GradientBoostingClassifier"
self.model = eval(self.name + "(n_estimators=1, verbose=" + str(verbose) + ", random_state=1, warm_start = True)")
if info['task']=='multilabel.classification':
self.model = MultiLabelEnsemble(self.model)
self.predict_method = self.model.predict_proba
def create_model(list_of_features):
n_estimators=10000
n_jobs=4
x_train=data_frame[list_of_features]
y_train=data_frame.iloc[:,-1]
x_test=data_frame_test[list_of_features]
random_state=0
forest=BaggingRegressor(base_estimator=DecisionTreeRegressor(),n_estimators=n_estimators,random_state=random_state, n_jobs=n_jobs)
forest.fit(x_train[list_of_features],y_train)
Y_pred=forest.predict(data_frame_test[list_of_features].as_matrix())
i=0
file=open('submission.csv','w')
header="Id,SalePrice"
header=header+'\n'
file.write(header)
for id in (data_frame_test['Id']):
str="{},{}".format(id,Y_pred[i])
str=str+'\n'
file.write(str)
i+=1
def setClf(self):
# min_samples_split = 3
self.clf = BaggingRegressor(n_estimators = 100, max_samples =0.5, max_features =0.5, verbose = 100)
return
def __init__(self, info, verbose=True, debug_mode=False):
self.label_num=info['label_num']
self.target_num=info['target_num']
self.task = info['task']
self.metric = info['metric']
self.postprocessor = None
#self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True) # To calibrate proba
self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False) # To calibrate proba
if debug_mode>=2:
self.name = "RandomPredictor"
self.model = RandomPredictor(self.target_num)
self.predict_method = self.model.predict_proba
return
if info['task']=='regression':
if info['is_sparse']==True:
self.name = "BaggingRidgeRegressor"
self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
else:
self.name = "GradientBoostingRegressor"
self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start = True)
self.predict_method = self.model.predict # Always predict probabilities
else:
if info['has_categorical']: # Out of lazziness, we do not convert categorical variables...
self.name = "RandomForestClassifier"
self.model = RandomForestClassifier(n_estimators=1, verbose=verbose) # unfortunately, no warm start...
elif info['is_sparse']:
self.name = "BaggingNBClassifier"
self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
else:
self.name = "GradientBoostingClassifier"
self.model = eval(self.name + "(n_estimators=1, verbose=" + str(verbose) + ", random_state=1, warm_start = True)")
if info['task']=='multilabel.classification':
self.model = MultiLabelEnsemble(self.model)
self.predict_method = self.model.predict_proba
def spot_check(X, y):
if type == 'regression':
models = [
(LinearRegression(), 'Ordinary Least Squares'),
(Ridge(alpha=0.1), 'Ridge (alpha 0.1)'),
(Ridge(), 'Ridge (alpha 1.0)'),
(Lasso(alpha=0.1), 'Lasso (alpha 0.1)'),
(Lasso(), 'Lasso (alpha 1.0)'),
(ElasticNet(alpha=0.1), 'ElasticNet (alpha 0.1)'),
(ElasticNet(), 'ElasticNet (alpha 1.0)'),
(DecisionTreeRegressor(), 'Decision Tree'),
(KNeighborsRegressor(), 'K-Nearest Neighbors'),
# (RandomForestRegressor(), 'Random Forest Regressor'),
# (BaggingRegressor(), 'Bagging Regressor'),
# (GradientBoostingRegressor(), 'Gradient Bosted Regression'),
# (SVR(), 'Support Vector Regression')
]
splits = 5
scores = []
for model, model_name in models:
score = check_model(model, splits, X, y)
# get average score
scores.append(score)
model_names = map(lambda x: x[1], models)
for name, score in zip(model_names, scores):
print('%s: %f' % (name, score))
def test_bootstrap_samples():
# Test that bootstrapping samples generate non-perfect base estimators.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(boston.data,
boston.target,
random_state=rng)
base_estimator = DecisionTreeRegressor().fit(X_train, y_train)
# without bootstrap, all trees are perfect on the training set
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
max_samples=1.0,
bootstrap=False,
random_state=rng).fit(X_train, y_train)
assert_equal(base_estimator.score(X_train, y_train),
ensemble.score(X_train, y_train))
# with bootstrap, trees are no longer perfect on the training set
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
max_samples=1.0,
bootstrap=True,
random_state=rng).fit(X_train, y_train)
assert_greater(base_estimator.score(X_train, y_train),
ensemble.score(X_train, y_train))
def test_oob_score_regression():
# Check that oob prediction is a good estimation of the generalization
# error.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(boston.data,
boston.target,
random_state=rng)
clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
n_estimators=50,
bootstrap=True,
oob_score=True,
random_state=rng).fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
assert_less(abs(test_score - clf.oob_score_), 0.1)
# Test with few estimators
assert_warns(UserWarning,
BaggingRegressor(base_estimator=DecisionTreeRegressor(),
n_estimators=1,
bootstrap=True,
oob_score=True,
random_state=rng).fit,
X_train,
y_train)
def test_sparse_regression():
# Check regression for various parameter settings on sparse input.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
boston.target[:50],
random_state=rng)
class CustomSVR(SVR):
"""SVC variant that records the nature of the training set"""
def fit(self, X, y):
super(CustomSVR, self).fit(X, y)
self.data_type_ = type(X)
return self
parameter_sets = [
{"max_samples": 0.5,
"max_features": 2,
"bootstrap": True,
"bootstrap_features": True},
{"max_samples": 1.0,
"max_features": 4,
"bootstrap": True,
"bootstrap_features": True},
{"max_features": 2,
"bootstrap": False,
"bootstrap_features": True},
{"max_samples": 0.5,
"bootstrap": True,
"bootstrap_features": False},
]
for sparse_format in [csc_matrix, csr_matrix]:
X_train_sparse = sparse_format(X_train)
X_test_sparse = sparse_format(X_test)
for params in parameter_sets:
# Trained on sparse format
sparse_classifier = BaggingRegressor(
base_estimator=CustomSVR(),
random_state=1,
**params
).fit(X_train_sparse, y_train)
sparse_results = sparse_classifier.predict(X_test_sparse)
# Trained on dense format
dense_results = BaggingRegressor(
base_estimator=CustomSVR(),
random_state=1,
**params
).fit(X_train, y_train).predict(X_test)
sparse_type = type(X_train_sparse)
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert_array_equal(sparse_results, dense_results)
assert all([t == sparse_type for t in types])
assert_array_equal(sparse_results, dense_results)
def test_base_estimator():
# Check base_estimator and its default values.
rng = check_random_state(0)
# Classification
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
ensemble = BaggingClassifier(None,
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier))
ensemble = BaggingClassifier(DecisionTreeClassifier(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier))
ensemble = BaggingClassifier(Perceptron(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert_true(isinstance(ensemble.base_estimator_, Perceptron))
# Regression
X_train, X_test, y_train, y_test = train_test_split(boston.data,
boston.target,
random_state=rng)
ensemble = BaggingRegressor(None,
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor))
ensemble = BaggingRegressor(DecisionTreeRegressor(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor))
ensemble = BaggingRegressor(SVR(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert_true(isinstance(ensemble.base_estimator_, SVR))