def cross_validate_best_known():
'''
import and clean the tractor data, then run a cross-validation on each of the three models we are
training here: a RandomForest, a GradientBoost, and an AdaBoost backed by a DecisionTree. Print
the scores.
The parameters we're using here are the "best" that we've found so far using a grid search.
'''
tractor_data = pd.read_csv('data/train.csv')
tractor_data = cln.clean_all(tractor_data)
X = tractor_data
y = tractor_data.pop('SalePrice')
rf = RandomForestRegressor(max_features=2, min_samples_split=4, n_estimators=50, min_samples_leaf=2)
gb = GradientBoostingRegressor(loss='quantile', learning_rate=0.0001, n_estimators=50, max_features='log2', min_samples_split=2, max_depth=1)
ada_tree_backing = DecisionTreeRegressor(max_features='sqrt', splitter='random', min_samples_split=4, max_depth=3)
ab = AdaBoostRegressor(ada_tree_backing, learning_rate=0.1, loss='square', n_estimators=1000)
validate.cross_v_scores([rf, gb, ab], X, y)
# RandomForestRegressor -- RMSLE: -0.596797712098, R2: 0.0272065373946
# GradientBoostingRegressor -- RMSLE: -0.996134592541, R2: -2.37202164829
# AdaBoostRegressor -- RMSLE: -0.706385708459, R2: -0.103966980393
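# `validate.cross_v_scores` comes from a helper module that is not part of this
# collection. A minimal sketch of what it might do, assuming it reports a negated
# RMSLE (hence the negative values above) plus R2 for each model; the real helper
# may differ in its details:
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

def rmsle(y_true, y_pred):
    # root mean squared log error between targets and predictions
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

def cross_v_scores(models, X, y, cv=5):
    neg_rmsle = make_scorer(rmsle, greater_is_better=False)
    for model in models:
        rmsle_scores = cross_val_score(model, X, y, cv=cv, scoring=neg_rmsle)
        r2_scores = cross_val_score(model, X, y, cv=cv, scoring='r2')
        print('%s -- RMSLE: %s, R2: %s' % (model.__class__.__name__,
                                           rmsle_scores.mean(), r2_scores.mean()))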
def model_gradient_boosting_tree(Xtrain,Xtest,ytrain):
X_train = Xtrain
y_train = ytrain
gbr = GradientBoostingRegressor(random_state=0)
param_grid = {
'n_estimators': [800,1500],
'max_features': [20,15],
'max_depth': [8,10],
'learning_rate': [0.1],
'subsample': [1]
}
model = GridSearchCV(estimator=gbr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
model.fit(X_train, y_train)
print('Gradient boosted tree regression...')
print('Best Params:')
print(model.best_params_)
print('Best CV Score:')
print(-model.best_score_)
y_pred = model.predict(Xtest)
return y_pred, -model.best_score_
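# `RMSE` passed as `scoring` above is a custom scorer defined elsewhere in the
# original project. A minimal sketch under the assumption that it is a negated
# root-mean-squared-error built with make_scorer (which is why best_score_ is
# negated when printed):
import numpy as np
from sklearn.metrics import make_scorer, mean_squared_error

def _rmse(y_true, y_pred):
    # plain RMSE; make_scorer negates it so GridSearchCV can maximize it
    return np.sqrt(mean_squared_error(y_true, y_pred))

RMSE = make_scorer(_rmse, greater_is_better=False)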
# read data, build model and do prediction
def unscaled_pipelines():
# Random forest parameters
random_forest_kwargs = {
'n_estimators': 10,
'criterion': 'mse',
'random_state': _RANDOM_STATE,
'n_jobs': cpu_count(),
'verbose': True,
}
# Gradient boosting parameters
gradient_boost_kwargs = {
'random_state': _RANDOM_STATE,
'verbose': 1,
}
models = [
DecisionTreeRegressor(max_depth=3, random_state=_RANDOM_STATE),
# RandomForestRegressor(**random_forest_kwargs),
# GradientBoostingRegressor(**gradient_boost_kwargs),
]
pipelines = []
for m in models:
# Steps
pipelines.append(make_pipeline(m))
return pipelines
def build_ensemble(**kwargs):
"""Generate ensemble."""
ens = SuperLearner(**kwargs)
prep = {'Standard Scaling': [StandardScaler()],
'Min Max Scaling': [MinMaxScaler()],
'No Preprocessing': []}
est = {'Standard Scaling':
[ElasticNet(), Lasso(), KNeighborsRegressor()],
'Min Max Scaling':
[SVR()],
'No Preprocessing':
[RandomForestRegressor(random_state=SEED),
GradientBoostingRegressor()]}
ens.add(est, prep)
ens.add(GradientBoostingRegressor(), meta=True)
return ens
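# A hypothetical usage sketch for build_ensemble. SEED is assumed to be defined
# alongside the function, and SuperLearner is assumed to come from mlens.ensemble,
# so the constructor keywords shown here may differ from the original project:
if __name__ == '__main__':
    import numpy as np
    rng = np.random.RandomState(0)
    X_demo = rng.normal(size=(200, 5))
    y_demo = 2.0 * X_demo[:, 0] + rng.normal(scale=0.1, size=200)
    # kwargs are forwarded straight to SuperLearner
    ens = build_ensemble(folds=5, random_state=SEED)
    ens.fit(X_demo, y_demo)
    print(ens.predict(X_demo[:10]))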
def test_gbrt_base_estimator():
rng = np.random.RandomState(1)
N = 10000
X = np.ones((N, 1))
y = rng.normal(size=N)
base = RandomForestRegressor()
rgr = GradientBoostingQuantileRegressor(base_estimator=base)
assert_raise_message(ValueError, 'type GradientBoostingRegressor',
rgr.fit, X, y)
base = GradientBoostingRegressor()
rgr = GradientBoostingQuantileRegressor(base_estimator=base)
assert_raise_message(ValueError, 'quantile loss', rgr.fit, X, y)
base = GradientBoostingRegressor(loss='quantile', n_estimators=20)
rgr = GradientBoostingQuantileRegressor(base_estimator=base)
rgr.fit(X, y)
estimates = rgr.predict(X, return_quantiles=True)
assert_almost_equal(stats.norm.ppf(rgr.quantiles),
np.mean(estimates, axis=0),
decimal=2)
def fs_boruta(df):
# do feature selection using boruta
X = df[[x for x in df.columns if x!='SalePrice']]
y = df['SalePrice']
model = GradientBoostingRegressor()
feat_selector = boruta_py.BorutaPy(model, n_estimators=100, verbose=12)
# find all relevant features
feat_selector.fit_transform(X.values, y.values)
# check selected features
features_bool = np.array(feat_selector.support_)
features = np.array(X.columns)
result = features[features_bool]
#print(result)
# check ranking of features
features_rank = feat_selector.ranking_
#print(features_rank)
rank = features_rank[features_bool]
#print(rank)
return result
def model_cross_valid(X,Y):
seed = 7
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
def build_model(model_name):
model = model_name()
return model
scoring = 'neg_mean_squared_error'
# + random forest, boost, lstm, gbdt
for model_name in [LinearRegression,ElasticNet]:
#for model_name in [LinearRegression,Ridge,Lasso,ElasticNet,KNeighborsRegressor,DecisionTreeRegressor,SVR,RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor]:
model = build_model(model_name)
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(model_name,results.mean())
def __init__(self, nr_events, case_id_col, encoder_kwargs, cls_kwargs, cls_method="rf"):
self.case_id_col = case_id_col
self.nr_events = nr_events
self.encoder = SequenceEncoder(nr_events=nr_events, case_id_col=case_id_col, **encoder_kwargs)
if cls_method == "gbm":
self.cls = GradientBoostingRegressor(**cls_kwargs)
elif cls_method == "rf":
self.cls = RandomForestRegressor(**cls_kwargs)
else:
print("Classifier method not known")
def grid_search(X, y, split, learn=[.01], samples_leaf=[250, 350, 500],
depth=[10, 15]):
'''
Runs a grid search for GBM on split data
'''
for l in learn:
for s in samples_leaf:
for d in depth:
model = GradientBoostingRegressor(n_estimators=250,
learning_rate=l,
min_samples_leaf=s,
max_depth=d,
random_state=42)
model.fit(X.values[:split], y.values[:split])
in_score = model.score(X.values[:split], y.values[:split])
out_score = model.score(X.values[split:], y.values[split:])
print('learning_rate: {}, min_samples_leaf: {}, max_depth: {}'.format(l, s, d))
print('in-sample score:', in_score)
print('out-sample score:', out_score)
print('')
def __init__(self, q1=.16, q2=.84,**params):
"""
Gradient boosted trees as surrogate model for Bayesian Optimization.
Uses quantile regression for an estimate of the 'posterior' variance.
In practice, the std is computed as (`q2` - `q1`) / 2.
Relies on `sklearn.ensemble.GradientBoostingRegressor`
Parameters
----------
q1: float
First quantile.
q2: float
Second quantile.
params: keyword arguments
Extra parameters to pass to `GradientBoostingRegressor`.
"""
self.params = params
self.q1 = q1
self.q2 = q2
self.eps = 1e-1
def fit(self, X, y):
"""
Fit a GBM model to data `X` and targets `y`.
Parameters
----------
X : array-like
Input values.
y: array-like
Target values.
"""
self.X = X
self.y = y
self.n = self.X.shape[0]
self.modq1 = GradientBoostingRegressor(loss='quantile', alpha=self.q1, **self.params)
self.modq2 = GradientBoostingRegressor(loss='quantile', alpha=self.q2, **self.params)
self.mod = GradientBoostingRegressor(loss = 'ls', **self.params)
self.modq1.fit(self.X, self.y)
self.modq2.fit(self.X, self.y)
self.mod.fit(self.X, self.y)
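# The class docstring above defines the surrogate's spread as (q2 - q1) / 2. A
# minimal standalone sketch of a matching prediction step (the original class
# very likely exposes its own predict method; this only illustrates the formula):
import numpy as np

def predict_with_std(surrogate, X):
    # mean from the squared-error model, spread from the two quantile models;
    # eps keeps the standard deviation strictly positive
    mean = surrogate.mod.predict(X)
    std = (surrogate.modq2.predict(X) - surrogate.modq1.predict(X)) / 2.0
    std = np.clip(std, surrogate.eps, None)
    return mean, std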
def test_boston_OHE_plus_trees(self):
data = load_boston()
pl = Pipeline([
("OHE", OneHotEncoder(categorical_features = [8], sparse=False)),
("Trees",GradientBoostingRegressor(random_state = 1))])
pl.fit(data.data, data.target)
# Convert the model
spec = convert(pl, data.feature_names, 'target')
# Get predictions
df = pd.DataFrame(data.data, columns=data.feature_names)
df['prediction'] = pl.predict(data.data)
# Evaluate it
result = evaluate_regressor(spec, df, 'target', verbose = False)
assert result["max_error"] < 0.0001
def gbdt_select_model(file_name):
train_df = read_from_file(file_name)
# feature 16
selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
train_np = selected_train_df.values
y = train_np[:,0]
X = train_np[:,1:]
print('Select Model...')
start_time = datetime.datetime.now()
gbdt = GradientBoostingRegressor()
parameters = {'n_estimators': [100, 120], 'max_depth':[4, 5, 6]}
grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
print("parameters:")
pprint.pprint(parameters)
grid_search.fit(X, y)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters=grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))
end_time = datetime.datetime.now()
print('Select Done..., Time Cost: %d' % ((end_time - start_time).seconds))
def test():
iris = load_iris()
#print iris
#print iris['target'].shape
gbdt=GradientBoostingRegressor(n_estimators=1000, max_depth=4)
gbdt.fit(iris.data[:120],iris.target[:120])
#Save GBDT Model
joblib.dump(gbdt, 'GBDT.model')
predict = gbdt.predict(iris.data[:120])
total_err = 0
for i in range(len(predict)):
print(predict[i], iris.target[i])
err = predict[i] - iris.target[i]
total_err += err * err
print('Training Error: %f' % (total_err / len(predict)))
pred = gbdt.predict(iris.data[120:])
error = 0
for i in range(len(pred)):
print(pred[i], iris.target[i+120])
err = pred[i] - iris.target[i+120]
error += err * err
print('Test Error: %f' % (error / len(pred)))
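# For reference, the two loops above compute mean squared errors by hand; the same
# numbers come out of sklearn's metric. A small sketch, assuming the fitted `gbdt`
# and the iris data from test() are available:
from sklearn.metrics import mean_squared_error

def mse_report(gbdt, iris):
    train_mse = mean_squared_error(iris.target[:120], gbdt.predict(iris.data[:120]))
    test_mse = mean_squared_error(iris.target[120:], gbdt.predict(iris.data[120:]))
    print('Training Error: %f' % train_mse)
    print('Test Error: %f' % test_mse)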
def select_model(file_name):
train_df = read_from_file(file_name)
# feature 16
selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
train_np = selected_train_df.values
y = train_np[:,0]
X = train_np[:,1:]
print('Select Model...')
start_time = datetime.datetime.now()
gbdt = GradientBoostingRegressor()
parameters = {'n_estimators': [10000, 12000], 'max_depth':[16,15, 14]}
grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
print("parameters:")
pprint.pprint(parameters)
grid_search.fit(X, y)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters=grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))
end_time = datetime.datetime.now()
print('Select Done..., Time Cost: %d' % ((end_time - start_time).seconds))
def generate_GBDT_model(file_name):
train_df = read_from_file(file_name)
# feature 18
selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby|hometown|residence')
train_np = selected_train_df.values
y = train_np[:,0]
X = train_np[:,1:]
print('Train Gradient Boosting Regression Model...')
start_time = datetime.datetime.now()
gbdt = GradientBoostingRegressor(n_estimators=120, max_depth=10) #, class_weight='balanced')
gbdt.fit(X,y)
end_time = datetime.datetime.now()
print('Training Done..., Time Cost: ')
print((end_time - start_time).seconds)
print('Save Model...')
joblib.dump(gbdt, 'GBDT.model')
return gbdt
def GDBT_regression(X=train_df_munged,Y=label_df['SalePrice']):
est = GradientBoostingRegressor(n_estimators=50,max_depth=3,learning_rate=0.1)
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=0)
est.fit(X_train,Y_train)
y_test_pred = est.predict(X_test)
plt.scatter(y_test_pred, y_test_pred - Y_test, c='blue', marker='s', label='residuals on test data')
plt.title("GBDT regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc="upper left")
plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
plt.show()
# Plot predictions
plt.scatter(Y_test, y_test_pred, c="blue", marker="s", label="Test data")
plt.title("GBDT regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc="upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
plt.show()
print('rmse value:', rmse(Y_test, y_test_pred))
return est
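# `rmse` called in GDBT_regression above is a project helper not shown in this
# collection. A minimal sketch, assuming it is a plain root-mean-squared-error:
import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    # square root of the mean squared error between targets and predictions
    return np.sqrt(mean_squared_error(y_true, y_pred))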
def train_model(self, train_file_path, model_path):
print("==> Load the data ...")
X_train, Y_train = self.load_file(train_file_path)
print(train_file_path, shape(X_train))
print("==> Train the model ...")
min_max_scaler = preprocessing.MaxAbsScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
clf = GradientBoostingRegressor(n_estimators=self.n_estimators)
clf.fit(X_train_minmax.toarray(), Y_train)
print("==> Save the model ...")
pickle.dump(clf, open(model_path, 'wb'))
scaler_path = model_path.replace('.pkl', '.scaler.pkl')
pickle.dump(min_max_scaler, open(scaler_path, 'wb'))
return clf
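# A hypothetical loading counterpart to train_model above: restore the pickled
# scaler and model and score new data. The file-name convention mirrors the one
# used when saving; everything else is an assumption about how the artifacts are
# consumed downstream.
import pickle

def load_and_predict(model_path, X):
    clf = pickle.load(open(model_path, 'rb'))
    scaler = pickle.load(open(model_path.replace('.pkl', '.scaler.pkl'), 'rb'))
    X_scaled = scaler.transform(X)
    if hasattr(X_scaled, 'toarray'):
        # train_model densified sparse features before fitting, so do the same here
        X_scaled = X_scaled.toarray()
    return clf.predict(X_scaled)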
def test_GradientBoostingRegressor_num(*data):
'''
test the performance with different n_estimators
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
nums=np.arange(1,200,step=2)
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
testing_scores=[]
training_scores=[]
for num in nums:
regr=ensemble.GradientBoostingRegressor(n_estimators=num)
regr.fit(X_train,y_train)
training_scores.append(regr.score(X_train,y_train))
testing_scores.append(regr.score(X_test,y_test))
ax.plot(nums,training_scores,label="Training Score")
ax.plot(nums,testing_scores,label="Testing Score")
ax.set_xlabel("estimator num")
ax.set_ylabel("score")
ax.legend(loc="lower right")
ax.set_ylim(0,1.05)
plt.suptitle("GradientBoostingRegressor")
plt.show()
def test_GradientBoostingRegressor_maxdepth(*data):
'''
test the performance with different max_depth
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
maxdepths=np.arange(1,20)
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
testing_scores=[]
training_scores=[]
for maxdepth in maxdepths:
regr=ensemble.GradientBoostingRegressor(max_depth=maxdepth,max_leaf_nodes=None)
regr.fit(X_train,y_train)
training_scores.append(regr.score(X_train,y_train))
testing_scores.append(regr.score(X_test,y_test))
ax.plot(maxdepths,training_scores,label="Training Score")
ax.plot(maxdepths,testing_scores,label="Testing Score")
ax.set_xlabel("max_depth")
ax.set_ylabel("score")
ax.legend(loc="lower right")
ax.set_ylim(-1,1.05)
plt.suptitle("GradientBoostingRegressor")
plt.show()
def test_GradientBoostingRegressor_learning(*data):
'''
test the performance with different learning rate
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
learnings=np.linspace(0.01,1.0)
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
testing_scores=[]
training_scores=[]
for learning in learnings:
regr=ensemble.GradientBoostingRegressor(learning_rate=learning)
regr.fit(X_train,y_train)
training_scores.append(regr.score(X_train,y_train))
testing_scores.append(regr.score(X_test,y_test))
ax.plot(learnings,training_scores,label="Training Score")
ax.plot(learnings,testing_scores,label="Testing Score")
ax.set_xlabel("learning_rate")
ax.set_ylabel("score")
ax.legend(loc="lower right")
ax.set_ylim(-1,1.05)
plt.suptitle("GradientBoostingRegressor")
plt.show()
def test_GradientBoostingRegressor_subsample(*data):
'''
test the performance with different subsample
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
subsamples=np.linspace(0.01,1.0,num=20)
testing_scores=[]
training_scores=[]
for subsample in subsamples:
regr=ensemble.GradientBoostingRegressor(subsample=subsample)
regr.fit(X_train,y_train)
training_scores.append(regr.score(X_train,y_train))
testing_scores.append(regr.score(X_test,y_test))
ax.plot(subsamples,training_scores,label="Training Score")
ax.plot(subsamples,testing_scores,label="Testing Score")
ax.set_xlabel("subsample")
ax.set_ylabel("score")
ax.legend(loc="lower right")
ax.set_ylim(-1,1.05)
plt.suptitle("GradientBoostingRegressor")
plt.show()
def test_GradientBoostingRegressor_max_features(*data):
'''
test the performance with different max_features
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
max_features=np.linspace(0.01,1.0)
testing_scores=[]
training_scores=[]
for features in max_features:
regr=ensemble.GradientBoostingRegressor(max_features=features)
regr.fit(X_train,y_train)
training_scores.append(regr.score(X_train,y_train))
testing_scores.append(regr.score(X_test,y_test))
ax.plot(max_features,training_scores,label="Training Score")
ax.plot(max_features,testing_scores,label="Testing Score")
ax.set_xlabel("max_features")
ax.set_ylabel("score")
ax.legend(loc="lower right")
ax.set_ylim(0,1.05)
plt.suptitle("GradientBoostingRegressor")
plt.show()
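# The five sweep functions above all expect data as (X_train, X_test, y_train, y_test).
# A hypothetical driver, assuming a regression dataset split with train_test_split
# (the original module most likely uses its own data loader):
from sklearn import datasets
from sklearn.model_selection import train_test_split

def run_gbr_sweeps():
    diabetes = datasets.load_diabetes()
    data = train_test_split(diabetes.data, diabetes.target, test_size=0.25, random_state=0)
    test_GradientBoostingRegressor_num(*data)
    test_GradientBoostingRegressor_maxdepth(*data)
    test_GradientBoostingRegressor_learning(*data)
    test_GradientBoostingRegressor_subsample(*data)
    test_GradientBoostingRegressor_max_features(*data)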
def test_feature_importances():
X = np.array(boston.data, dtype=np.float32)
y = np.array(boston.target, dtype=np.float32)
for presort in True, False:
clf = GradientBoostingRegressor(n_estimators=100, max_depth=5,
min_samples_split=2, random_state=1,
presort=presort)
clf.fit(X, y)
assert_true(hasattr(clf, 'feature_importances_'))
# XXX: Remove this test in 0.19 after transform support to estimators
# is removed.
X_new = assert_warns(
DeprecationWarning, clf.transform, X, threshold="mean")
assert_less(X_new.shape[1], X.shape[1])
feature_mask = (
clf.feature_importances_ > clf.feature_importances_.mean())
assert_array_almost_equal(X_new, X[:, feature_mask])
def test_staged_predict():
# Test whether staged decision function eventually gives
# the same prediction.
X, y = datasets.make_friedman1(n_samples=1200,
random_state=1, noise=1.0)
X_train, y_train = X[:200], y[:200]
X_test = X[200:]
clf = GradientBoostingRegressor()
# test raise ValueError if not fitted
assert_raises(ValueError, lambda X: np.fromiter(
clf.staged_predict(X), dtype=np.float64), X_test)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# test if prediction for last stage equals ``predict``
for y in clf.staged_predict(X_test):
assert_equal(y.shape, y_pred.shape)
assert_array_equal(y_pred, y)
def test_staged_functions_defensive():
# test that staged_functions make defensive copies
rng = np.random.RandomState(0)
X = rng.uniform(size=(10, 3))
y = (4 * X[:, 0]).astype(np.int) + 1 # don't predict zeros
for estimator in [GradientBoostingRegressor(),
GradientBoostingClassifier()]:
estimator.fit(X, y)
for func in ['predict', 'decision_function', 'predict_proba']:
staged_func = getattr(estimator, "staged_" + func, None)
if staged_func is None:
# regressor has no staged_predict_proba
continue
with warnings.catch_warnings(record=True):
staged_result = list(staged_func(X))
staged_result[1][:] = 0
assert_true(np.all(staged_result[0] != 0))
def test_warm_start_oob():
# Test if warm start OOB equals fit.
X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
est = Cls(n_estimators=200, max_depth=1, subsample=0.5,
random_state=1)
est.fit(X, y)
est_ws = Cls(n_estimators=100, max_depth=1, subsample=0.5,
random_state=1, warm_start=True)
est_ws.fit(X, y)
est_ws.set_params(n_estimators=200)
est_ws.fit(X, y)
assert_array_almost_equal(est_ws.oob_improvement_[:100],
est.oob_improvement_[:100])
def test_multi_target_sample_weights():
# weighted regressor
Xw = [[1,2,3], [4,5,6]]
yw = [[3.141, 2.718], [2.718, 3.141]]
w = [2., 1.]
rgr_w = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
rgr_w.fit(Xw, yw, w)
# unweighted, but with repeated samples
X = [[1,2,3], [1,2,3], [4,5,6]]
y = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
rgr.fit(X, y)
X_test = [[1.5,2.5,3.5], [3.5,4.5,5.5]]
assert_almost_equal(rgr.predict(X_test), rgr_w.predict(X_test))
def __init__(self, info, verbose=True, debug_mode=False):
self.label_num=info['label_num']
self.target_num=info['target_num']
self.task = info['task']
self.metric = info['metric']
self.postprocessor = None
#self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True) # To calibrate proba
self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False) # To calibrate proba
if debug_mode>=2:
self.name = "RandomPredictor"
self.model = RandomPredictor(self.target_num)
self.predict_method = self.model.predict_proba
return
if info['task']=='regression':
if info['is_sparse']==True:
self.name = "BaggingRidgeRegressor"
self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
else:
self.name = "GradientBoostingRegressor"
self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start = True)
self.predict_method = self.model.predict # Regression: predict returns values, not probabilities
else:
if info['has_categorical']: # Out of laziness, we do not convert categorical variables...
self.name = "RandomForestClassifier"
self.model = RandomForestClassifier(n_estimators=1, verbose=verbose) # unfortunately, no warm start...
elif info['is_sparse']:
self.name = "BaggingNBClassifier"
self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
else:
self.name = "GradientBoostingClassifier"
self.model = eval(self.name + "(n_estimators=1, verbose=" + str(verbose) + ", random_state=1, warm_start = True)")
if info['task']=='multilabel.classification':
self.model = MultiLabelEnsemble(self.model)
self.predict_method = self.model.predict_proba
def try_params( n_iterations, params ):
n_estimators = int( round( n_iterations * trees_per_iteration ))
print "n_estimators:", n_estimators
pprint( params )
clf = GB( n_estimators = n_estimators, verbose = 0, **params )
return train_and_eval_sklearn_regressor( clf, data )