def add_new_weak_learner(self):
    '''
    Summary:
        Adds a new function, h, to self.weak_learners by fitting one additive
        regression tree to the TD residuals of the most recent episode, i.e.
        solving Eq. 1:

        [Eq. 1] h = argmin_h sum_i ( r_i + gamma * max_b Q_A(s'_i, b) - (Q_A(s_i, a_i) + h(s_i, a_i)) )^2
    '''
    if len(self.most_recent_episode) == 0:
        # If this episode contains no data, don't do anything.
        return

    # Build arrays of (state, action) features and TD-error targets.
    data = np.zeros((len(self.most_recent_episode), self.max_state_features + 1))
    total_loss = np.zeros(len(self.most_recent_episode))

    for i, experience in enumerate(self.most_recent_episode):
        # Grab the experience.
        s, a, r, s_prime = experience

        # Pad in case the state features are too short (as in Atari sometimes).
        features = self._pad_features_with_zeros(s, a)
        loss = r + self.gamma * self.get_max_q_value(s_prime) - self.get_q_value(s, a)

        # Add to the relevant arrays.
        data[i] = features
        total_loss[i] = loss

    # Fit a new regressor and add it to the weak learners.
    # ('ls' is spelled 'squared_error' in scikit-learn >= 1.2.)
    estimator = GradientBoostingRegressor(loss='ls', n_estimators=1, max_depth=self.max_depth)
    estimator.fit(data, total_loss)
    self.weak_learners.append(estimator)
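Read in isolation, the fit in Eq. 1 is just a one-tree regression on TD residuals. The following standalone sketch illustrates that step; the feature values, rewards, Q estimates, and gamma below are illustrative assumptions, not part of the agent:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

# Hypothetical toy data: each row is a padded (state, action) feature vector.
features = np.array([[0.1, 0.2, 1.0],
                     [0.4, 0.1, 0.0],
                     [0.9, 0.7, 1.0]])
gamma = 0.95
rewards = np.array([0.0, 1.0, 0.0])
q_sa = np.array([0.2, 0.5, 0.1])          # current Q_A(s, a) estimates
max_q_sprime = np.array([0.3, 0.0, 0.4])  # max_b Q_A(s', b) estimates

# Regression targets from Eq. 1: the TD residuals.
td_residuals = rewards + gamma * max_q_sprime - q_sa

# One boosting stage (a single regression tree) fit to the residuals;
# the default squared-error loss matches loss='ls' in the snippet above.
h = GradientBoostingRegressor(n_estimators=1, max_depth=3)
h.fit(features, td_residuals)
print(h.predict(features))  # correction added to the current Q estimates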
def score(self, estimator, X, y, took_log_of_y=False, advanced_scoring=False, verbose=2, name=None):
    X, y = utils.drop_missing_y_vals(X, y, output_column=None)

    if isinstance(estimator, GradientBoostingRegressor):
        # Densify sparse input before predicting with GradientBoostingRegressor.
        X = X.toarray()

    predictions = estimator.predict(X)

    if took_log_of_y:
        # Undo the earlier log transform so scoring happens on the original scale.
        for idx, val in enumerate(predictions):
            predictions[idx] = math.exp(val)

    try:
        score = self.scoring_func(y, predictions)
    except ValueError:
        # bad_vals_as_strings: module-level collection of string forms of null/infinite values (defined elsewhere).
        bad_val_indices = []
        for idx, val in enumerate(y):
            if str(val) in bad_vals_as_strings:
                bad_val_indices.append(idx)

        predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
        y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]

        print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the y values. We will ignore these, and report the score on the rest of the dataset')
        score = self.scoring_func(y, predictions)

    if advanced_scoring == True:
        if hasattr(estimator, 'name'):
            print(estimator.name)
        advanced_scoring_regressors(predictions, y, verbose=verbose, name=name)

    return -1 * score
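The trailing negation follows the same idea as scikit-learn's neg_* scorers: an error metric is flipped so that a loop that maximizes the returned score still prefers the lower error. A minimal sketch, assuming mean squared error stands in for self.scoring_func:

from sklearn.metrics import mean_squared_error

def neg_mse(y_true, y_pred):
    # Lower error -> larger (less negative) score, so "maximize" picks the better model.
    return -1 * mean_squared_error(y_true, y_pred)

print(neg_mse([3.0, 2.0], [2.5, 2.0]))  # -0.125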
From gradientboostingmodel.py in the Supply-demand-forecasting project (author: LevinJ):
def setClf(self):
    self.clf = GradientBoostingRegressor(n_estimators=100, verbose=100)
    # self.clf = GradientBoostingRegressor(loss='ls', verbose=300, n_estimators=70, learning_rate=0.1, subsample=1.0, max_features=1.0)
    return
def __init__(self, info, verbose=True, debug_mode=False):
    self.label_num = info['label_num']
    self.target_num = info['target_num']
    self.task = info['task']
    self.metric = info['metric']
    self.postprocessor = None
    #self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True) # To calibrate proba
    self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False) # To calibrate proba

    if debug_mode >= 2:
        self.name = "RandomPredictor"
        self.model = RandomPredictor(self.target_num)
        self.predict_method = self.model.predict_proba
        return

    if info['task'] == 'regression':
        if info['is_sparse'] == True:
            self.name = "BaggingRidgeRegressor"
            self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
        else:
            self.name = "GradientBoostingRegressor"
            self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start=True)
        self.predict_method = self.model.predict # For regression, predict values directly
    else:
        if info['has_categorical']: # Out of laziness, we do not convert categorical variables...
            self.name = "RandomForestClassifier"
            self.model = RandomForestClassifier(n_estimators=1, verbose=verbose) # unfortunately, no warm start...
        elif info['is_sparse']:
            self.name = "BaggingNBClassifier"
            self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
        else:
            self.name = "GradientBoostingClassifier"
            self.model = eval(self.name + "(n_estimators=1, verbose=" + str(verbose) + ", random_state=1, warm_start=True)")
        if info['task'] == 'multilabel.classification':
            self.model = MultiLabelEnsemble(self.model)
        self.predict_method = self.model.predict_proba
def run_batch(batch):
    for num_iters, params in batch:
        max_depth = params['max_depth']
        learning_rate = params['learning_rate']
        num_iters = int(num_iters)
        reg = GradientBoostingRegressor(
            learning_rate=learning_rate,
            max_depth=max_depth,
            n_estimators=num_iters)
        reg.fit(X_train, y_train)
        mse = ((reg.predict(X_test) - y_test) ** 2).mean()
        yield mse
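A hedged usage sketch for run_batch; the synthetic data and the parameter batch below are assumptions (the original relies on module-level X_train, y_train, X_test, y_test already existing):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Hypothetical module-level arrays that run_batch expects to find.
X, y = make_regression(n_samples=500, n_features=8, noise=0.1, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Each batch entry is (num_iters, params), matching the loop above.
batch = [(50, {'max_depth': 2, 'learning_rate': 0.1}),
         (200, {'max_depth': 3, 'learning_rate': 0.05})]

for mse in run_batch(batch):
    print('test MSE:', mse)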
def greedy_elim(df):
    # Do feature selection using recursive feature elimination (RFE).
    X = df[[x for x in df.columns if x != 'SalePrice']]
    y = df['SalePrice']

    #model = RandomForestRegressor(n_estimators=50)
    model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.05)

    # 150 features seems to work best at the moment; why is unclear.
    feat_selector = RFE(estimator=model, step=1, n_features_to_select=150)

    # Find all relevant features.
    feat_selector.fit_transform(X.values, y.values)

    # Check selected features.
    features_bool = np.array(feat_selector.support_)
    features = np.array(X.columns)
    result = features[features_bool]
    #print(result)

    # Check ranking of features.
    features_rank = feat_selector.ranking_
    #print(features_rank)
    rank = features_rank[features_bool]
    #print(rank)
    return result
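Assuming greedy_elim and its dependencies (numpy, RFE, GradientBoostingRegressor) are in scope, a usage sketch might look like the following; the wide synthetic DataFrame is illustrative and only exists so that n_features_to_select=150 inside the function is meaningful:

import numpy as np
import pandas as pd

# Hypothetical wide frame: 200 numeric features plus the SalePrice target.
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(100, 200), columns=['feat_%d' % i for i in range(200)])
df['SalePrice'] = rng.rand(100) * 100000

selected = greedy_elim(df)                       # column names kept by RFE
df_reduced = df[list(selected) + ['SalePrice']]  # reduced training frame
print(len(selected), 'features selected')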
def convert(model, input_features, output_features):
    """Convert a boosted tree model to protobuf format.

    Parameters
    ----------
    model : GradientBoostingRegressor
        A trained scikit-learn boosted tree model.
    input_features : [str]
        Names of the input columns.
    output_features : str
        Name of the output column.

    Returns
    -------
    model_spec : An object of type Model_pb.
        Protobuf representation of the model.
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')
    _sklearn_util.check_expected_type(model, _ensemble.GradientBoostingRegressor)

    def is_gbr_model(m):
        # Check for the fitted-model attribute first so unfitted models return False
        # instead of raising AttributeError.
        if hasattr(m, 'estimators_') and m.estimators_ is not None:
            if len(m.estimators_) == 0:
                return False
            for t in m.estimators_.flatten():
                if not hasattr(t, 'tree_') or t.tree_ is None:
                    return False
            return True
        else:
            return False

    _sklearn_util.check_fitted(model, is_gbr_model)
    # Note: model.init_.mean assumes an older scikit-learn where the default
    # init estimator exposes a .mean attribute.
    base_prediction = model.init_.mean
    return _MLModel(_convert_tree_ensemble(model, input_features, output_features,
                                           base_prediction=base_prediction))
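For context, a hedged end-to-end sketch using coremltools' public scikit-learn entry point, which dispatches to an internal converter like the one above; the feature names are illustrative, and the exact keyword arguments and supported scikit-learn versions depend on the coremltools release:

from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
import coremltools

# Illustrative training data and feature names.
X, y = make_regression(n_samples=200, n_features=3, random_state=0)
model = GradientBoostingRegressor(n_estimators=10, random_state=0)
model.fit(X, y)

# Public converter; assumed keyword names, treat as a sketch.
mlmodel = coremltools.converters.sklearn.convert(
    model, input_features=['f0', 'f1', 'f2'], output_feature_names='target')
mlmodel.save('GradientBoostingRegressor.mlmodel')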
def model_fit_and_test(TrainX, TrainY, TestX, TestY):
    def build_model(model_name):
        model = model_name()
        return model

    #for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
    for model_name in [LinearRegression, ElasticNet]:
        model = build_model(model_name)
        model.fit(TrainX, TrainY)
        print(model_name)
        resid = model.predict(TestX) - TestY
        #print(resid)
        print("Residual sum of squares: %f" % np.mean(resid ** 2))
        #print(model.predict(TestX))
        #print(TestY)

        # Explained variance score: 1 is perfect prediction
        plt.scatter(model.predict(TestX), resid)
        plt.axhline(0, color='red')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        #plt.xlim([1, 50])
        plt.show()
        print('Variance score: %.2f' % model.score(TestX, TestY))

        from statsmodels.stats.stattools import jarque_bera
        _, pvalue, _, _ = jarque_bera(resid)
        print("Test Residuals Normal", pvalue)

        from statsmodels import regression, stats
        import statsmodels.api as sms
        import statsmodels.stats.diagnostic as smd
        # xs_with_constant = sms.add_constant(np.column_stack((X1, X2, X3, X4)))
        xs_with_constant = sms.add_constant(TestX)
        # Spelled het_breushpagan in older statsmodels releases.
        _, pvalue1, _, _ = smd.het_breuschpagan(resid, xs_with_constant)
        print("Test Heteroskedasticity", pvalue1)

        # Note: newer statsmodels returns a DataFrame here; use ljung_box['lb_pvalue'] in that case.
        ljung_box = smd.acorr_ljungbox(resid, lags=10)
        #print("Lagrange Multiplier Statistics:", ljung_box[0])
        print("Test Autocorrelation P-values:", ljung_box[1])
        if any(ljung_box[1] < 0.05):
            print("The residuals are autocorrelated.")
        else:
            print("The residuals are not autocorrelated.")
def fit_boosting(X, y, window=100000, estimators=250, learning=.01,
                 samples_leaf=500, depth=20, validate=False):
    '''
    Fits a gradient boosting regressor; if validate is True, defers to the
    project's cross_validate helper instead of returning a fitted model.
    '''
    model = GradientBoostingRegressor(n_estimators=estimators,
                                      learning_rate=learning,
                                      min_samples_leaf=samples_leaf,
                                      max_depth=depth,
                                      random_state=42)
    if validate:
        return cross_validate(X, y, model, window)
    return model.fit(X, y)
def __init__(self, **kwargs):
    #print("kwargs=", kwargs)
    self.is_boxcox = kwargs.get("is_boxcox", False)
    self.boxcox_lambda = kwargs.get("boxcox_lambda", 0.0)
    self.Model = kwargs.get("model", GradientBoostingRegressor)

    if "is_boxcox" in kwargs:
        kwargs.pop("is_boxcox")
    if "boxcox_lambda" in kwargs:
        kwargs.pop("boxcox_lambda")
    if "model" in kwargs:
        kwargs.pop("model")

    self.clf = self.Model(**kwargs)
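A construction sketch for this wrapper. Only __init__ appears above, so the class name BoxCoxRegressor below is hypothetical and its body condenses the snippet; the point is that is_boxcox, boxcox_lambda, and model are consumed by the wrapper while everything else is forwarded to the wrapped estimator:

from sklearn.ensemble import GradientBoostingRegressor

class BoxCoxRegressor:
    # Hypothetical class name; condensed version of the __init__ above.
    def __init__(self, **kwargs):
        self.is_boxcox = kwargs.pop("is_boxcox", False)
        self.boxcox_lambda = kwargs.pop("boxcox_lambda", 0.0)  # lambda 0.0 corresponds to a log transform
        self.Model = kwargs.pop("model", GradientBoostingRegressor)
        self.clf = self.Model(**kwargs)  # remaining kwargs go to the wrapped estimator

wrapper = BoxCoxRegressor(is_boxcox=True,
                          boxcox_lambda=0.0,
                          model=GradientBoostingRegressor,
                          n_estimators=200, max_depth=3)
print(type(wrapper.clf).__name__)  # GradientBoostingRegressor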
def gradient_boost_grid_search():
    gradient_boost_grid = {
        'loss': ['ls', 'lad', 'huber', 'quantile'],
        'learning_rate': [.0001, .001, .01, .1, 1],
        'n_estimators': [50, 100, 1000, 10000],
        'max_depth': [1, 3],
        'min_samples_split': [2, 4, 10],
        'max_features': ['sqrt', 'log2'],
    }
    gb = GradientBoostingRegressor()
    return gradient_boost_grid, gb
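The returned pair plugs directly into GridSearchCV. A minimal sketch with synthetic data follows; note that newer scikit-learn spells these losses 'squared_error' and 'absolute_error', and the full grid is thousands of fits, so the fit call is left commented out:

from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV

grid, gb = gradient_boost_grid_search()
X, y = make_regression(n_samples=300, n_features=10, noise=0.5, random_state=0)

search = GridSearchCV(estimator=gb, param_grid=grid, cv=3,
                      scoring='neg_mean_squared_error', n_jobs=-1)
# search.fit(X, y)          # commented out: the full grid is very expensive
# print(search.best_params_)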
def setUpClass(cls):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    if not HAS_SKLEARN:
        return

    # Note: load_boston was removed in scikit-learn 1.2.
    scikit_data = load_boston()
    scikit_model = GradientBoostingRegressor(random_state=1)
    scikit_model.fit(scikit_data['data'], scikit_data['target'])

    # Save the data and the model
    cls.scikit_data = scikit_data
    cls.scikit_model = scikit_model
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = GradientBoostingRegressor()
        spec = skl_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    with self.assertRaises(Exception):
        model = OneHotEncoder()
        spec = skl_converter.convert(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(TypeError):
        model = GradientBoostingRegressor()
        spec = xgb_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion
    with self.assertRaises(TypeError):
        model = OneHotEncoder()
        spec = xgb_converter.convert(model, 'data', 'out')
def test_model_select_by_param():
    iris = load_iris()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [1000, 5000], 'max_depth': [3, 4]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(iris.data[:150], iris.target[:150])
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
def score(self, estimator, X, y, took_log_of_y=False, advanced_scoring=False, verbose=2, name=None):
    X, y = utils.drop_missing_y_vals(X, y, output_column=None)

    if isinstance(estimator, GradientBoostingRegressor):
        X = X.toarray()

    predictions = estimator.predict(X)

    if took_log_of_y:
        for idx, val in enumerate(predictions):
            predictions[idx] = math.exp(val)

    try:
        score = self.scoring_func(y, predictions)
    except ValueError:
        bad_val_indices = []
        for idx, val in enumerate(y):
            if str(val) in bad_vals_as_strings or str(predictions[idx]) in bad_vals_as_strings:
                bad_val_indices.append(idx)

        predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
        y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]

        print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the predicted or y values. We will ignore these, and report the score on the rest of the dataset')
        score = self.scoring_func(y, predictions)

    if advanced_scoring == True:
        if hasattr(estimator, 'name'):
            print(estimator.name)
        advanced_scoring_regressors(predictions, y, verbose=verbose, name=name)

    return -1 * score
def regression_with_GBR(X_train, y_train, X_test, y_test, parmsFromNormalization,
                        params={'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
                                'learning_rate': 0.01, 'loss': 'ls'}):
    # GradientBoostingRegressor (min_samples_split must be >= 2 in current scikit-learn).
    gfr = GradientBoostingRegressor(**params)
    gfr.fit(X_train, y_train)
    y_pred_gbr = gfr.predict(X_test)
    print_regression_model_summary("GBR", y_test, y_pred_gbr, parmsFromNormalization)
    print_feature_importance(X_test, y_test, gfr.feature_importances_)

    # Cross-validation (not sure this makes sense for regression):
    # http://scikit-learn.org/stable/modules/cross_validation.html
    #gfr = GradientBoostingRegressor(**params)
    #scores = cross_validation.cross_val_score(gfr, X_train, y_train, cv=5)
    #print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return y_pred_gbr
def GDBT_regression(X=train_split, Y=y):
    est = GradientBoostingRegressor(n_estimators=75, max_depth=3, learning_rate=0.1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    est.fit(X_train, Y_train)
    y_test_pred = est.predict(X_test)

    # Plot residuals on the held-out split
    plt.scatter(y_test_pred, y_test_pred - Y_test, c='blue', marker='s', label='error on test data')
    plt.title("Regression with GBDT")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Plot predictions
    plt.scatter(Y_test, y_test_pred, c="blue", marker="s", label="Test data")
    plt.title("Regression with GBDT")
    plt.xlabel("Real values")
    plt.ylabel("Predicted values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()

    print('rmsle value:', rmsle(Y_test, y_test_pred))
    return est

# linear_regression()
# ridge_regression()
# Lasso_regression()
#model = Elasticnet_regression()

# '''
# predict final result
# '''
#
#
# coefs, lasso = Lasso_regression()
# selected_features = coefs[coefs['value'] != 0].index.values
# train_new = train_split[selected_features]
def gbr(X, y):
    X_train, X_validation, y_train, y_validation = train_test_split(X, y, random_state=0)
    sklearn_boost = GradientBoostingRegressor(random_state=1)
    sklearn_boost.fit(X_train, y_train.ravel())
    print('training error:', 1.0 - sklearn_boost.score(X_train, y_train))
    print('validation error:', 1.0 - sklearn_boost.score(X_validation, y_validation))
    time_fit(sklearn_boost, X_train, y_train.ravel())