def cross_validate_best_known():
'''
import and clean the tractor data, then run cross-validation on each of the three models we are
training here: a RandomForest, a GradientBoost, and an AdaBoost backed by a DecisionTree. Print
the scores.
The parameters used here are the "best" we've found so far via grid search.
'''
tractor_data = pd.read_csv('data/train.csv')
tractor_data = cln.clean_all(tractor_data)
X = tractor_data
y = tractor_data.pop('SalePrice')
rf = RandomForestRegressor(max_features=2, min_samples_split=4, n_estimators=50, min_samples_leaf=2)
gb = GradientBoostingRegressor(loss='quantile', learning_rate=0.0001, n_estimators=50, max_features='log2', min_samples_split=2, max_depth=1)
ada_tree_backing = DecisionTreeRegressor(max_features='sqrt', splitter='random', min_samples_split=4, max_depth=3)
ab = AdaBoostRegressor(ada_tree_backing, learning_rate=0.1, loss='square', n_estimators=1000)
validate.cross_v_scores([rf, gb, ab], X, y)
# RandomForestRegressor -- RMSLE: -0.596797712098, R2: 0.0272065373946
# GradientBoostingRegressor -- RMSLE: -0.996134592541, R2: -2.37202164829
# AdaBoostRegressor -- RMSLE: -0.706385708459, R2: -0.103966980393
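A minimal sketch of what the validate.cross_v_scores helper used above might look like (the project's validate module is not shown in this listing; the RMSLE scorer below is an assumption based on the score comments):
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

def cross_v_scores_sketch(models, X, y, cv=5):
    """Print cross-validated RMSLE (negated, so higher is better) and R^2 for each model."""
    def neg_rmsle(y_true, y_pred):
        return -np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))
    rmsle_scorer = make_scorer(neg_rmsle)
    for model in models:
        rmsle = cross_val_score(model, X, y, scoring=rmsle_scorer, cv=cv).mean()
        r2 = cross_val_score(model, X, y, scoring='r2', cv=cv).mean()
        print('%s -- RMSLE: %s, R2: %s' % (type(model).__name__, rmsle, r2))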
Python AdaBoostRegressor() example source code
def model_cross_valid(X,Y):
seed = 7
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)  # shuffle so random_state takes effect
def build_model(model_name):
model = model_name()
return model
scoring = 'neg_mean_squared_error'
# TODO: also try random forest, boosting, LSTM, GBDT
for model_name in [LinearRegression,ElasticNet]:
#for model_name in [LinearRegression,Ridge,Lasso,ElasticNet,KNeighborsRegressor,DecisionTreeRegressor,SVR,RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor]:
model = build_model(model_name)
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(model_name,results.mean())
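A hedged usage sketch for model_cross_valid; the imports and the synthetic dataset below are assumptions, since the original module's import block is not shown:
from sklearn import model_selection
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, ElasticNet

X_demo, Y_demo = make_regression(n_samples=200, n_features=10, noise=5.0, random_state=7)
model_cross_valid(X_demo, Y_demo)  # prints the mean negative MSE for each model class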
def test_AdaBoostRegressor(*data):
'''
test how the regression score changes with the number of base estimators
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
regr=ensemble.AdaBoostRegressor()
regr.fit(X_train,y_train)
## graph
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
estimators_num=len(regr.estimators_)
X=range(1,estimators_num+1)
ax.plot(list(X),list(regr.staged_score(X_train,y_train)),label="Training score")
ax.plot(list(X),list(regr.staged_score(X_test,y_test)),label="Testing score")
ax.set_xlabel("estimator num")
ax.set_ylabel("score")
ax.legend(loc="best")
ax.set_title("AdaBoostRegressor")
plt.show()
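The test_AdaBoostRegressor* functions in this listing all expect data = (train_data, test_data, train_value, test_value). A possible driver, using the diabetes dataset purely for illustration (the original data loader is not shown):
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

def load_data_regression():
    """Return (X_train, X_test, y_train, y_test) in the order the test_* functions unpack."""
    diabetes = load_diabetes()
    return train_test_split(diabetes.data, diabetes.target, test_size=0.25, random_state=0)

# Example: test_AdaBoostRegressor(*load_data_regression())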
def test_AdaBoostRegressor_learning_rate(*data):
'''
test the performance with different learning rates
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
learning_rates=np.linspace(0.01,1)
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
training_scores=[]
testing_scores=[]
for learning_rate in learning_rates:
regr=ensemble.AdaBoostRegressor(learning_rate=learning_rate,n_estimators=500)
regr.fit(X_train,y_train)
training_scores.append(regr.score(X_train,y_train))
testing_scores.append(regr.score(X_test,y_test))
ax.plot(learning_rates,training_scores,label="Training score")
ax.plot(learning_rates,testing_scores,label="Testing score")
ax.set_xlabel("learning rate")
ax.set_ylabel("score")
ax.legend(loc="best")
ax.set_title("AdaBoostRegressor")
plt.show()
def test_gridsearch():
# Check that base trees can be grid-searched.
# AdaBoost classification
boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
parameters = {'n_estimators': (1, 2),
'base_estimator__max_depth': (1, 2),
'algorithm': ('SAMME', 'SAMME.R')}
clf = GridSearchCV(boost, parameters)
clf.fit(iris.data, iris.target)
# AdaBoost regression
boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
random_state=0)
parameters = {'n_estimators': (1, 2),
'base_estimator__max_depth': (1, 2)}
clf = GridSearchCV(boost, parameters)
clf.fit(boston.data, boston.target)
def test_sample_weight_adaboost_regressor():
"""
AdaBoostRegressor should work without sample_weights in the base estimator
The random weighted sampling is done internally in the _boost method in
AdaBoostRegressor.
"""
class DummyEstimator(BaseEstimator):
def fit(self, X, y):
pass
def predict(self, X):
return np.zeros(X.shape[0])
boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
boost.fit(X, y_regr)
assert_equal(len(boost.estimator_weights_), len(boost.estimator_errors_))
def setClf(self):
# min_samples_split = 3
self.clf = AdaBoostRegressor()
return
def model_fit_and_test(TrainX,TrainY,TestX,TestY):
def build_model(model_name):
model = model_name()
return model
#for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR,RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
for model_name in [LinearRegression, ElasticNet]:
model = build_model(model_name)
model.fit(TrainX,TrainY)
print(model_name)
resid = model.predict(TestX) - TestY
#print resid
print("Residual sum of squares: %f"% np.mean(resid ** 2))
#print model.predict(TestX)
#print TestY
# Explained variance score: 1 is perfect prediction
plt.scatter(model.predict(TestX), resid);
plt.axhline(0, color='red')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
#plt.xlim([1, 50])
plt.show()
print('Variance score: %.2f' % model.score(TestX, TestY))
from statsmodels.stats.stattools import jarque_bera
_, pvalue, _, _ = jarque_bera(resid)
print ("Test Residuals Normal", pvalue)
from statsmodels import regression, stats
import statsmodels.api as sms
import statsmodels.stats.diagnostic as smd
# xs_with_constant = sms.add_constant(np.column_stack((X1,X2,X3,X4)))
xs_with_constant = sms.add_constant(TestX)
_, pvalue1, _, _ = smd.het_breuschpagan(resid, xs_with_constant)
print ("Test Heteroskedasticity", pvalue1)
ljung_box = smd.acorr_ljungbox(resid, lags=10)
#print "Lagrange Multiplier Statistics:", ljung_box[0]
print "Test Autocorrelation P-values:", ljung_box[1]
if any(ljung_box[1] < 0.05):
print "The residuals are autocorrelated."
else:
print "The residuals are not autocorrelated."
def __init__(self, isTrain):
super(RegressionAdaBoost, self).__init__(isTrain)
# data preprocessing
#self.dataPreprocessing()
# Create AdaBoost regression object
decisionReg = DecisionTreeRegressor(max_depth=10)
rng = np.random.RandomState(1)
self.adaReg = AdaBoostRegressor(decisionReg,
n_estimators=400,
random_state=rng)
def ada_boost_tree_grid_search():
ada_boost_tree_grid = {
'base_estimator__max_features': ['sqrt'],
'base_estimator__splitter': ['best', 'random'],
'base_estimator__min_samples_split': [2, 4],
'base_estimator__max_depth': [1, 3],
'n_estimators': [50, 100, 1000],
'learning_rate': [.001, .01, .1],
'loss': ['linear', 'square', 'exponential']
}
abr = AdaBoostRegressor(DecisionTreeRegressor())
return ada_boost_tree_grid, abr
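A sketch of how the returned grid and estimator might be fed to GridSearchCV (the cv, scoring, and n_jobs values are illustrative; the base_estimator__ prefixes in the grid assume the older scikit-learn AdaBoost API used throughout this listing):
from sklearn.model_selection import GridSearchCV

def run_ada_boost_tree_grid_search(X, y):
    param_grid, abr = ada_boost_tree_grid_search()
    search = GridSearchCV(abr, param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)
    search.fit(X, y)
    return search.best_params_, search.best_estimator_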
def adbPredictor(df):
dataTrainX, dataTrainY, dataTestX, dataTestY = sample(df)
# clf = linear_model.SGDRegressor()
clf = ensemble.AdaBoostRegressor()
clf.fit(dataTrainX, dataTrainY)
predicted = clf.predict(dataTestX)
fig, ax = plotter.subplots()
ax.set_ylabel('Predicted AdaBoost Weekly')
ax.scatter(dataTestY, predicted)
ax.set_xlabel('Measured')
predicted = np.reshape(predicted, (predicted.size, 1))
corrCoeff = pearsonr(dataTestY,predicted)
print(corrCoeff[0])
plotter.show()
return predicted
def __init__(self, conf, model=None):
self.conf = conf
self.name = "AdaBoostR"
if model is None:
self.model = AdaBoostRegressor(loss='square')
else:
self.model = model
def get_models4ensamble(conf):
models = []
#models = [RFRModel(conf), DLModel(conf), LRModel(conf)]
#models = [LRModel(conf)]
# see http://scikit-learn.org/stable/modules/linear_model.html
#0 was too big to run with depth set to 1, and 1 was overfitting a bit
if conf.command == 1:
xgb_params = {"objective": "reg:linear", "booster":"gbtree", "max_depth":3, "eta":0.1, "min_child_weight":5,
"subsample":0.5, "nthread":4, "colsample_bytree":0.5, "num_parallel_tree":1, 'gamma':0}
else:
xgb_params = {"objective": "reg:linear", "booster":"gbtree", "max_depth":10, "eta":0.1, "min_child_weight":8,
"subsample":0.5, "nthread":4, "colsample_bytree":0.5, "num_parallel_tree":1, 'gamma':0}
#xgb_params = {"objective": "reg:linear", "booster":"gbtree", "max_depth":10, "eta":0.1, "min_child_weight":8,
# "subsample":0.5, "nthread":4, "colsample_bytree":0.5, "num_parallel_tree":1, 'gamma':0}
models = [
#DLModel(conf),
#LRModel(conf, model=linear_model.BayesianRidge()),
#LRModel(conf, model=linear_model.LassoLars(alpha=.1)),
#LRModel(conf, model=linear_model.Lasso(alpha = 0.1)),
#LRModel(conf, model=Pipeline([('poly', PolynomialFeatures(degree=3)),
#LRModel(conf, model=linear_model.Ridge (alpha = .5))
# ('linear', LinearRegression(fit_intercept=False))])),
XGBoostModel(conf, xgb_params, use_cv=True),
LRModel(conf, model=linear_model.Lasso(alpha = 0.3)),
RFRModel(conf, RandomForestRegressor(oob_score=True, n_jobs=4)),
#LRModel(conf, model=linear_model.Lasso(alpha = 0.2)),
ETRModel(conf, model=ExtraTreesRegressor(n_jobs=4)),
#AdaBoostRModel(conf, model=AdaBoostRegressor(loss='square'))
]
return models
#return [XGBoostModel(conf, xgb_params, use_cv=True)]
def abr(X,y):
X_train,X_validation,y_train,y_validation = train_test_split(X,y,random_state=0)
abr_boost = AdaBoostRegressor(random_state=1)
abr_boost.fit(X_train,y_train.ravel())
print('training error:', 1.0 - abr_boost.score(X_train,y_train))
print('validation error:', 1.0 - abr_boost.score(X_validation,y_validation))
time_fit(abr_boost,X_train,y_train.ravel())
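The time_fit helper called above is not part of this listing; a minimal sketch of one plausible implementation:
import time

def time_fit(model, X, y):
    """Fit the model and report wall-clock training time."""
    start = time.time()
    model.fit(X, y)
    print('fit time: %.2fs' % (time.time() - start))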
def get_classifier(self, X, Y):
""" ????????
:param X: ????
:param Y: ??????
:return: ??
"""
# rng = np.random.RandomState(1)
clf = AdaBoostRegressor(DecisionTreeRegressor())
clf.fit(X, Y)
return clf
def test_AdaBoostRegressor_base_regr(*data):
'''
test the regression with different types of base estimator (decision tree vs. linear SVM)
:param data: train_data, test_data, train_value, test_value
:return: None
'''
from sklearn.svm import LinearSVR
X_train,X_test,y_train,y_test=data
fig=plt.figure()
regrs=[ensemble.AdaBoostRegressor(),
ensemble.AdaBoostRegressor(base_estimator=LinearSVR(epsilon=0.01,C=100))]
labels=["Decision Tree Regressor","Linear SVM Regressor"]
for i ,regr in enumerate(regrs):
ax=fig.add_subplot(2,1,i+1)
regr.fit(X_train,y_train)
## graph
estimators_num=len(regr.estimators_)
X=range(1,estimators_num+1)
ax.plot(list(X),list(regr.staged_score(X_train,y_train)),label="Training score")
ax.plot(list(X),list(regr.staged_score(X_test,y_test)),label="Testing score")
ax.set_xlabel("estimator num")
ax.set_ylabel("score")
ax.legend(loc="lower right")
ax.set_ylim(-1,1)
ax.set_title("Base_Estimator:%s"%labels[i])
plt.suptitle("AdaBoostRegressor")
plt.show()
def test_AdaBoostRegressor_loss(*data):
'''
test the method with different loss functions
:param data: train_data, test_data, train_value, test_value
:return: None
'''
X_train,X_test,y_train,y_test=data
losses=['linear','square','exponential']
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
for i ,loss in enumerate(losses):
regr=ensemble.AdaBoostRegressor(loss=loss,n_estimators=30)
regr.fit(X_train,y_train)
## graph
estimators_num=len(regr.estimators_)
X=range(1,estimators_num+1)
ax.plot(list(X),list(regr.staged_score(X_train,y_train)),
label="Traing score:loss=%s"%loss)
ax.plot(list(X),list(regr.staged_score(X_test,y_test)),
label="Testing score:loss=%s"%loss)
ax.set_xlabel("estimator num")
ax.set_ylabel("score")
ax.legend(loc="lower right")
ax.set_ylim(-1,1)
plt.suptitle("AdaBoostRegressor")
plt.show()
def test_regression_toy():
# Check regression on a toy dataset.
clf = AdaBoostRegressor(random_state=0)
clf.fit(X, y_regr)
assert_array_equal(clf.predict(T), y_t_regr)
def test_boston():
# Check consistency on dataset boston house prices.
clf = AdaBoostRegressor(random_state=0)
clf.fit(boston.data, boston.target)
score = clf.score(boston.data, boston.target)
assert score > 0.85
def test_pickle():
# Check pickability.
import pickle
# Adaboost classifier
for alg in ['SAMME', 'SAMME.R']:
obj = AdaBoostClassifier(algorithm=alg)
obj.fit(iris.data, iris.target)
score = obj.score(iris.data, iris.target)
s = pickle.dumps(obj)
obj2 = pickle.loads(s)
assert_equal(type(obj2), obj.__class__)
score2 = obj2.score(iris.data, iris.target)
assert_equal(score, score2)
# Adaboost regressor
obj = AdaBoostRegressor(random_state=0)
obj.fit(boston.data, boston.target)
score = obj.score(boston.data, boston.target)
s = pickle.dumps(obj)
obj2 = pickle.loads(s)
assert_equal(type(obj2), obj.__class__)
score2 = obj2.score(boston.data, boston.target)
assert_equal(score, score2)
def test_sample_weight_missing():
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
clf = AdaBoostClassifier(KMeans(), algorithm="SAMME")
assert_raises(ValueError, clf.fit, X, y_regr)
clf = AdaBoostRegressor(KMeans())
assert_raises(ValueError, clf.fit, X, y_regr)
def get_classifier(self, X, Y):
""" ????????
:param X: ????
:param Y: ??????
:return: ??
"""
# rng = np.random.RandomState(1)
clf = AdaBoostRegressor(DecisionTreeRegressor(criterion='mse'))
clf.fit(X, Y)
return clf
def parameterChoosing(self):
dts = []
dts.append(DecisionTreeRegressor(max_depth=5, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=7, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=9, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=11, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=12, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=14, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=15, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=17, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=19, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=21, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=22, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=24, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=26, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=27, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=31, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=33, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=35, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=37, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=39, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=41, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=43, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=45, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=47, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=49, max_features='auto'))
dts.append(DecisionTreeRegressor(max_depth=50, max_features='auto'))
tuned_parameters = [{'base_estimator': dts,
'n_estimators': range(5,700),
'learning_rate': [1, 2, 3]
}
]
reg = GridSearchCV(AdaBoostRegressor(), tuned_parameters, cv=5, scoring='mean_squared_error')  # newer scikit-learn spells this scoring='neg_mean_squared_error'
reg.fit(self.X_train, self.y_train.ravel())
print "Best parameters set found on development set:\n"
print reg.best_params_
print "Grid scores on development set:\n"
for params, mean_score, scores in reg.grid_scores_:
print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)
print "MSE for test data set:\n"
y_true, y_pred = self.y_test, reg.predict(self.X_test)
print mean_squared_error(y_true, y_pred)
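Rather than enumerating dozens of pre-built trees, the same search can grid the tree depth directly through the nested base_estimator__ parameter syntax; a compact sketch with illustrative values (newer scikit-learn spells the scoring string 'neg_mean_squared_error'):
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

compact_parameters = {
    'base_estimator__max_depth': list(range(5, 51, 2)),
    'n_estimators': [50, 100, 300, 500],
    'learning_rate': [0.1, 0.5, 1.0],
}
reg = GridSearchCV(AdaBoostRegressor(DecisionTreeRegressor()),
                   compact_parameters, cv=5, scoring='neg_mean_squared_error')
# reg.fit(X_train, y_train.ravel())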
# Source: RegressionUniformBlending.py, from project AirTicketPredicting (author: junlulocky)
def __init__(self, isTrain):
super(RegressionUniformBlending, self).__init__(isTrain)
# data preprocessing
#self.dataPreprocessing()
self.net1 = NeuralNet(
layers=[ # three layers: one hidden layer
('input', layers.InputLayer),
('hidden', layers.DenseLayer),
#('hidden2', layers.DenseLayer),
#('hidden3', layers.DenseLayer),
('output', layers.DenseLayer),
],
# layer parameters:
input_shape=(None, 13), # input dimension is 13
hidden_num_units=6, # number of units in hidden layer
#hidden2_num_units=8, # number of units in hidden layer
#hidden3_num_units=4, # number of units in hidden layer
output_nonlinearity=None, # identity (linear) output for regression
output_num_units=1, # output dimension is 1
# objective function
objective_loss_function = lasagne.objectives.squared_error,
# optimization method:
update=lasagne.updates.nesterov_momentum,
update_learning_rate=0.002,
update_momentum=0.4,
# use 20% as validation
train_split=TrainSplit(eval_size=0.2),
regression=True, # flag to indicate we're dealing with regression problem
max_epochs=100, # we want to train this many epochs
verbose=0,
)
# Create linear regression object
self.linRegr = linear_model.LinearRegression()
# Create KNN regression object
self.knn = neighbors.KNeighborsRegressor(86, weights='distance')
# Create Decision Tree regression object
self.decisionTree = DecisionTreeRegressor(max_depth=7, max_features=None)
# Create AdaBoost regression object
decisionReg = DecisionTreeRegressor(max_depth=10)
rng = np.random.RandomState(1)
self.adaReg = AdaBoostRegressor(decisionReg,
n_estimators=400,
random_state=rng)
# Create linear regression object
self.model = RandomForestRegressor(max_features='sqrt', n_estimators=32, max_depth=39)
def test_staged_predict():
# Check staged predictions.
rng = np.random.RandomState(0)
iris_weights = rng.randint(10, size=iris.target.shape)
boston_weights = rng.randint(10, size=boston.target.shape)
# AdaBoost classification
for alg in ['SAMME', 'SAMME.R']:
clf = AdaBoostClassifier(algorithm=alg, n_estimators=10)
clf.fit(iris.data, iris.target, sample_weight=iris_weights)
predictions = clf.predict(iris.data)
staged_predictions = [p for p in clf.staged_predict(iris.data)]
proba = clf.predict_proba(iris.data)
staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
staged_scores = [
s for s in clf.staged_score(
iris.data, iris.target, sample_weight=iris_weights)]
assert_equal(len(staged_predictions), 10)
assert_array_almost_equal(predictions, staged_predictions[-1])
assert_equal(len(staged_probas), 10)
assert_array_almost_equal(proba, staged_probas[-1])
assert_equal(len(staged_scores), 10)
assert_array_almost_equal(score, staged_scores[-1])
# AdaBoost regression
clf = AdaBoostRegressor(n_estimators=10, random_state=0)
clf.fit(boston.data, boston.target, sample_weight=boston_weights)
predictions = clf.predict(boston.data)
staged_predictions = [p for p in clf.staged_predict(boston.data)]
score = clf.score(boston.data, boston.target, sample_weight=boston_weights)
staged_scores = [
s for s in clf.staged_score(
boston.data, boston.target, sample_weight=boston_weights)]
assert_equal(len(staged_predictions), 10)
assert_array_almost_equal(predictions, staged_predictions[-1])
assert_equal(len(staged_scores), 10)
assert_array_almost_equal(score, staged_scores[-1])
def test_sparse_regression():
# Check regression with sparse input.
class CustomSVR(SVR):
"""SVR variant that records the nature of the training set."""
def fit(self, X, y, sample_weight=None):
"""Modification on fit caries data type for later verification."""
super(CustomSVR, self).fit(X, y, sample_weight=sample_weight)
self.data_type_ = type(X)
return self
X, y = datasets.make_regression(n_samples=15, n_features=50, n_targets=1,
random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
dok_matrix]:
X_train_sparse = sparse_format(X_train)
X_test_sparse = sparse_format(X_test)
# Trained on sparse format
sparse_classifier = AdaBoostRegressor(
base_estimator=CustomSVR(),
random_state=1
).fit(X_train_sparse, y_train)
# Trained on dense format
dense_classifier = AdaBoostRegressor(
base_estimator=CustomSVR(),
random_state=1
).fit(X_train, y_train)
# predict
sparse_results = sparse_classifier.predict(X_test_sparse)
dense_results = dense_classifier.predict(X_test)
assert_array_equal(sparse_results, dense_results)
# staged_predict
sparse_results = sparse_classifier.staged_predict(X_test_sparse)
dense_results = dense_classifier.staged_predict(X_test)
for sparse_res, dense_res in zip(sparse_results, dense_results):
assert_array_equal(sparse_res, dense_res)
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert all([(t == csc_matrix or t == csr_matrix)
for t in types])