def model_extra_trees_regression(Xtrain,Xtest,ytrain):
X_train = Xtrain
y_train = ytrain
etr = ExtraTreesRegressor(n_jobs=1, random_state=0)
param_grid = {}  # e.g. {'n_estimators': [500], 'max_features': [10, 15, 20]}
model = GridSearchCV(estimator=etr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
model.fit(X_train, y_train)
print('Extra trees regression...')
print('Best Params:')
print(model.best_params_)
print('Best CV Score:')
print(-model.best_score_)
y_pred = model.predict(Xtest)
return y_pred, -model.best_score_
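The RMSE scorer passed to GridSearchCV above is not defined in this snippet; a minimal sketch of how such a scorer is typically built (an assumption, not the original project's code) is:
import numpy as np
from sklearn.metrics import make_scorer, mean_squared_error

def rmse(y_true, y_pred):
    # root mean squared error
    return np.sqrt(mean_squared_error(y_true, y_pred))

# greater_is_better=False makes GridSearchCV negate the score internally,
# which is why the snippet above prints -model.best_score_.
RMSE = make_scorer(rmse, greater_is_better=False)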
# read data, build model and do prediction
# read train data
Python ExtraTreesRegressor() usage examples. The next snippet comes from extra_trees_preproc_for_regression.py in the AutoML-Challenge project (author: postech-mlg-exbrain).
def fit(self, X, Y):
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
num_features = X.shape[1]
max_features = int(
float(self.max_features) * (np.log(num_features) + 1))
# Use at most half of the features
max_features = max(1, min(int(X.shape[1] / 2), max_features))
preprocessor = ExtraTreesRegressor(
n_estimators=self.n_estimators, criterion=self.criterion,
max_depth=self.max_depth, min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
random_state=self.random_state)
preprocessor.fit(X, Y)
self.preprocessor = SelectFromModel(preprocessor, prefit=True)
return self
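For context, a minimal standalone sketch of how a SelectFromModel preprocessor like the one fitted above reduces a feature matrix (illustrative only; the data and variable names are assumptions):
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

X = np.random.rand(200, 40)
y = np.random.rand(200)

est = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(X, y)
selector = SelectFromModel(est, prefit=True)
X_reduced = selector.transform(X)  # keeps features whose importance exceeds the mean importance (default threshold)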
def fit(self, X, y):
"""
Fit an Extra-Trees model to data `X` and targets `y`.
Parameters
----------
X : array-like
Input values.
y : array-like
Target values.
"""
self.X = X
self.y = y
self.n = self.X.shape[0]
self.model = ExtraTreesRegressor(**self.params)
self.model.fit(X, y)
def exrf(train_sample, validation_sample, features, seed):
log_base = np.e
exrf_est = ExtraTreesRegressor(n_estimators=1000,
criterion='mse',
max_features='auto',
max_depth=None,
bootstrap=True,
min_samples_split=4,
min_samples_leaf=1,
min_weight_fraction_leaf=0,
max_leaf_nodes=None,
random_state=seed
).fit(
train_sample[features], np.log1p(train_sample['volume']) / np.log(log_base))
exrf_prob = np.power(log_base, exrf_est.predict(validation_sample[features])) - 1
print_mape(validation_sample['volume'], exrf_prob, 'EXTRA-RF')
return exrf_prob
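print_mape is not shown in this snippet; a plausible helper consistent with how it is called here (a sketch, not the original code) would be:
import numpy as np

def print_mape(y_true, y_pred, model_name):
    # mean absolute percentage error, reported as a percentage
    mape = np.mean(np.abs((np.asarray(y_true) - np.asarray(y_pred)) / np.asarray(y_true))) * 100
    print('%s MAPE: %.4f%%' % (model_name, mape))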
def try_params( n_iterations, params ):
n_estimators = int( round( n_iterations * trees_per_iteration ))
print("n_estimators:", n_estimators)
pprint( params )
clf = XT( n_estimators = n_estimators, verbose = 0, n_jobs = -1, **params )
return train_and_eval_sklearn_regressor( clf, data )
def train(self):
""""""
print('size before truncated outliers is %d ' % len(self.TrainData))
self.TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) & (self.TrainData['logerror'] < self._up)]
print('size after truncated outliers is %d ' % len(self.TrainData))
X = self.TrainData.drop(self._l_drop_cols, axis=1)
Y = self.TrainData['logerror']
self._l_train_columns = X.columns
FeatCols = list(self._l_train_columns)
etr = ExtraTreesRegressor(
n_estimators= self._iter,
criterion= 'mse',
max_features= int(math.sqrt(len(FeatCols))),
max_depth = self._depth,
n_jobs= 2,
random_state= 2017,
verbose= True
)
self._model = etr.fit(X, Y)
## evaluate on valid data
self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
datetime.now().strftime('%Y%m%d-%H:%M:%S'))
with open(self._f_eval_train_model, 'wb') as o_file:
pickle.dump(self._model, o_file, -1)
o_file.close()
self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
ignore_index=True) ## ignore_index=True resets the index so the train/valid indices do not overlap
return
def iterative_fit(self, X, y, n_iter=1, refit=False):
from sklearn.ensemble import ExtraTreesRegressor as ETR
if refit:
self.estimator = None
if self.estimator is None:
num_features = X.shape[1]
max_features = int(
float(self.max_features) * (np.log(num_features) + 1))
# Use at most half of the features
max_features = max(1, min(int(X.shape[1] / 2), max_features))
self.estimator = ETR(
n_estimators=0, criterion=self.criterion,
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
bootstrap=self.bootstrap,
max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
oob_score=self.oob_score, n_jobs=self.n_jobs,
verbose=self.verbose,
random_state=self.random_state,
warm_start=True
)
tmp = self.estimator # TODO copy ?
tmp.n_estimators += n_iter
tmp.fit(X, y,)
self.estimator = tmp
return self
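The key to this iterative_fit is warm_start=True: raising n_estimators and calling fit again grows the existing forest instead of retraining from scratch. A small standalone sketch of that pattern (illustrative data and names):
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

X = np.random.rand(100, 5)
y = np.random.rand(100)

est = ExtraTreesRegressor(n_estimators=10, warm_start=True, random_state=0)
for _ in range(5):
    est.fit(X, y)           # fits trees up to the current n_estimators
    est.n_estimators += 10  # the next fit() call will add 10 more trees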
def __init__(self, **params):
"""
Wrapper around sklearn's ExtraTreesRegressor implementation for pyGPGO.
Random Forests can also be used for surrogate models in Bayesian Optimization.
An estimate of 'posterior' variance can be obtained by using the `impurity`
criterion value in each subtree.
Parameters
----------
params: dict, optional
Any parameters to pass to `ExtraTreesRegressor`. Defaults to sklearn's defaults.
"""
self.params = params
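As a rough illustration of the 'posterior' spread mentioned in the docstring, one simple approach (a sketch; pyGPGO's actual implementation may differ) is to use the spread of per-tree predictions:
import numpy as np

def predict_with_spread(model, Xstar):
    # per-tree predictions from a fitted ExtraTreesRegressor
    per_tree = np.stack([tree.predict(Xstar) for tree in model.estimators_])
    # the mean acts as the surrogate prediction, the std as a crude uncertainty estimate
    return per_tree.mean(axis=0), per_tree.std(axis=0)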
def fit(self, X, y):
self.clf = ExtraTreesRegressor()
#y = np.log(y)
self.clf.fit(X, y)
def __init__(self, conf, model=None):
self.conf = conf
self.name = "ETR"
if model is None:
self.model = ExtraTreesRegressor(n_jobs=4)
else:
self.model = model
def get_models4ensamble(conf):
models = []
#models = [RFRModel(conf), DLModel(conf), LRModel(conf)]
#models = [LRModel(conf)]
# see http://scikit-learn.org/stable/modules/linear_model.html
#0 was too big to run with depth set to 1, and 1 was overfitting a bit
if conf.command == 1:
xgb_params = {"objective": "reg:linear", "booster":"gbtree", "max_depth":3, "eta":0.1, "min_child_weight":5,
"subsample":0.5, "nthread":4, "colsample_bytree":0.5, "num_parallel_tree":1, 'gamma':0}
else:
xgb_params = {"objective": "reg:linear", "booster":"gbtree", "max_depth":10, "eta":0.1, "min_child_weight":8,
"subsample":0.5, "nthread":4, "colsample_bytree":0.5, "num_parallel_tree":1, 'gamma':0}
#xgb_params = {"objective": "reg:linear", "booster":"gbtree", "max_depth":10, "eta":0.1, "min_child_weight":8,
# "subsample":0.5, "nthread":4, "colsample_bytree":0.5, "num_parallel_tree":1, 'gamma':0}
models = [
#DLModel(conf),
#LRModel(conf, model=linear_model.BayesianRidge()),
#LRModel(conf, model=linear_model.LassoLars(alpha=.1)),
#LRModel(conf, model=linear_model.Lasso(alpha = 0.1)),
#LRModel(conf, model=Pipeline([('poly', PolynomialFeatures(degree=3)),
#LRModel(conf, model=linear_model.Ridge (alpha = .5))
# ('linear', LinearRegression(fit_intercept=False))])),
XGBoostModel(conf, xgb_params, use_cv=True),
LRModel(conf, model=linear_model.Lasso(alpha = 0.3)),
RFRModel(conf, RandomForestRegressor(oob_score=True, n_jobs=4)),
#LRModel(conf, model=linear_model.Lasso(alpha = 0.2)),
ETRModel(conf, model=ExtraTreesRegressor(n_jobs=4)),
#AdaBoostRModel(conf, model=AdaBoostRegressor(loss='square'))
]
return models
#return [XGBoostModel(conf, xgb_params, use_cv=True)]
def __init__(self, n_action, gamma=0.99):
self.Q = [ExtraTreesRegressor(n_estimators=50) for _ in range(n_action)]
self.n_action = n_action
self.gamma = gamma
self.first_time = True
def featureImp(dataset1):
import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import ExtraTreesRegressor
import collections
#f = open('F:\kaggle\Final Project\\Book.txt')
# f.readline() # skip the header
#dataset = np.loadtxt(fname=f, delimiter=',')
# dataset = datasets.load_iris()
# fit an Extra Trees model to the data
# print(dataset)
mapElement = {}
X = dataset1[:, 1:406]
Y = dataset1[:, 0]
num_trees = 10
max_feature = 7
model = ExtraTreesRegressor(n_estimators=num_trees, max_features=max_feature)
model.fit(X, Y)
z = model.feature_importances_
#print("first", z.item(0))
for i in range(len(z)):
mapElement[z.item(i)] = (i + 1)
# od = collections.OrderedDict(sorted(mapElement.items()))
p = sorted(mapElement)
#print(p)
result = []
for i in range(len(p)):
result.append(mapElement.get(p[(len(p) - 1) - i]))
return (result)
#print(result)
# print(type(od))
#print(mapElement)
# print(od)
# model.fit(dataset.data, dataset.target)
# display the relative importance of each attribute
#print(model.feature_importances_)
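Note that mapElement keys on the importance value itself, so features with identical importances overwrite each other. An equivalent ranking that also handles ties (a suggested alternative, not the original code):
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

def feature_ranking(X, Y, n_estimators=10, max_features=7):
    model = ExtraTreesRegressor(n_estimators=n_estimators, max_features=max_features)
    model.fit(X, Y)
    # descending order of importance; +1 matches the 1-based column numbering used above
    return (np.argsort(model.feature_importances_)[::-1] + 1).tolist()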
def train(self):
print("start ert")
self.model = ExtraTreesRegressor(n_jobs=self.prms["n_jobs"],
verbose=1,
random_state=self.prms["random_state"],
n_estimators=int(self.prms["n_estimators"]),
max_features=self.prms["max_features"])
self.model.fit(self.data_tr.values, self.labels_tr)
def test(x_file, y_file, train_list, test_list, best_params):
X1 = np.loadtxt(x_file, delimiter=",")
Y1 = np.loadtxt(y_file, delimiter=",")
train_X, train_Y, test_X, _ = split_train_val(X1, Y1, train_list, test_list)
# print train_X.shape,test_X.shape
EXT1 = ExtraTreesRegressor(n_jobs=-1, random_state=1, **best_params)
EXT1.fit(train_X, train_Y)
test_Y1 = EXT1.predict(test_X)
# print EXT1.feature_importances_
return test_Y1
def predict(x_file, y_file, test_x_file, best_params):
X1 = np.loadtxt(x_file, delimiter=",")
Y1 = np.loadtxt(y_file, delimiter=",")
test_X1 = np.loadtxt(test_x_file, delimiter=",")
EXT1 = ExtraTreesRegressor(n_jobs=-1, random_state=1, **best_params)
EXT1.fit(X1, Y1)
test_Y1 = EXT1.predict(test_X1) # * NOR.scale_ + NOR.mean_
# print test_Y1
# print EXT1.feature_importances_
return test_Y1
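Both helpers above assume a best_params dict produced elsewhere; a minimal sketch of how such a dict could be obtained with a grid search (an assumption about the missing piece, not the original code):
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV

X = np.random.rand(200, 20)
y = np.random.rand(200)

search = GridSearchCV(
    ExtraTreesRegressor(n_jobs=-1, random_state=1),
    param_grid={'n_estimators': [100, 300], 'max_features': ['sqrt', 0.5]},
    cv=5,
    scoring='neg_mean_squared_error',
)
search.fit(X, y)
best_params = search.best_params_  # e.g. passed into test() / predict() above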
def get_model_list():
model_list, name_list = [], []
# model_list.append(linear_model.LinearRegression())
# name_list.append('LR')
# model_list.append(gaussian_process.GaussianProcessRegressor(alpha=1e-10))
# name_list.append('GaussianProcess')
# model_list.append(KNeighborsRegressor(weights = 'uniform',n_neighbors=28))
# name_list.append('KNN_unif')
#
# model_list.append(KNeighborsRegressor(weights = 'distance',n_neighbors=28))
# name_list.append('KNN_dist')
#
# model_list.append(SVR(kernel = 'poly', C = 1, gamma = 'auto', coef0 = 0, degree = 2))
# name_list.append('SVR_poly')
# #
model_list.append(SVR(kernel = 'rbf', C = 0.3, gamma = 'auto'))
name_list.append('SVR_rbf')
# #
# model_list.append(DecisionTreeRegressor())
# name_list.append('DT')
#
# model_list.append(RandomForestRegressor(n_estimators=150, max_depth=None,min_samples_split=2, random_state=0))
# name_list.append('RF')
#
# model_list.append(ExtraTreesRegressor(n_estimators=150, max_depth=None, max_features='auto', min_samples_split=2, random_state=0))
# name_list.append('ET')
return model_list,name_list
#MAPE
def models():
extra_params_kaggle_cla = {'n_estimators':1200,'max_features':30,'criterion':'entropy',
'min_samples_leaf': 2, 'min_samples_split': 2,'max_depth': 30,
'n_jobs':nthread, 'random_state':seed}
extra_params_kaggle_reg = {'n_estimators':1200,'max_features':30,'criterion':'mse',
'min_samples_leaf': 2, 'min_samples_split': 2,'max_depth': 30,
'n_jobs':nthread, 'random_state':seed}
xgb_reg = {'objective':'reg:linear', 'max_depth': 11, 'learning_rate':0.01, 'subsample':.9,
'n_estimators':10000, 'colsample_bytree':0.45, 'nthread':nthread, 'seed':seed}
xgb_cla = {'objective':'binary:logistic', 'max_depth': 11, 'learning_rate':0.01, 'subsample':.9,
'n_estimators':10000, 'colsample_bytree':0.45, 'nthread':nthread, 'seed':seed}
#NN params
nb_epoch = 3
batch_size = 128
esr = 402
param1 = {
'hidden_units': (256, 256),
'activation': (advanced_activations.PReLU(),advanced_activations.PReLU(),core.activations.sigmoid),
'dropout': (0., 0.), 'optimizer': RMSprop(), 'nb_epoch': nb_epoch,
}
param2 = {
'hidden_units': (1024, 1024),
'activation': (advanced_activations.PReLU(),advanced_activations.PReLU(),core.activations.sigmoid),
'dropout': (0., 0.), 'optimizer': RMSprop(), 'nb_epoch': nb_epoch,
}
clfs = [
(D2, XGBClassifier(**xgb_cla)),
(D11, XGBClassifier(**xgb_cla)),
(D2, XGBRegressor(**xgb_reg)),
(D11, XGBRegressor(**xgb_reg)),
(D2, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
(D11, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
(D2, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),
(D11, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),
# (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
# (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param1)),
# (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param1)),
#
# (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2)),
# (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2)),
# (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2))
]
for clf in clfs:
yield clf
def _get_learner(self):
# xgboost
if self.learner_name in ["reg_xgb_linear", "reg_xgb_tree", "reg_xgb_tree_best_single_model"]:
return XGBRegressor(**self.param_dict)
if self.learner_name in ["clf_xgb_linear", "clf_xgb_tree"]:
return XGBClassifier(**self.param_dict)
# sklearn
if self.learner_name == "reg_skl_lasso":
return Lasso(**self.param_dict)
if self.learner_name == "reg_skl_ridge":
return Ridge(**self.param_dict)
if self.learner_name == "reg_skl_random_ridge":
return RandomRidge(**self.param_dict)
if self.learner_name == "reg_skl_bayesian_ridge":
return BayesianRidge(**self.param_dict)
if self.learner_name == "reg_skl_svr":
return SVR(**self.param_dict)
if self.learner_name == "reg_skl_lsvr":
return LinearSVR(**self.param_dict)
if self.learner_name == "reg_skl_knn":
return KNNRegressor(**self.param_dict)
if self.learner_name == "reg_skl_etr":
return ExtraTreesRegressor(**self.param_dict)
if self.learner_name == "reg_skl_rf":
return RandomForestRegressor(**self.param_dict)
if self.learner_name == "reg_skl_gbm":
return GradientBoostingRegressor(**self.param_dict)
if self.learner_name == "reg_skl_adaboost":
return AdaBoostRegressor(**self.param_dict)
# keras
if self.learner_name == "reg_keras_dnn":
try:
return KerasDNNRegressor(**self.param_dict)
except:
return None
# rgf
if self.learner_name == "reg_rgf":
return RGFRegressor(**self.param_dict)
# ensemble
if self.learner_name == "reg_ensemble":
return EnsembleLearner(**self.param_dict)
return None
def test_distribution():
rng = check_random_state(12321)
# Single variable with 4 values
X = rng.randint(0, 4, size=(1000, 1))
y = rng.rand(1000)
n_trees = 500
clf = ExtraTreesRegressor(n_estimators=n_trees, random_state=42).fit(X, y)
uniques = defaultdict(int)
for tree in clf.estimators_:
tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-")
for f, t in zip(tree.tree_.feature,
tree.tree_.threshold))
uniques[tree] += 1
uniques = sorted([(1. * count / n_trees, tree)
for tree, count in uniques.items()])
# On a single variable problem where X_0 has 4 equiprobable values, there
# are 5 ways to build a random tree. The more compact (0,1/0,0/--0,2/--) of
# them has probability 1/3 while the 4 others have probability 1/6.
assert_equal(len(uniques), 5)
assert_greater(0.20, uniques[0][0]) # Rough approximation of 1/6.
assert_greater(0.20, uniques[1][0])
assert_greater(0.20, uniques[2][0])
assert_greater(0.20, uniques[3][0])
assert_greater(uniques[4][0], 0.3)
assert_equal(uniques[4][1], "0,1/0,0/--0,2/--")
# Two variables, one with 2 values, one with 3 values
X = np.empty((1000, 2))
X[:, 0] = np.random.randint(0, 2, 1000)
X[:, 1] = np.random.randint(0, 3, 1000)
y = rng.rand(1000)
clf = ExtraTreesRegressor(n_estimators=100, max_features=1,
random_state=1).fit(X, y)
uniques = defaultdict(int)
for tree in clf.estimators_:
tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-")
for f, t in zip(tree.tree_.feature,
tree.tree_.threshold))
uniques[tree] += 1
uniques = [(count, tree) for tree, count in uniques.items()]
assert_equal(len(uniques), 8)
def get_model_list(task_name):
model_list, name_list = [], []
model_list.append(linear_model.LinearRegression())
name_list.append('LR')
#
model_list.append(linear_model.SGDRegressor())
name_list.append('LR_SGD')
model_list.append(linear_model.Lasso(alpha = 1.0))
name_list.append('Lasso')
model_list.append(linear_model.Ridge (alpha = 1.0))
name_list.append('Ridge')
model_list.append(linear_model.LassoLars(alpha=.1))
name_list.append('LassoLars')
model_list.append(linear_model.BayesianRidge())
name_list.append('BayesianRidge')
model_list.append(KernelRidge(alpha=1.0))
name_list.append('KernelRidge')
model_list.append(gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1))
name_list.append('GaussianProcess')
model_list.append(KNeighborsRegressor(weights = 'uniform',n_neighbors=3))
name_list.append('KNN_unif')
model_list.append(KNeighborsRegressor(weights = 'distance',n_neighbors=3))
name_list.append('KNN_dist')
model_list.append(SVR(kernel = 'linear', C = 1, gamma = 'auto', coef0 = 0, degree = 2))
name_list.append('SVM_linear')
model_list.append(SVR(kernel = 'poly', C = 1, gamma = 'auto', coef0 = 0, degree = 2))
name_list.append('SVM_poly')
model_list.append(SVR(kernel = 'rbf', C = 1, gamma = 'auto', coef0 = 0, degree = 2))
name_list.append('SVM_rbf')
model_list.append(DecisionTreeRegressor())
name_list.append('DT')
model_list.append(RandomForestRegressor(n_estimators=100, max_depth=None,min_samples_split=2, random_state=0))
name_list.append('RF')
model_list.append(ExtraTreesRegressor(n_estimators=100, max_depth=None, max_features='auto', min_samples_split=2, random_state=0))
name_list.append('ET')
return model_list, name_list
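These model/name lists are presumably consumed by an evaluation loop elsewhere in the project; a minimal sketch of such a loop (hypothetical, with made-up data) would be:
import numpy as np
from sklearn.model_selection import cross_val_score

X = np.random.rand(300, 10)
y = np.random.rand(300)

for model, name in zip(*get_model_list('regression')):
    scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    print('%s: RMSE %.4f' % (name, np.sqrt(-scores.mean())))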