def machine_learning_RF(x_train, y_train, x_test, y_test):
    import numpy as np
    from functools import reduce  # needed for reduce() on Python 3
    # Build a mask that drops any row containing NaNs in the features or the target
    mask = []
    for i in range(np.shape(x_train)[1]):
        mask.append(~np.isnan(x_train[:, i]))
    mask.append(~np.isnan(np.transpose(y_train)))
    mask = np.transpose(reduce(np.logical_and, mask))
    mask = mask.reshape(len(mask),)
    inputs = x_train[mask, :]
    targets = y_train[mask]
    # Same NaN filtering for the test features
    mask2 = []
    for i in range(np.shape(x_test)[1]):
        mask2.append(~np.isnan(x_test[:, i]))
    mask2 = np.transpose(reduce(np.logical_and, mask2))
    inputs_test = x_test[mask2, :]
    # End getting rid of NaNs
    # Set up the forest; n_estimators is the number of trees in the ensemble
    from sklearn.ensemble import RandomForestRegressor
    rfc_new = RandomForestRegressor(n_estimators=100, random_state=42, max_features=2)
    # Training
    rfc_new = rfc_new.fit(inputs, targets)
    # Predicting
    predicted_y = rfc_new.predict(inputs_test)
    print(rfc_new.feature_importances_)
    return y_test[mask2], predicted_y
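# Hedged sketch (not part of the original snippet): the column-by-column masking
# above can be written in one step with np.isnan(...).any(axis=1). Function and
# variable names here are illustrative; it assumes a 2-D x and 1-D y.
import numpy as np

def drop_nan_rows(x, y=None):
    """Keep only rows with no NaN in x (and, if given, no NaN in y)."""
    keep = ~np.isnan(x).any(axis=1)
    if y is not None:
        keep &= ~np.isnan(y).ravel()
        return x[keep], y[keep]
    return x[keep]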
def convert(model, feature_names, target):
    """Convert a random forest regressor model to protobuf format.
    Parameters
    ----------
    model : RandomForestRegressor
        A trained scikit-learn random forest model.
    feature_names: [str]
        Names of the input columns.
    target: str
        Name of the output column.
    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')
    _sklearn_util.check_expected_type(model, _ensemble.RandomForestRegressor)

    def is_rf_model(m):
        # Check that the fitted attribute exists before touching it
        if hasattr(m, 'estimators_') and m.estimators_ is not None:
            if len(m.estimators_) == 0:
                return False
            for t in m.estimators_:
                if not hasattr(t, 'tree_') or t.tree_ is None:
                    return False
            return True
        else:
            return False
    _sklearn_util.check_fitted(model, is_rf_model)
    return _MLModel(_convert_tree_ensemble(model, feature_names, target))
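# Hedged usage sketch (not from the original source): callers normally go through
# coremltools' public sklearn converter rather than the private helpers above.
# The exact entry point and argument names depend on the installed coremltools
# version; treat this as an assumption, not the library's guaranteed API.
import coremltools
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=4, random_state=0)
rf = RandomForestRegressor(n_estimators=20, random_state=0).fit(X, y)
mlmodel = coremltools.converters.sklearn.convert(rf, ['f0', 'f1', 'f2', 'f3'], 'target')
mlmodel.save('rf_regressor.mlmodel')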
def persist_pipelines(pipelines):
    Path('models').mkdir(exist_ok=True)
    fp_fmt = 'models/{}-{:%y-%m-%d}.pkl'
    now = dt.datetime.now()
    for pipe in pipelines:
        print(utils.pipeline_name(pipe))
        fp_name = fp_fmt.format(utils.pipeline_name(pipe), now)
        joblib.dump(pipe, fp_name)
        # Pickle fails to work on RandomForestRegressor
        # with open(fp_name, 'wb') as fp:
        #     pickle.dump(pipe, fp)
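# Hedged sketch (not part of the original code): reloading a pipeline persisted by
# persist_pipelines(). The file name below is purely illustrative.
import joblib

pipe = joblib.load('models/my_pipeline-24-01-31.pkl')  # hypothetical dump written above
print(pipe)  # ready for pipe.predict(...) on new data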
def model_cross_valid(X, Y):
    seed = 7
    # shuffle=True is required for random_state to take effect in recent scikit-learn
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

    def build_model(model_name):
        model = model_name()
        return model
    scoring = 'neg_mean_squared_error'
    # + random forest, boost, lstm, gbdt
    for model_name in [LinearRegression, ElasticNet]:
        # for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
        model = build_model(model_name)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        print(model_name, results.mean())
def __init__(self, isTrain):
    super(RegressionRandomForest, self).__init__(isTrain)
    # data preprocessing
    # self.dataPreprocessing()
    # Create random forest regression object
    self.model = RandomForestRegressor(max_features='sqrt', n_estimators=32, max_depth=39)
def __init__(self, nr_events, case_id_col, encoder_kwargs, cls_kwargs, cls_method="rf"):
    self.case_id_col = case_id_col
    self.nr_events = nr_events
    self.encoder = SequenceEncoder(nr_events=nr_events, case_id_col=case_id_col, **encoder_kwargs)
    if cls_method == "gbm":
        self.cls = GradientBoostingRegressor(**cls_kwargs)
    elif cls_method == "rf":
        self.cls = RandomForestRegressor(**cls_kwargs)
    else:
        print("Classifier method not known")
def fastLapModel(xList, labels, names, multiple=0, full_set=0):
    X = numpy.array(xList)
    y = numpy.array(labels)
    featureNames = []
    featureNames = numpy.array(names)
    # take fixed holdout set 30% of data rows
    xTrain, xTest, yTrain, yTest = train_test_split(
        X, y, test_size=0.30, random_state=531)
    # for final model (no CV)
    if full_set:
        xTrain = X
        yTrain = y
    check_set(xTrain, xTest, yTrain, yTest)
    print("Fitting the model to the data set...")
    # train random forest at a range of ensemble sizes in order to see how the
    # mse changes
    mseOos = []
    m = 10 ** multiple
    nTreeList = range(500 * m, 1000 * m, 100 * m)
    # iTrees = 10000
    for iTrees in nTreeList:
        depth = None
        maxFeat = int(numpy.sqrt(numpy.shape(xTrain)[1])) + 1  # try tweaking
        RFmd = ensemble.RandomForestRegressor(n_estimators=iTrees, max_depth=depth, max_features=maxFeat,
                                              oob_score=False, random_state=531, n_jobs=-1)
        # RFmd.n_features = 5
        RFmd.fit(xTrain, yTrain)
        # Accumulate mse on test set
        prediction = RFmd.predict(xTest)
        mseOos.append(mean_squared_error(yTest, prediction))
    # plot training and test errors vs number of trees in ensemble
    plot.plot(nTreeList, mseOos)
    plot.xlabel('Number of Trees in Ensemble')
    plot.ylabel('Mean Squared Error')
    # plot.ylim([0.0, 1.1*max(mseOob)])
    plot.show()
    print("MSE")
    print(mseOos[-1])
    return xTrain, xTest, yTrain, yTest, RFmd
def fit_forest(X, y, window=100000, estimators=100,
               samples_leaf=250, validate=True):
    '''
    Fits Random Forest
    '''
    model = RandomForestRegressor(n_estimators=estimators,
                                  min_samples_leaf=samples_leaf,
                                  random_state=42,
                                  n_jobs=-1)
    if validate:
        return cross_validate(X, y, model, window)
    return model.fit(X, y)
def __init__(self, **params):
    """
    Wrapper around sklearn's Random Forest implementation for pyGPGO.
    Random Forests can also be used for surrogate models in Bayesian Optimization.
    An estimate of 'posterior' variance can be obtained by using the `impurity`
    criterion value in each subtree.

    Parameters
    ----------
    params: dict, optional
        Any parameters to pass to `RandomForestRegressor`. Defaults to sklearn's.
    """
    self.params = params
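# Hedged sketch (an assumption, not pyGPGO's actual implementation): the 'posterior'
# variance mentioned in the docstring is often approximated from the spread of the
# individual trees' predictions; the impurity-based estimate is not reproduced here.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=300, n_features=5, noise=10.0, random_state=0)
forest = RandomForestRegressor(n_estimators=200, random_state=0).fit(X, y)

per_tree = np.stack([tree.predict(X[:5]) for tree in forest.estimators_])
print(per_tree.mean(axis=0))  # surrogate mean at the query points
print(per_tree.std(axis=0))   # surrogate uncertainty at the query points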
def generate_RF_model(file_name):
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby|hometown|residence')
    train_np = selected_train_df.values  # .as_matrix() was removed in recent pandas
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print('Train Random Forest Regression Model...')
    start_time = datetime.datetime.now()
    rf = RandomForestRegressor(n_estimators=25, n_jobs=-1)  # , class_weight='balanced')
    rf.fit(X, y)
    end_time = datetime.datetime.now()
    print('Training Done..., Time Cost: ')
    print((end_time - start_time).seconds)
    print('Save Model...')
    joblib.dump(rf, 'RF.model')
    return rf
def rf_from_cfg(cfg, seed):
    """
    Creates a random forest regressor from sklearn and fits the given data on it.
    This is the function-call we try to optimize. Chosen values are stored in
    the configuration (cfg).

    Parameters
    ----------
    cfg: Configuration
        configuration chosen by smac
    seed: int or RandomState
        used to initialize the rf's random generator

    Returns
    -------
    np.mean(rmses): float
        mean of root mean square errors of random-forest test predictions
        per cv-fold
    """
    rfr = RandomForestRegressor(
        n_estimators=cfg["num_trees"],
        criterion=cfg["criterion"],
        min_samples_split=cfg["min_samples_to_split"],
        min_samples_leaf=cfg["min_samples_in_leaf"],
        min_weight_fraction_leaf=cfg["min_weight_frac_leaf"],
        max_features=cfg["max_features"],
        max_leaf_nodes=cfg["max_leaf_nodes"],
        bootstrap=cfg["do_bootstrapping"],
        random_state=seed)

    def rmse(y, y_pred):
        return np.sqrt(np.mean((y_pred - y) ** 2))
    # Root-mean-square-error scorer for sklearn's cross-validation
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    score = cross_val_score(rfr, boston.data, boston.target, cv=11, scoring=rmse_scorer)
    return -1 * np.mean(score)  # Because cross_val_score sign-flips the score
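# Hedged smoke test (an assumption about usage, not SMAC itself): cfg is only read
# with mapping-style access above, so a plain dict with the same keys can stand in
# for a SMAC Configuration. It still relies on the module-level `boston` dataset
# used by rf_from_cfg.
example_cfg = {
    "num_trees": 50,
    "criterion": "squared_error",   # use "mse" on older scikit-learn releases
    "min_samples_to_split": 2,
    "min_samples_in_leaf": 1,
    "min_weight_frac_leaf": 0.0,
    "max_features": 0.5,
    "max_leaf_nodes": 100,
    "do_bootstrapping": True,
}
print(rf_from_cfg(example_cfg, seed=42))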
def train(self, x, y,
          n_estimators=10,
          max_depth=None,
          min_samples_leaf=1):
    n_estimators = self.to_int(n_estimators)
    max_depth = self.to_int(max_depth)
    min_samples_leaf = self.pos_int(min_samples_leaf)
    if self.problem_type == ProblemType.BINARY_CLAS:
        self.model = RandomForestClassifier(n_estimators,
                                            max_depth=max_depth,
                                            min_samples_leaf=min_samples_leaf)
    elif self.problem_type == ProblemType.REGRESSION:
        self.model = RandomForestRegressor(n_estimators,
                                           max_depth=max_depth,
                                           min_samples_leaf=min_samples_leaf)
    else:
        raise NotImplementedError('Problem type {0} not implemented'.format(self.problem_type))
    self.model.fit(x, y)
def define_model(self):
    # if self.modeltype == "AR":
    #     return statsmodels.tsa.ar_model.AR(max_order=self.parameters['max_order'])
    if self.modeltype == "RandomForest":
        return ensemble.RandomForestRegressor(n_estimators=self.parameters['n_estimators'])
        # return ensemble.RandomForestClassifier(
        #     n_estimators=self.parameters['n_estimators'])
    elif self.modeltype == "LinearRegression":
        return linear_model.LinearRegression()
    elif self.modeltype == "Lasso":
        return linear_model.Lasso(
            alpha=self.parameters['alpha'])
    elif self.modeltype == "ElasticNet":
        return linear_model.ElasticNet(
            alpha=self.parameters['alpha'],
            l1_ratio=self.parameters['l1_ratio'])
    elif self.modeltype == "SVR":
        return SVR(
            C=self.parameters['C'],
            epsilon=self.parameters['epsilon'],
            kernel=self.parameters['kernel'])
    # elif self.modeltype == 'StaticModel':
    #     return StaticModel(
    #         parameters=self.parameters
    #     )
    # elif self.modeltype == 'AdvancedStaticModel':
    #     return AdvancedStaticModel(
    #         parameters=self.parameters
    #     )
    # elif self.modeltype == 'SGDRegressor':
    #     print(self.parameters)
    #     return linear_model.SGDRegressor(
    #         loss=self.parameters['loss'],
    #         penalty=self.parameters['penalty'],
    #         l1_ratio=self.parameters['l1_ratio'])
    else:
        raise ConfigError("Unsupported model {0}".format(self.modeltype))
def predict(self, X, return_std=False):
    if return_std:
        trees = self.estimators_
        y = np.concatenate([tree.predict(X)[np.newaxis, :] for tree in trees], axis=0)
        mean = y.mean(axis=0)
        std = y.std(axis=0)
        return mean, std
    else:
        return super(RandomForestRegressor, self).predict(X)
def greedy_elim(df):
    # do feature selection (RFE)
    X = df[[x for x in df.columns if x != 'SalePrice']]
    y = df['SalePrice']
    # model = RandomForestRegressor(n_estimators=50)
    model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.05)
    # 150 features seems to be the best at the moment. Why this is so is unclear.
    feat_selector = RFE(estimator=model, step=1, n_features_to_select=150)
    # find all relevant features
    feat_selector.fit_transform(X.values, y.values)  # .as_matrix() was removed in recent pandas
    # check selected features
    features_bool = np.array(feat_selector.support_)
    features = np.array(X.columns)
    result = features[features_bool]
    # print(result)
    # check ranking of features
    features_rank = feat_selector.ranking_
    # print(features_rank)
    rank = features_rank[features_bool]
    # print(rank)
    return result
def model_fit_and_test(TrainX, TrainY, TestX, TestY):
    def build_model(model_name):
        model = model_name()
        return model
    # for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
    for model_name in [LinearRegression, ElasticNet]:
        model = build_model(model_name)
        model.fit(TrainX, TrainY)
        print(model_name)
        resid = model.predict(TestX) - TestY
        # print(resid)
        print("Residual sum of squares: %f" % np.mean(resid ** 2))
        # print(model.predict(TestX))
        # print(TestY)
        # Explained variance score: 1 is perfect prediction
        plt.scatter(model.predict(TestX), resid)
        plt.axhline(0, color='red')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        # plt.xlim([1, 50])
        plt.show()
        print('Variance score: %.2f' % model.score(TestX, TestY))
        from statsmodels.stats.stattools import jarque_bera
        _, pvalue, _, _ = jarque_bera(resid)
        print("Test Residuals Normal", pvalue)
        from statsmodels import regression, stats
        import statsmodels.api as sms
        import statsmodels.stats.diagnostic as smd
        # xs_with_constant = sms.add_constant(np.column_stack((X1, X2, X3, X4)))
        xs_with_constant = sms.add_constant(TestX)
        _, pvalue1, _, _ = stats.diagnostic.het_breuschpagan(resid, xs_with_constant)
        print("Test Heteroskedasticity", pvalue1)
        ljung_box = smd.acorr_ljungbox(resid, lags=10)
        # print("Lagrange Multiplier Statistics:", ljung_box[0])
        print("Test Autocorrelation P-values:", ljung_box[1])
        if any(ljung_box[1] < 0.05):
            print("The residuals are autocorrelated.")
        else:
            print("The residuals are not autocorrelated.")
def __init__(self, **params):
    """
    Wrapper around sklearn's ExtraTreesRegressor implementation for pyGPGO.
    Random Forests can also be used for surrogate models in Bayesian Optimization.
    An estimate of 'posterior' variance can be obtained by using the `impurity`
    criterion value in each subtree.

    Parameters
    ----------
    params: dict, optional
        Any parameters to pass to `ExtraTreesRegressor`. Defaults to sklearn's.
    """
    self.params = params
def random_forest_grid_search():
    random_forest_grid = {
        'n_estimators': [50, 100, 1000],
        'max_features': ['sqrt', 'log2', 'auto'],
        'min_samples_split': [2, 4],
        'min_samples_leaf': [1, 2],
    }
    rf = RandomForestRegressor()
    return random_forest_grid, rf
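# Hedged usage sketch (not part of the original snippet): the grid and estimator
# returned above are presumably fed into a search object elsewhere; this shows one
# way to wire them into GridSearchCV on synthetic data. Note that the 'auto' value
# in the grid is only accepted by older scikit-learn releases.
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV

X, y = make_regression(n_samples=500, n_features=8, noise=10.0, random_state=0)
param_grid, rf = random_forest_grid_search()
search = GridSearchCV(rf, param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
search.fit(X, y)
print(search.best_params_, search.best_score_)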
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor

    scikit_data = load_boston()
    scikit_model = RandomForestRegressor(random_state=1)
    scikit_model.fit(scikit_data['data'], scikit_data['target'])

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
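# Hedged variant (an assumption about the test's intent, not the original code):
# load_boston was removed in scikit-learn 1.2, so on recent releases an equivalent
# setup can train on the California housing data instead.
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor

scikit_data = fetch_california_housing()
scikit_model = RandomForestRegressor(random_state=1)
scikit_model.fit(scikit_data['data'], scikit_data['target'])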