def cross_validate_best_known():
    '''
    Cross-validate the three regressors with the best parameters found so far.

    Reads 'data/train.csv', passes it through cln.clean_all, separates the
    'SalePrice' column as the target and hands a RandomForest, a GradientBoost
    and an AdaBoost (backed by a DecisionTree) to validate.cross_v_scores,
    which is expected to report the scores.
    The hyper-parameters used here are the "best" ones found so far with a
    grid search.
    '''
    frame = cln.clean_all(pd.read_csv('data/train.csv'))
    # pop() removes the target column in place, so what remains in the frame
    # is the feature matrix.
    y = frame.pop('SalePrice')
    X = frame

    forest = RandomForestRegressor(
        max_features=2, min_samples_split=4, n_estimators=50, min_samples_leaf=2)
    boosted = GradientBoostingRegressor(
        loss='quantile', learning_rate=0.0001, n_estimators=50,
        max_features='log2', min_samples_split=2, max_depth=1)
    # Weak learner that AdaBoost builds upon.
    weak_tree = DecisionTreeRegressor(
        max_features='sqrt', splitter='random', min_samples_split=4, max_depth=3)
    ada = AdaBoostRegressor(weak_tree, learning_rate=0.1, loss='square', n_estimators=1000)

    validate.cross_v_scores([forest, boosted, ada], X, y)
    # Output noted by the original author (presumably from an earlier run):
    # RandomForestRegressor -- RMLSE: -0.596797712098, R2: 0.0272065373946
    # GradientBoostingRegressor -- RMLSE: -0.996134592541, R2: -2.37202164829
    # AdaBoostRegressor -- RMLSE: -0.706385708459, R2: -0.103966980393
python类RandomForestRegressor()的实例源码
def test_stacked_regressor(self):
    """Check that a StackedRegressor reaches a test MSE below 6.0 on Friedman #1.

    The first 200 of 1200 generated samples are used for fitting, the
    remaining ones for evaluation.
    """
    n_train = 200
    meta_model = LinearRegression()
    base_models = [
        RandomForestRegressor(n_estimators=50, random_state=1),
        GradientBoostingRegressor(n_estimators=25, random_state=1),
        Ridge(random_state=1),
    ]

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_fit, y_fit = X[:n_train], y[:n_train]
    X_eval, y_eval = X[n_train:], y[n_train:]

    stacker = StackedRegressor(meta_model,
                               base_models,
                               n_folds=3,
                               verbose=0,
                               oob_score_flag=True)
    stacker.fit(X_fit, y_fit)

    predicted = stacker.predict(X_eval)
    assert_less(mean_squared_error(y_eval, predicted), 6.0)
def test_fwls_regressor(self):
    """Check that an FWLSRegressor reaches a test MSE below 6.0 on Friedman #1.

    The first 200 of 1200 generated samples are used for fitting, the
    remaining ones for evaluation.
    """
    def feature_func(x):
        # Constant feature function: an all-ones array shaped like the input.
        return np.ones(x.shape)

    n_train = 200
    meta_model = LinearRegression()
    base_models = [
        RandomForestRegressor(n_estimators=50, random_state=1),
        GradientBoostingRegressor(n_estimators=25, random_state=1),
        Ridge(random_state=1),
    ]

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_fit, y_fit = X[:n_train], y[:n_train]
    X_eval, y_eval = X[n_train:], y[n_train:]

    fwls = FWLSRegressor(meta_model,
                         base_models,
                         feature_func,
                         n_folds=3,
                         verbose=0,
                         oob_score_flag=True)
    fwls.fit(X_fit, y_fit)

    predicted = fwls.predict(X_eval)
    assert_less(mean_squared_error(y_eval, predicted), 6.0)
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    """Return a freshly built feature-selection model for the given names.

    Parameters
    ----------
    type_of_estimator : str
        Either 'classifier' or 'regressor'.
    model_name : str
        One of 'SelectFromModel', 'RFECV', 'GenericUnivariateSelect' or
        'KeepAll'. For 'KeepAll' the string 'KeepAll' itself is returned.

    Raises
    ------
    KeyError
        If either name is not present in the table below.
    """
    # Every entry is a zero-argument factory so that only the requested
    # selector is instantiated, instead of building all of them on each call.
    model_factories = {
        'classifier': {
            'SelectFromModel': lambda: SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': lambda: RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': lambda: GenericUnivariateSelect(),
            'KeepAll': lambda: 'KeepAll',
        },
        'regressor': {
            'SelectFromModel': lambda: SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': lambda: RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': lambda: GenericUnivariateSelect(),
            'KeepAll': lambda: 'KeepAll',
        },
    }
    return model_factories[type_of_estimator][model_name]()
def model_random_forecast(Xtrain,Xtest,ytrain):
    """Grid-search a random forest on the training data and predict for Xtest.

    The search uses 10-fold cross-validation with the module-level ``RMSE``
    scorer. The best parameters and the negated best CV score are printed.

    Returns
    -------
    tuple
        ``(predictions for Xtest, negated best cross-validation score)``.
    """
    forest = RandomForestRegressor(n_jobs=1, random_state=0)
    # Only n_estimators is searched; a wider grid kept from the original:
    # 'n_estimators': [1000], 'max_features': [10,15,20,25], 'max_depth':[20,20,25,25,]}
    grid = {'n_estimators': [1000]}
    search = GridSearchCV(estimator=forest, param_grid=grid, n_jobs=1, cv=10, scoring=RMSE)
    search.fit(Xtrain, ytrain)

    # The stored score is negated before it is reported and returned.
    best_cv_score = -search.best_score_
    print('Random forecast regression...')
    print('Best Params:')
    print(search.best_params_)
    print('Best CV Score:')
    print(best_cv_score)

    return search.predict(Xtest), best_cv_score
def rfr_feature_select():
    """Rank the Boston housing features by single-feature predictive power.

    For every feature column a small random forest is cross-validated on
    that column alone (3 shuffled splits, 30% held out, R^2 scoring). The
    list of ``(mean score rounded to 3 decimals, feature name)`` pairs is
    printed in descending order.
    """
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor
    # sklearn.cross_validation was replaced by sklearn.model_selection.
    from sklearn.model_selection import ShuffleSplit, cross_val_score

    boston = load_boston()
    X = boston["data"]
    Y = boston["target"]
    names = boston["feature_names"]

    rf = RandomForestRegressor(n_estimators=20, max_depth=4)
    scores = []
    for i in range(X.shape[1]):
        # A new splitter per feature, as in the original code.
        splitter = ShuffleSplit(n_splits=3, test_size=.3)
        # X[:, i:i + 1] keeps the column two-dimensional for the estimator.
        score = cross_val_score(rf, X[:, i:i + 1], Y, scoring="r2", cv=splitter)
        scores.append((round(np.mean(score), 3), names[i]))
    print(sorted(scores, reverse=True))
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    """Return a freshly built feature-selection model for the given names.

    Parameters
    ----------
    type_of_estimator : str
        Either 'classifier' or 'regressor'.
    model_name : str
        One of 'SelectFromModel', 'RFECV', 'GenericUnivariateSelect',
        'RandomizedSparse' or 'KeepAll'. For 'KeepAll' the string 'KeepAll'
        itself is returned.

    Raises
    ------
    KeyError
        If either name is not present in the table below.
    """
    # Every entry is a zero-argument factory so that only the requested
    # selector is instantiated, instead of building all of them on each call.
    model_factories = {
        'classifier': {
            'SelectFromModel': lambda: SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': lambda: RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': lambda: GenericUnivariateSelect(),
            'RandomizedSparse': lambda: RandomizedLogisticRegression(),
            'KeepAll': lambda: 'KeepAll',
        },
        'regressor': {
            'SelectFromModel': lambda: SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': lambda: RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': lambda: GenericUnivariateSelect(),
            'RandomizedSparse': lambda: RandomizedLasso(),
            'KeepAll': lambda: 'KeepAll',
        },
    }
    return model_factories[type_of_estimator][model_name]()
def rforest2(train, test, tunings=None, smoteit=True, duplicate=True):
    """Fit a random forest regressor on `train` and predict for `test`.

    Parameters
    ----------
    train, test
        Data sets accepted by ``SMOTE`` / ``formatData``.
    tunings : sequence, optional
        ``[n_estimators, max_features (in percent), min_samples_leaf,
        min_samples_split]``. When falsy, a default forest with 100 trees
        and ``random_state=1`` is used.
    smoteit : bool
        Resample the training data with ``SMOTE`` before fitting.
    duplicate : bool
        Forwarded to ``SMOTE`` as ``resample``.

    Returns
    -------
    The predictions of the fitted forest for the test data.
    """
    # Apply a random forest regressor to predict the number of bugs.
    if smoteit:
        train = SMOTE(train, atleast=50, atmost=101, resample=duplicate)

    if not tunings:
        clf = RandomForestRegressor(n_estimators=100, random_state=1)
    else:
        clf = RandomForestRegressor(
            n_estimators=int(tunings[0]),
            # Divide by a float: with an integer percentage, Python 2's
            # integer division would truncate the fraction to 0.
            max_features=tunings[1] / 100.0,
            min_samples_leaf=int(tunings[2]),
            min_samples_split=int(tunings[3]),
        )

    train_DF = formatData(train)
    test_DF = formatData(test)

    # The last two columns are not features; the second to last one is
    # used as the regression target.
    features = train_DF.columns[:-2]
    klass = train_DF[train_DF.columns[-2]]

    clf.fit(train_DF[features], klass)
    preds = clf.predict(test_DF[test_DF.columns[:-2]])
    return preds
def test_regressor(self):
    """Exercise JoblibedRegressor around a random forest on Friedman #1 data.

    A first wrapper is fitted and must reach a training MSE below 6.0. A
    second wrapper with a differently configured forest but the same name,
    cache directory and index must return matching predictions
    (presumably served from the cache -- confirm against JoblibedRegressor).
    """
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    index = list(range(200))

    first = JoblibedRegressor(RandomForestRegressor(), "rfr", cache_dir='')
    first.fit(X_train, y_train, index)
    prediction = first.predict(X_train, index)
    assert_less(mean_squared_error(y_train, prediction), 6.0)

    second = JoblibedRegressor(RandomForestRegressor(n_estimators=20), "rfr", cache_dir='')
    second.fit(X_train, y_train, index)
    prediction2 = second.predict(X_train, index)
    assert_allclose(prediction, prediction2)
def unscaled_pipelines():
    """Return one single-step pipeline per active model (no scaling step).

    Only the decision tree is currently enabled; the forest and boosting
    models are commented out, but their keyword dictionaries are still built.
    """
    # Random forest parameters
    random_forest_kwargs = dict(
        n_estimators=10,
        criterion='mse',
        random_state=_RANDOM_STATE,
        n_jobs=cpu_count(),
        verbose=True,
    )
    # Gradient boosting parameters
    gradient_boost_kwargs = dict(
        random_state=_RANDOM_STATE,
        verbose=1,
    )

    models = [
        DecisionTreeRegressor(max_depth=3, random_state=_RANDOM_STATE),
        # RandomForestRegressor(**random_forest_kwargs),
        # GradientBoostingRegressor(**gradient_boost_kwargs),
    ]

    # Each pipeline consists of the bare model only.
    return [make_pipeline(model) for model in models]
def parameterChoosing(self):
    """Grid-search random forest hyper-parameters and report the results.

    Fits a 5-fold ``GridSearchCV`` on ``self.X_train`` / ``self.y_train``,
    prints the best parameter set, the score of every grid point and the
    mean squared error on ``self.X_test`` / ``self.y_test``.
    """
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_depth': range(20,60),
                         'n_estimators': range(10,40),
                         'max_features': ['sqrt', 'log2', None]
                         }
                        ]

    # NOTE(review): the 'mean_squared_error' scorer name and the
    # `grid_scores_` attribute belong to older scikit-learn releases;
    # confirm the installed version still provides them.
    clf = GridSearchCV(RandomForestRegressor(n_estimators=30), tuned_parameters, cv=5, scoring='mean_squared_error')
    clf.fit(self.X_train, self.y_train.ravel())

    # print() with a single argument behaves the same on Python 2 and 3.
    print("Best parameters set found on development set:\n")
    print(clf.best_params_)
    print("Grid scores on development set:\n")
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params))

    print("MSE for test data set:\n")
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print(mean_squared_error(y_true, y_pred))
resnet_regressor.py 文件源码
项目:Brain_Tumor_Segmentation
作者: KarthikRevanuru
项目源码
文件源码
阅读 20
收藏 0
点赞 0
评论 0
def train_xgboost():
    """Train an XGBoost regressor that predicts survival from scan features.

    For every id in the module-level ``folder_names_train`` the four saved
    modality arrays (flair, t1, t1ce, t2) are loaded from ``training/`` and
    averaged over axis 0. The four results and the patient's age are
    concatenated column-wise into the feature matrix; the 'Survival' column
    of 'survival_data.csv' is the target.

    Returns
    -------
    The fitted ``xgb.XGBRegressor``.
    """
    df = pd.read_csv('survival_data.csv', index_col=0, encoding = 'UTF-7')

    def _modality_means(modality):
        # One axis-0 mean per training case for the given modality.
        return np.array([
            np.mean(np.load('training/%s_%s.nii.gz.npy' % (str(case_id), modality)), axis=0)
            for case_id in folder_names_train
        ])

    p = _modality_means('flair')
    q = _modality_means('t1')
    r = _modality_means('t1ce')
    s = _modality_means('t2')

    survival = []
    ages = []
    missing = 0
    for case_id in folder_names_train:
        key = str(case_id)
        try:
            # Look up both values before storing either one, so a partial
            # failure cannot leave the two lists with different lengths.
            survival_value = df.at[key, 'Survival']
            age_value = df.at[key, 'Age']
        except KeyError as e:
            missing += 1
            print (missing,str(e),"Label Not found, deleting entry")
            # The entry is kept with placeholder values rather than removed:
            # the target placeholder 0 comes from the original code, and the
            # age gets one too so it stays aligned with the feature rows.
            # NOTE(review): confirm whether such cases should be dropped.
            survival_value, age_value = 0, 0
        survival.append(survival_value)
        ages.append(age_value)

    y = np.array(survival, dtype=float)
    # Column vector so the ages can be appended as one more feature column.
    z = np.array(ages, dtype=float).reshape(-1, 1)

    x = np.concatenate((p, q, r, s, z), axis=1)

    #clf=linear_model.LogisticRegression(C=1e5)
    #clf = RandomForestRegressor()
    clf = xgb.XGBRegressor()
    clf.fit(x,y)
    return clf
def fit(self, X, y):
    """
    Train a Random Forest regressor on inputs `X` and targets `y`.

    The training data and its number of rows are kept on the instance
    (``self.X``, ``self.y``, ``self.n``) and the fitted estimator is
    stored as ``self.model``.

    Parameters
    ----------
    X : array-like
        Input values.
    y: array-like
        Target values.
    """
    self.X, self.y = X, y
    self.n = self.X.shape[0]
    # The forest is configured from the parameters held by this object.
    forest = RandomForestRegressor(**self.params)
    self.model = forest
    forest.fit(X, y)
def test_random_forest_regressor(self):
    """Compare scikit-learn and converted CoreML predictions per data type.

    For every dtype in ``self.number_data_type`` a forest is set up on the
    cast data, the converted spec is checked, and the prediction for the
    first sample is compared between both models. A ``RuntimeError`` is
    reported as an unsupported dtype instead of failing the test.
    """
    for dtype in self.number_data_type:
        data = self.scikit_data['data'].astype(dtype)
        target = self.scikit_data['target'].astype(dtype)
        scikit_model, spec = self._sklearn_setup(
            RandomForestRegressor(random_state=1), dtype, data, target)

        # A single sample, reshaped to the 2-D layout predict() expects.
        test_data = data[0].reshape(1, -1)
        self._check_tree_model(spec, 'multiArrayType', 'doubleType', 1)
        coreml_model = create_model(spec)

        try:
            expected = scikit_model.predict(test_data)[0]
            actual = coreml_model.predict({'data': test_data})['target']
            self.assertEqual(expected.dtype, type(actual))
            self.assertAlmostEqual(
                expected,
                actual,
                msg="{} != {} for Dtype: {}".format(expected, actual, dtype)
            )
        except RuntimeError:
            print("{} not supported. ".format(dtype))
test_random_forest_regression_numeric.py 文件源码
项目:coremltools
作者: apple
项目源码
文件源码
阅读 20
收藏 0
点赞 0
评论 0
def _train_convert_evaluate(self, **scikit_params):
    """
    Fit a scikit-learn random forest, convert it and evaluate the result.

    The forest is created with ``random_state=1`` plus `scikit_params` and
    fitted on ``self.X`` / ``self.target``. Returns whatever
    ``evaluate_regressor`` reports for the converted spec.
    """
    forest = RandomForestRegressor(random_state = 1, **scikit_params)
    forest.fit(self.X, self.target)

    # Conversion of the fitted model
    spec = skl_converter.convert(forest, self.feature_names, self.output_name)

    # Reference predictions, stored next to the input features
    frame = pd.DataFrame(self.X, columns=self.feature_names)
    frame['prediction'] = forest.predict(self.X)

    # Evaluation of the converted spec against the reference predictions
    return evaluate_regressor(spec, frame, verbose = False)
def build_ensemble(**kwargs):
    """Build a SuperLearner with one preprocessed layer and a boosting meta learner.

    `kwargs` are forwarded to the ``SuperLearner`` constructor. Estimators
    are grouped by the name of the preprocessing case they are paired with.
    """
    learner = SuperLearner(**kwargs)

    scaled = 'Standard Scaling'
    min_max = 'Min Max Scaling'
    untouched = 'No Preprocessing'

    preprocessors = {
        scaled: [StandardScaler()],
        min_max: [MinMaxScaler()],
        untouched: [],
    }
    estimators = {
        scaled: [ElasticNet(), Lasso(), KNeighborsRegressor()],
        min_max: [SVR()],
        untouched: [RandomForestRegressor(random_state=SEED),
                    GradientBoostingRegressor()],
    }

    learner.add(estimators, preprocessors)
    learner.add(GradientBoostingRegressor(), meta=True)
    return learner
def train_model(self, train_file_path, model_path):
    """Train a random forest on the given file and pickle model and scaler.

    Parameters
    ----------
    train_file_path
        Passed to ``self.load_file`` to obtain the features and targets.
    model_path : str
        Destination of the pickled forest. The fitted scaler is written
        next to it with '.pkl' replaced by '.scaler.pkl'.

    Returns
    -------
    The fitted ``RandomForestRegressor``.
    """
    print("==> Load the data ...")
    X_train, Y_train = self.load_file(train_file_path)
    print(train_file_path, shape(X_train))

    print("==> Train the model ...")
    scaler = preprocessing.MaxAbsScaler()
    X_scaled = scaler.fit_transform(X_train)
    clf = RandomForestRegressor(n_estimators=self.n_estimators)
    # toarray() densifies the scaled matrix; assumes load_file returns a
    # sparse matrix -- TODO confirm.
    clf.fit(X_scaled.toarray(), Y_train)

    print("==> Save the model ...")
    scaler_path = model_path.replace('.pkl', '.scaler.pkl')
    if scaler_path == model_path:
        # No '.pkl' in the path: without this fallback the scaler would be
        # written over the model file.
        scaler_path = model_path + '.scaler.pkl'

    # Context managers make sure both files are flushed and closed.
    with open(model_path, 'wb') as model_file:
        pickle.dump(clf, model_file)
    with open(scaler_path, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)
    return clf
def trainModel(featureCount, imageCount, save):
    """Generate features, process the training images and fit a forest.

    Parameters
    ----------
    featureCount
        Passed to ``generateFeatures``.
    imageCount : int
        Number of images to pass to ``train`` (indices 0 .. imageCount - 1).
    save : bool
        When true, the ``(clf, features)`` pair is dumped to "model.pkl".

    Returns
    -------
    tuple
        ``(fitted regressor, features)``.
    """
    clf = RandomForestRegressor(n_estimators=1, n_jobs=-1)
    features = generateFeatures(featureCount)

    for image in range(imageCount):
        # print() with a single argument behaves the same on Python 2 and 3.
        print("Image " + str(image))
        train(clf, features, image)

    # X and Y are module-level names; presumably train() fills them with
    # the samples of each image -- verify against train().
    clf = clf.fit(X, Y)

    model = (clf, features)
    if save:
        joblib.dump(model, "model.pkl")
    return model
def set_missing_ages(df):
    """Fill missing 'Age' values with random forest predictions.

    A forest is fitted on the rows with a known age, using 'Fare', 'Parch',
    'SibSp' and 'Pclass' as features, and its predictions replace the
    missing ages. `df` is modified in place.

    Returns
    -------
    tuple
        ``(df, fitted RandomForestRegressor)``.
    """
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    age_missing = df.Age.isnull()

    # .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
    known_age = age_df[~age_missing].values
    unknown_age = age_df[age_missing].values

    # Column 0 is the age (target), the remaining columns are the features.
    y = known_age[:, 0]
    X = known_age[:, 1:]

    # fit by RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # Predicting on zero rows raises, so skip the backfill when no age is missing.
    if len(unknown_age):
        # predict the unknown ages and backfill them
        predictedAges = rfr.predict(unknown_age[:, 1:])
        df.loc[age_missing, 'Age'] = predictedAges
    return df, rfr
def set_missing_ages(df):
    """Fill missing 'Age' values with random forest predictions.

    A forest is fitted on the rows with a known age, using 'Fare', 'Parch',
    'SibSp' and 'Pclass' as features, and its predictions replace the
    missing ages. `df` is modified in place.

    Returns
    -------
    tuple
        ``(df, fitted RandomForestRegressor)``.
    """
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    age_missing = df.Age.isnull()

    # .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
    known_age = age_df[~age_missing].values
    unknown_age = age_df[age_missing].values

    # Column 0 is the age (target), the remaining columns are the features.
    y = known_age[:, 0]
    X = known_age[:, 1:]

    # fit by RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # Predicting on zero rows raises, so skip the backfill when no age is missing.
    if len(unknown_age):
        # predict the unknown ages and backfill them
        predictedAges = rfr.predict(unknown_age[:, 1:])
        df.loc[age_missing, 'Age'] = predictedAges
    return df, rfr
# processing the column : Cabin
def test_gbrt_base_estimator():
    """Check base-estimator validation of GradientBoostingQuantileRegressor.

    A non-GBRT base estimator and a GBRT without quantile loss must both be
    rejected with a ValueError at fit time. With a quantile-loss GBRT the
    mean predicted quantiles of standard-normal targets must match the
    normal quantile function to two decimals.
    """
    rng = np.random.RandomState(1)
    n_samples = 10000
    X = np.ones((n_samples, 1))
    y = rng.normal(size=n_samples)

    # Wrong estimator type
    wrong_type = GradientBoostingQuantileRegressor(base_estimator=RandomForestRegressor())
    assert_raise_message(ValueError, 'type GradientBoostingRegressor',
                         wrong_type.fit, X, y)

    # Right type, but not configured with the quantile loss
    wrong_loss = GradientBoostingQuantileRegressor(base_estimator=GradientBoostingRegressor())
    assert_raise_message(ValueError, 'quantile loss', wrong_loss.fit, X, y)

    # Valid configuration
    quantile_gbrt = GradientBoostingRegressor(loss='quantile', n_estimators=20)
    rgr = GradientBoostingQuantileRegressor(base_estimator=quantile_gbrt)
    rgr.fit(X, y)

    estimates = rgr.predict(X, return_quantiles=True)
    assert_almost_equal(stats.norm.ppf(rgr.quantiles),
                        np.mean(estimates, axis=0),
                        decimal=2)
def rf1(train2, y, test2, v, z):
    """Cross-validated random forest; results go into `v` and `z` in place.

    Out-of-fold predictions are accumulated in column 'rf1' of `v`; the
    log1p of the test predictions is summed into column 'rf1' of `z` and
    divided by the number of splits at the end. Per-fold and overall log
    losses are printed.
    """
    # The column name is this function's own name.
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0

    N_splits = 300
    fold_losses = []
    folds = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)

    for step, (fit_idx, val_idx) in enumerate(folds.split(train2, y), start=1):
        print('step %d of %d'%(step, folds.n_splits), now())
        forest = ensemble.RandomForestRegressor(n_estimators=1000,
                                                max_depth=3,
                                                random_state=13)
        forest.fit(train2[fit_idx], y[fit_idx])

        fold_pred = forest.predict(train2[val_idx])
        v.loc[val_idx, cname] += fold_pred
        fold_loss = metrics.log_loss(y[val_idx], fold_pred)

        z[cname] += np.log1p(forest.predict(test2))
        print(cname, 'step %d: score'%(step), fold_loss, now())
        fold_losses.append(fold_loss)

    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(fold_losses)
    print(cv, cv.mean(), cv.std())

    # Turn the accumulated sum over folds into a mean.
    z[cname] /= N_splits
def test_RandomForestRegressor_num(*data):
    '''
    Plot training and testing scores of a random forest for a range of
    n_estimators values (1, 3, ..., 99).

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    estimator_counts = np.arange(1, 100, step=2)

    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    train_curve, test_curve = [], []
    for count in estimator_counts:
        forest = ensemble.RandomForestRegressor(n_estimators=count)
        forest.fit(X_train, y_train)
        train_curve.append(forest.score(X_train, y_train))
        test_curve.append(forest.score(X_test, y_test))

    axes.plot(estimator_counts, train_curve, label="Training Score")
    axes.plot(estimator_counts, test_curve, label="Testing Score")
    axes.set_xlabel("estimator num")
    axes.set_ylabel("score")
    axes.legend(loc="lower right")
    axes.set_ylim(-1, 1)
    plt.suptitle("RandomForestRegressor")
    plt.show()
def test_RandomForestRegressor_max_depth(*data):
    '''
    Plot training and testing scores of a random forest for max_depth
    values from 1 to 19.

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    depths = range(1, 20)

    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    train_curve, test_curve = [], []
    for depth in depths:
        forest = ensemble.RandomForestRegressor(max_depth=depth)
        forest.fit(X_train, y_train)
        train_curve.append(forest.score(X_train, y_train))
        test_curve.append(forest.score(X_test, y_test))

    axes.plot(depths, train_curve, label="Training Score")
    axes.plot(depths, test_curve, label="Testing Score")
    axes.set_xlabel("max_depth")
    axes.set_ylabel("score")
    axes.legend(loc="lower right")
    axes.set_ylim(0, 1.05)
    plt.suptitle("RandomForestRegressor")
    plt.show()
def test_RandomForestRegressor_max_features(*data):
    '''
    Plot training and testing scores of a random forest for max_features
    fractions taken from np.linspace(0.01, 1.0).

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    fractions = np.linspace(0.01, 1.0)

    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    train_curve, test_curve = [], []
    for fraction in fractions:
        forest = ensemble.RandomForestRegressor(max_features=fraction)
        forest.fit(X_train, y_train)
        train_curve.append(forest.score(X_train, y_train))
        test_curve.append(forest.score(X_test, y_test))

    axes.plot(fractions, train_curve, label="Training Score")
    axes.plot(fractions, test_curve, label="Testing Score")
    axes.set_xlabel("max_feature")
    axes.set_ylabel("score")
    axes.legend(loc="lower right")
    axes.set_ylim(0, 1.05)
    plt.suptitle("RandomForestRegressor")
    plt.show()
def rf(train_sample, validation_sample, features, seed):
    """Fit a random forest on log-transformed 'volume' and predict volumes.

    The forest is trained on ``log1p(volume)`` expressed in base
    ``log_base``; predictions for `validation_sample` are mapped back with
    the inverse transform, passed to ``print_mape`` under the label 'RF',
    and returned.
    """
    log_base = np.e
    log_target = np.log1p(train_sample['volume']) / np.log(log_base)

    forest = RandomForestRegressor(
        n_estimators=500,
        criterion='mse',
        max_features=4,
        max_depth=None,
        bootstrap=True,
        min_samples_split=4,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0,
        max_leaf_nodes=None,
        random_state=seed,
    )
    forest.fit(train_sample[features], log_target)

    # Undo the log transform: base ** prediction - 1.
    log_prediction = forest.predict(validation_sample[features])
    rf_prob = np.power(log_base, log_prediction) - 1

    print_mape(validation_sample['volume'], rf_prob, 'RF')
    return rf_prob
def test_check_consistent_length():
    """Check accepted and rejected inputs of check_consistent_length.

    Inputs with matching lengths must pass; mismatched lengths must raise
    ValueError, and objects that are not sample containers (an int, a bare
    object, a 0-d array, an estimator) must raise TypeError.
    """
    check_consistent_length([1], [2], [3], [4], [5])
    check_consistent_length([[1, 2], [[1, 2]]], [1, 2], ['a', 'b'])
    check_consistent_length([1], (2,), np.array([3]), sp.csr_matrix((1, 2)))
    assert_raises_regexp(ValueError, 'inconsistent numbers of samples',
                         check_consistent_length, [1, 2], [1])
    # Raw strings: '\w' in a plain literal is an invalid escape sequence.
    # The pattern text itself is unchanged.
    assert_raises_regexp(TypeError, r"got <\w+ 'int'>",
                         check_consistent_length, [1, 2], 1)
    assert_raises_regexp(TypeError, r"got <\w+ 'object'>",
                         check_consistent_length, [1, 2], object())
    assert_raises(TypeError, check_consistent_length, [1, 2], np.array(1))
    # Despite ensembles having __len__ they must raise TypeError
    assert_raises_regexp(TypeError, 'estimator', check_consistent_length,
                         [1, 2], RandomForestRegressor())
    # XXX: We should have a test with a string, but what is correct behaviour?
def try_params( n_iterations, params ):
    """Train and evaluate a forest sized by `n_iterations`.

    The number of trees is ``n_iterations * trees_per_iteration`` rounded
    to the nearest integer; `params` are forwarded to the ``RF``
    constructor. Returns the result of
    ``train_and_eval_sklearn_regressor`` on the module-level ``data``.
    """
    n_estimators = int( round( n_iterations * trees_per_iteration ))
    # Formatted into one string so the output is identical on Python 2 and 3.
    print("n_estimators: {}".format(n_estimators))
    pprint( params )

    clf = RF( n_estimators = n_estimators, verbose = 0, n_jobs = -1, **params )

    return train_and_eval_sklearn_regressor( clf, data )
def __init__(self, task: Task, scorer: Scorer, opt_logger: OptimizationLogger=VoidLogger(None)):
    """Pick the search space and forest type matching the task, then
    delegate to the parent initializer.

    A task whose ``task`` attribute equals "classification" gets the
    classification space and a ``RandomForestClassifier``; any other task
    gets the regression space and a ``RandomForestRegressor``.
    """
    is_classification = task.task == "classification"
    spaces = RandomForestOptimizer.Params
    space = spaces.classification_space if is_classification else spaces.regression_space
    model = (ensemble.RandomForestClassifier() if is_classification
             else ensemble.RandomForestRegressor())
    super().__init__(model, task, space, scorer, opt_logger)
def setClf(self):
    """Store the random forest regressor used by this object in ``self.clf``."""
    # Alternative settings kept from the original code:
    # min_samples_split = 3
    # RandomForestRegressor(n_estimators = 100, max_features = 0.3, min_samples_split =1, verbose=100, n_jobs=-1)
    forest = RandomForestRegressor(n_estimators = 100, max_features = 0.8)
    self.clf = forest