def train_xgboost():
    df = pd.read_csv('survival_data.csv', index_col=0, encoding='UTF-7')
    # Mean-pool the per-slice features of each MRI modality for every training case
    p = np.array([np.mean(np.load('training/%s_flair.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    q = np.array([np.mean(np.load('training/%s_t1.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    r = np.array([np.mean(np.load('training/%s_t1ce.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    s = np.array([np.mean(np.load('training/%s_t2.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    y = np.array([])
    t = 0
    z = np.array([])
    for ind in range(len(folder_names_train)):
        try:
            temp = df.get_value(str(folder_names_train[ind]), 'Survival')
            y = np.append(y, temp)
            temp = df.get_value(str(folder_names_train[ind]), 'Age')
            z = np.append(z, np.array([temp]))
        except Exception as e:
            t += 1
            print(t, str(e), "Label Not found, deleting entry")
            y = np.append(y, 0)  # fall back to 0 when the survival label is missing
    z = np.array([[v] for v in z])
    # Stack the four modality feature blocks and the age column into one design matrix
    t = np.concatenate((p, q), axis=1)
    u = np.concatenate((r, s), axis=1)
    x = np.concatenate((t, u), axis=1)
    #print(x.shape)
    #print(x)
    #print(x.shape, z.shape)
    x = np.concatenate((x, z), axis=1)
    #print(x)
    #clf = linear_model.LogisticRegression(C=1e5)
    #clf = RandomForestRegressor()
    clf = xgb.XGBRegressor()
    clf.fit(x, y)
    return clf
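A hedged sketch of how the returned model might be used on held-out cases, mirroring the feature layout built above; folder_names_test, the 'testing/' paths and the ages argument are assumptions for illustration, not part of the original project.

import numpy as np

def predict_survival(clf, folder_names_test, ages):
    # Rebuild the same design matrix as in training: mean-pooled features for the four
    # modalities, column-stacked, plus age as the final column (all names hypothetical).
    blocks = []
    for modality in ('flair', 't1', 't1ce', 't2'):
        blocks.append(np.array([np.mean(np.load('testing/%s_%s.nii.gz.npy' % (str(id), modality)), axis=0)
                                for id in folder_names_test]))
    x_test = np.concatenate(blocks + [np.array([[a] for a in ages])], axis=1)
    return clf.predict(x_test)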
Python XGBRegressor() example source code
resnet_regressor.py (project: Brain_Tumor_Segmentation, author: KarthikRevanuru)
test_boosted_trees_regression_numeric.py (project: coremltools, author: apple)
def _train_convert_evaluate(self, bt_params={}, **params):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    # Train a model
    xgb_model = xgboost.XGBRegressor(**params)
    xgb_model.fit(self.X, self.target)
    # Convert the model (feature_names can't be given because of XGBoost)
    spec = xgb_converter.convert(xgb_model, self.feature_names, self.output_name, force_32bit_float=False)
    # Get predictions
    df = pd.DataFrame(self.X, columns=self.feature_names)
    df['prediction'] = xgb_model.predict(self.X)
    # Evaluate it
    metrics = evaluate_regressor(spec, df, target='target', verbose=False)
    return metrics
def xgb_model_select(file_name):
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print 'Select Model...'
    start_time = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor()
    parameters = {'n_estimators': [120, 100, 140], 'max_depth': [3, 5, 7, 9]}
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
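GridSearchCV refits the best estimator on the full data by default (refit=True), so the tuned model can be reused directly; a minimal sketch, assuming it is appended at the end of xgb_model_select where grid_search and X are in scope:

    best_clf = grid_search.best_estimator_   # already refitted on X, y by GridSearchCV
    predictions = best_clf.predict(X)        # or predict on a held-out feature matrix
    return best_clf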
def train_model_for_appcounts(df):
    app_df = df[['appCount', 'age', 'gender', 'education', 'marriageStatus', 'haveBaby']]
    known_app = app_df[app_df.appCount.notnull()].as_matrix()
    unknown_app = app_df[app_df.appCount.isnull()].as_matrix()
    y = known_app[:, 0]
    X = known_app[:, 1:]
    print 'Train Xgboost Model(For Missing AppCount)...'
    start_time = datetime.datetime.now()
    xgb_reg = XGBRegressor(n_estimators=100, max_depth=3)
    xgb_reg.fit(X, y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: %d' % ((end_time - start_time).seconds)
    predicted_app = xgb_reg.predict(unknown_app[:, 1:])
    df.loc[(df.appCount.isnull()), 'appCount'] = predicted_app
    return df, xgb_reg
def train_model_for_age(df):
    age_df = df[['age', 'appCount', 'gender', 'education', 'marriageStatus', 'haveBaby']]
    known_age = age_df[age_df.age != 0].as_matrix()
    unknown_age = age_df[age_df.age == 0].as_matrix()
    y = known_age[:, 0]
    X = known_age[:, 1:]
    print 'Train Xgboost Model(For Missing Age)...'
    start_time = datetime.datetime.now()
    xgb_reg = XGBRegressor(n_estimators=100, max_depth=3)
    xgb_reg.fit(X, y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: %d' % ((end_time - start_time).seconds)
    predicted_age = xgb_reg.predict(unknown_age[:, 1:])
    df.loc[(df.age == 0), 'age'] = predicted_age
    return df, xgb_reg
def generate_XGB_model(train_df):
    train_df.drop(['conversionTime'], axis=1, inplace=True)
    print 'Train And Fix Missing App Count Value...'
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    '''print 'Train And Fix Missing Age Value...'
    train_df, xgb_age = train_model_for_age(train_df)
    joblib.dump(xgb_age, 'XGB_age.model')'''
    train_df.drop(['marriageStatus', 'haveBaby', 'sitesetID', 'positionType'], axis=1, inplace=True)
    print 'Done'
    print train_df.info()
    print train_df.describe()
    print train_df.isnull().sum()
    train_np = train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print 'Train Xgboost Model...'
    start_time = datetime.datetime.now()
    xgb_clf = XGBRegressor(n_estimators=100, max_depth=6, objective="binary:logistic", silent=False)
    xgb_clf.fit(X, y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: %d' % ((end_time - start_time).seconds)
    model_df = pd.DataFrame({'columns': list(train_df.columns)[1:], 'values': xgb_clf.feature_importances_})
    print model_df
    return xgb_clf
def train_xgboost():
    df = pd.read_csv('data/stage1_labels.csv')
    print(df.head())
    x = np.array([np.mean(np.load('npy_result/%s.npy' % str(id)), axis=0) for id in df['id'].tolist()])
    y = df['cancer'].as_matrix()
    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y,
                                                                   test_size=0.20)
    clf = xgb.XGBRegressor(max_depth=10,
                           n_estimators=1500,
                           min_child_weight=9,
                           learning_rate=0.05,
                           nthread=8,
                           subsample=0.80,
                           colsample_bytree=0.80,
                           seed=4242)
    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='logloss', early_stopping_rounds=50)
    return clf
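Because fit uses early_stopping_rounds, older xgboost sklearn wrappers record the best round on the estimator; a hedged prediction sketch (tst_x is a hypothetical held-out feature array):

clf = train_xgboost()
# ntree_limit=0 means "use all trees"; best_ntree_limit is only set when early stopping triggered.
preds = clf.predict(tst_x, ntree_limit=getattr(clf, 'best_ntree_limit', 0))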
def try_params(n_iterations, params, get_predictions=False):
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print "n_estimators:", n_estimators
    pprint(params)
    model = XGB(n_estimators=n_estimators, nthread=-1, **params)
    return train_and_eval_sklearn_regressor(model, data)
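try_params follows the hyperband convention of scaling the tree count with the iteration budget; a small invocation sketch with an illustrative sampled configuration (trees_per_iteration, data and train_and_eval_sklearn_regressor are module-level objects the original assumes):

sampled = {'max_depth': 6, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8}
result = try_params(3, sampled)   # budget of 3 iterations -> 3 * trees_per_iteration estimators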
def print_results(self, model_name):
    if self.ml_for_analytics and model_name in ('LogisticRegression', 'RidgeClassifier', 'LinearRegression', 'Ridge'):
        self._print_ml_analytics_results_linear_model()
    elif self.ml_for_analytics and model_name in ['RandomForestClassifier', 'RandomForestRegressor', 'XGBClassifier', 'XGBRegressor', 'GradientBoostingRegressor', 'GradientBoostingClassifier', 'LGBMRegressor', 'LGBMClassifier']:
        self._print_ml_analytics_results_random_forest()
def _get_xgb_feat_importances(self, clf):
    try:
        # Handles case when clf has been created by calling
        # xgb.XGBClassifier.fit() or xgb.XGBRegressor().fit()
        fscore = clf.booster().get_fscore()
    except:
        # Handles case when clf has been created by calling xgb.train.
        # Thus, clf is an instance of xgb.Booster.
        fscore = clf.get_fscore()
    trained_feature_names = self._get_trained_feature_names()
    feat_importances = []
    # Somewhat annoying. XGBoost only returns importances for the features it finds useful.
    # So we have to recover each feature's index from its "feature name" by stripping the
    # leading "f"; the rest of the string is the index of that feature.
    fscore_list = [[int(k[1:]), v] for k, v in fscore.items()]
    feature_infos = []
    sum_of_all_feature_importances = 0.0
    for idx_and_result in fscore_list:
        idx = idx_and_result[0]
        # Use the index that we grabbed above to find the human-readable feature name
        feature_name = trained_feature_names[idx]
        feat_importance = idx_and_result[1]
        # Summing all importances and dividing each one by that sum gives every feature its
        # relative importance, so the values sum to 1, just as in scikit-learn.
        sum_of_all_feature_importances += feat_importance
        feature_infos.append([feature_name, feat_importance])
    sorted_feature_infos = sorted(feature_infos, key=lambda x: x[1])
    print('Here are the feature_importances from the tree-based model:')
    print('The printed list will only contain at most the top 50 features.')
    for feature in sorted_feature_infos[-50:]:
        print(str(feature[0]) + ': ' + str(round(feature[1] / sum_of_all_feature_importances, 4)))
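The index mapping and normalisation above can be shown in isolation; a self-contained sketch with an illustrative fscore dictionary (feature names and counts are made up):

fscore = {'f0': 12, 'f3': 4, 'f1': 8}                      # 'f<idx>': number of splits on that feature
trained_feature_names = ['age', 'income', 'tenure', 'region']
pairs = [(trained_feature_names[int(k[1:])], v) for k, v in fscore.items()]
total = float(sum(v for _, v in pairs))
for name, count in sorted(pairs, key=lambda kv: kv[1]):
    print(name + ': ' + str(round(count / total, 4)))      # relative importances sum to 1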
def _print_ml_analytics_results_random_forest(self):
    try:
        final_model_obj = self.trained_final_model.named_steps['final_model']
    except:
        final_model_obj = self.trained_final_model
    print('\n\nHere are the results from our ' + final_model_obj.model_name)
    if self.name is not None:
        print(self.name)
    print('predicting ' + self.output_column)
    # XGB's Classifier has a proper .feature_importances_ property, while the XGBRegressor does not.
    if final_model_obj.model_name in ['XGBRegressor', 'XGBClassifier']:
        self._get_xgb_feat_importances(final_model_obj.model)
    else:
        trained_feature_names = self._get_trained_feature_names()
        try:
            trained_feature_importances = final_model_obj.model.feature_importances_
        except AttributeError as e:
            # There was a version of LightGBM that had this misnamed to miss the "s" at the end
            trained_feature_importances = final_model_obj.model.feature_importance_
        feature_infos = zip(trained_feature_names, trained_feature_importances)
        sorted_feature_infos = sorted(feature_infos, key=lambda x: x[1])
        print('Here are the feature_importances from the tree-based model:')
        print('The printed list will only contain at most the top 50 features.')
        for feature in sorted_feature_infos[-50:]:
            print(feature[0] + ': ' + str(round(feature[1], 4)))
def setClf(self):
    self.clf = XGBRegressor(max_depth=7, learning_rate=0.01, n_estimators=100)
    return
def create_features(user_id, is_exp,
                    feature_cloumn_func=lambda day: get_feature_cloumn(None, day, has_user_type=False),
                    load_exp_func=load_user_exp_model,
                    load_func=load_user_model,
                    is_exp_power=False):
    print user_id
    dataset = get_month_by_id(user_id)
    result = []
    for day in range(1, 32):
        feature_column = feature_cloumn_func(day)
        x_ = dataset[feature_column]
        trainer = xgb.XGBRegressor()
        if is_exp:
            if is_exp_power:
                x_ = exp_power(x_)
            load_exp_func(trainer, day, user_id)
        else:
            load_func(trainer, day, user_id)
        y_p = trainer.predict(x_)
        y_p = pd.Series(y_p, name='y_p#%d' % (day - 1))
        if not is_exp:
            y_p = np.exp(y_p)
        result.append(y_p)
    result = pd.DataFrame(result).T
    result.index = dataset.index
    for day in range(31):
        result['real#%d' % day] = dataset['y#%d' % day].apply(np.exp)
    sys.stdout.flush()
    return result
def train_xgboost_regressor():
    return mp.ModelProperties(regression=True), xgboost.XGBRegressor()
def apply_filler(self, x_train, y_train, x_test):
    model = xgboost.XGBRegressor()
    model = model.fit(x_train, y_train)
    return model.predict(x_test)
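apply_filler is a generic train-then-predict imputation helper; a hedged end-to-end sketch of the same idea on a toy frame (the column names and values are made up):

import numpy as np
import pandas as pd
import xgboost

df = pd.DataFrame({'age': [25, 32, 47, 51], 'tenure': [1, 4, 9, 12],
                   'income': [30.0, np.nan, 62.0, np.nan]})
known = df[df.income.notnull()]
unknown = df[df.income.isnull()]
# Mirrors apply_filler: fit on rows where the target column is known, predict the missing rows.
model = xgboost.XGBRegressor().fit(known[['age', 'tenure']].values, known['income'].values)
df.loc[df.income.isnull(), 'income'] = model.predict(unknown[['age', 'tenure']].values)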
def test_unsupported_conversion(self):
    feature_names = self.scikit_data.feature_names
    output_name = 'target'
    xgb_model = xgboost.XGBRegressor(objective='reg:gamma')
    xgb_model.fit(self.scikit_data.data, self.scikit_data.target)
    with self.assertRaises(ValueError):
        spec = xgb_converter.convert(xgb_model, feature_names, 'target')
    xgb_model = xgboost.XGBRegressor(objective='reg:tweedie')
    xgb_model.fit(self.scikit_data.data, self.scikit_data.target)
    with self.assertRaises(ValueError):
        spec = xgb_converter.convert(xgb_model, feature_names, 'target')
def test():
    iris = load_iris()
    xgb_model = xgb.XGBRegressor(n_estimators=300000, max_depth=2)
    xgb_model.fit(iris.data[:120], iris.target[:120])
    predict = xgb_model.predict(iris.data[:120])
    print mean_squared_error(iris.target[:120], predict)
    pred = xgb_model.predict(iris.data[120:])
    print mean_squared_error(iris.target[120:], pred)
def regression_with_xgboost(x_train, y_train, X_test, Y_test, features=None, use_cv=True, use_sklean=False, xgb_params=None):
    train_data = xgb.DMatrix(x_train, label=y_train, missing=float('nan'))
    test_data = xgb.DMatrix(X_test, Y_test, missing=float('nan'))
    evallist = [(test_data, 'eval'), (train_data, 'train')]
    #if xgb_params == None:
    #    xgb_params = get_default_xgboost_params()
    if not use_cv:
        num_rounds = 10
    else:
        cvresult = xgb.cv(xgb_params, train_data, num_boost_round=100, nfold=5,
                          metrics={'rmse'}, show_progress=True)
        print cvresult
        num_rounds = len(cvresult)
    gbdt = None
    if use_sklean:
        #gbdt = xgboost.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='reg:linear', nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=0, missing=None)
        xgb_params['n_estimators'] = num_rounds
        gbdt = xgboost.XGBRegressor(**xgb_params)  # unpack the params dict rather than passing it positionally
        gbdt.fit(x_train, y_train)
        y_pred = gbdt.predict(X_test)
        return gbdt, y_pred
    else:
        #gbdt = xgb.train(xgb_params, train_data, num_rounds, evallist, verbose_eval=True, early_stopping_rounds=5)
        gbdt = xgb.train(xgb_params, train_data, num_rounds, evallist, verbose_eval=True)
        ceate_feature_map_for_feature_importance(features)
        show_feature_importance(gbdt, feature_names=features)
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float("nan")))
        return XGBoostModel(gbdt), y_pred
def train_xgboost():
    df = pd.read_csv('data/stage1_labels.csv')
    # print df.head()
    x = []
    y = []
    did = df['id'].tolist()
    cancer = df['cancer'].tolist()
    for i in range(len(df)):
        if os.path.isfile('data/stage1/%s.npy' % did[i]):
            f = np.load('data/stage1/%s.npy' % did[i])
            f = f.reshape(f.shape[0], 2048)
            x.append(np.mean(f, axis=0))
            y.append(cancer[i])
    x = np.array(x)
    print x.shape
    y = np.array(y)
    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=822, stratify=y, test_size=0.1)
    clfs = []
    for s in range(5):
        # Some parameters were taken from discussion.
        clf = xgb.XGBRegressor(n_estimators=1000, max_depth=10, min_child_weight=10,
                               learning_rate=0.01, subsample=0.80, colsample_bytree=0.70,
                               seed=822 + s, reg_alpha=0.1)
        clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='logloss', early_stopping_rounds=100)
        clfs.append(clf)
    return clfs
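The function returns five regressors that differ only in their seed; a common follow-up (an assumption here, not shown in the original) is to average their predictions on a held-out matrix tst_x:

clfs = train_xgboost()
pred = np.mean([clf.predict(tst_x) for clf in clfs], axis=0)   # tst_x: hypothetical test features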
def xgbr(X, y):
    X_train, X_validation, y_train, y_validation = train_test_split(X, y, random_state=0)
    xgbr_boost = xgb.XGBRegressor(seed=1)
    xgbr_boost.fit(X_train, y_train.ravel())
    print 'training error:', 1.0 - xgbr_boost.score(X_train, y_train)
    print 'validation error:', 1.0 - xgbr_boost.score(X_validation, y_validation)
    time_fit(xgbr_boost, X_train, y_train.ravel())
def __init__(self):
    self.name = "onegbm"
    self.m = Pipeline([
        ("drop", FeatureRemover(["UPBS", "UPBE", "SCMN", "earthmars_km", "OCC_MARS_200KM_START_", "sa_monthly"])),
        ("gbm", xgboost.XGBRegressor(max_depth=7, n_estimators=1000, learning_rate=0.05, silent=1, seed=42))
    ])
def models():
    params = {'n_jobs': nthread, 'random_state': seed, 'class_weight': None}
    # extra = ensemble.ExtraTreesClassifier(n_estimators=1000, max_features='auto', criterion='entropy', min_samples_split=2, max_depth=None, min_samples_leaf=1, **params)
    # extra1 = ensemble.ExtraTreesClassifier(n_estimators=1000, max_features=60, criterion='gini', min_samples_split=4, max_depth=40, min_samples_leaf=2, **params)
    # rf = ensemble.RandomForestClassifier(n_estimators=1000, max_features='auto', criterion='gini', min_samples_split=2, max_depth=None, min_samples_leaf=1, **params)
    # rf1 = ensemble.RandomForestClassifier(n_estimators=1000, max_features=60, criterion='entropy', min_samples_split=4, max_depth=40, min_samples_leaf=2, **params)
    # xgb_binlog = XGBClassifier(objective="binary:logistic", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_reglog = XGBClassifier(objective="reg:logistic", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_poi = XGBClassifier(objective="count:poisson", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_reglin = XGBClassifier(objective="reg:linear", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    rf_params = {'n_estimators': 850, 'max_features': 60, 'criterion': 'entropy', 'min_samples_split': 4, 'max_depth': 40, 'min_samples_leaf': 2, 'n_jobs': -1}
    clfs = [
        # (D1, XGBRegressor(objective="reg:linear", max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        (D1, XGBClassifier(objective="binary:logistic", max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBClassifier(objective="binary:logistic", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBClassifier(objective="binary:logistic", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
    ]
    for clf in clfs:
        yield clf
def test_regressor(loop):  # noqa
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop):
            a = dxgb.XGBRegressor()
            X2 = da.from_array(X, 5)
            y2 = da.from_array(y, 5)
            a.fit(X2, y2)
            p1 = a.predict(X2)
    b = xgb.XGBRegressor()
    b.fit(X, y)
    assert_eq(p1, b.predict(X))
def make_model(params):
    return xgb.XGBRegressor(**params)
def xgb_Fit(knownX, knownY, preX):
    xlf = xgb.XGBRegressor(max_depth=11,
                           learning_rate=0.01,
                           n_estimators=301,
                           silent=True,
                           objective=mape,
                           gamma=0,
                           min_child_weight=5,
                           max_delta_step=0,
                           subsample=0.8,
                           colsample_bytree=0.8,
                           colsample_bylevel=1,
                           reg_alpha=1e0,
                           reg_lambda=0,
                           scale_pos_weight=1,
                           seed=9,
                           missing=None)
    x_train, x_test, y_train, y_test = train_test_split(knownX, knownY, test_size=0.5, random_state=1)
    for i in range(y_train.shape[1]):
        xlf.fit(x_train, y_train[:, i].reshape(-1, 1), eval_metric=mape, verbose=False)
        # eval_set=[(x_test, y_test[:, i].reshape(-1, 1))], early_stopping_rounds=2)
        tempPre = xlf.predict(preX).reshape(-1, 1)
        if i == 0:
            Y_pre = tempPre
        else:
            Y_pre = np.c_[Y_pre, tempPre]
    Y_pre = Y_pre.reshape(-1, 1)
    return Y_pre
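The call above passes a custom `mape` callable as both the objective and the eval metric; in the old sklearn wrapper these have different contracts (the objective returns per-row gradient and hessian, the eval metric returns a name/value pair). A hedged sketch of what such a pair could look like; the smoothing constants and the constant hessian are assumptions, not the original implementation:

import numpy as np

def mape(y_true, y_pred):
    # Custom objective (sketch): gradient and hessian of a MAPE-like loss, smoothed near y_true == 0.
    denom = np.abs(y_true) + 1e-6
    grad = np.sign(y_pred - y_true) / denom
    hess = np.full(np.shape(y_true), 1e-2)   # crude constant hessian (assumption)
    return grad, hess

def mape_eval(y_pred, dtrain):
    # Custom eval metric (sketch): called with predictions and a DMatrix, returns (name, value).
    y_true = dtrain.get_label()
    return 'mape', float(np.mean(np.abs((y_true - y_pred) / (np.abs(y_true) + 1e-6))))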
# grid search on the model
def xgb_Fit(knownX, knownY, preX):
    xlf = xgb.XGBRegressor(max_depth=7,  # 11
                           learning_rate=0.06,  # 0.01
                           n_estimators=1000,
                           silent=True,
                           objective=mapeobj,
                           gamma=0,
                           min_child_weight=5,
                           max_delta_step=0,
                           subsample=1,  # 0.8
                           colsample_bytree=0.8,
                           colsample_bylevel=1,
                           reg_alpha=1e0,
                           reg_lambda=0,
                           scale_pos_weight=1,
                           seed=1850,
                           missing=None)
    x_train, x_test, y_train, y_test = train_test_split(knownX, knownY, test_size=0.5, random_state=1)
    for i in range(y_train.shape[1]):
        xlf.fit(x_train, y_train[:, i].reshape(-1, 1))
        # print('Training Error: {:.3f}'.format(1 - xlf.score(x_train, y_train[:, i].reshape(-1, 1))))
        # print('Validation Error: {:.3f}'.format(1 - xlf.score(x_test, y_test[:, i].reshape(-1, 1))))
        # predict value for output
        tempPre = xlf.predict(preX).reshape(-1, 1)
        if i == 0:
            Y_pre = tempPre
        else:
            Y_pre = np.c_[Y_pre, tempPre]
    Y_pre = Y_pre.reshape(-1, 1)
    return Y_pre
# sklearn SVR regression
def svr_main(X, Y):
    X_train = X[:TRAIN_SIZE]
    Y_train = Y[:TRAIN_SIZE]
    X_test = X[TRAIN_SIZE:]
    Y_test = Y[TRAIN_SIZE:]
    clf = SVR(kernel='rbf', C=1e3, gamma=0.00001)
    #clf.fit(X_train, Y_train)
    #y_pred = clf.predict(X_test)
    #plt.plot(X_test, y_pred, linestyle='-', color='red')
    #clf = GradientBoostingRegressor(n_estimators=100, max_depth=1)
    #clf = DecisionTreeRegressor(max_depth=25)
    #clf = ExtraTreesRegressor(n_estimators=2000, max_depth=14)
    #clf = xgb.XGBRegressor(n_estimators=2000, max_depth=25)
    #clf = RandomForestRegressor(n_estimators=1000, max_depth=26, n_jobs=7)
    predict_list = []
    for i in xrange(TEST_SIZE):
        X = [[x] for x in xrange(i, TRAIN_SIZE + i)]
        clf.fit(X, Y[i:TRAIN_SIZE + i])
        y_pred = clf.predict([[TRAIN_SIZE + 1 + i]])  # single sample with one feature
        predict_list.append(y_pred)
    print "mean_squared_error:%s" % mean_squared_error(Y_test, predict_list)
    print "sqrt of mean_squared_error:%s" % np.sqrt(mean_squared_error(Y_test, predict_list))
    origin_data = Y_test
    print "origin data:%s" % origin_data
    plt.plot([x for x in xrange(TRAIN_SIZE + 1, TRAIN_SIZE + TEST_SIZE + 1)], predict_list, linestyle='-', color='red', label='prediction model')
    plt.plot(X_test, Y_test, linestyle='-', color='blue', label='actual model')
    plt.legend(loc=1, prop={'size': 12})
    plt.show()
xgradient_boosting.py (project: AutoML-Challenge, author: postech-mlg-exbrain)
def fit(self, X, y, refit=False):
    import xgboost as xgb
    self.learning_rate = float(self.learning_rate)
    self.n_estimators = int(self.n_estimators)
    self.subsample = float(self.subsample)
    self.max_depth = int(self.max_depth)
    # (TODO) Gb used at most half of the features, here we use all
    self.colsample_bylevel = float(self.colsample_bylevel)
    self.colsample_bytree = float(self.colsample_bytree)
    self.gamma = float(self.gamma)
    self.min_child_weight = int(self.min_child_weight)
    self.max_delta_step = int(self.max_delta_step)
    self.reg_alpha = float(self.reg_alpha)
    self.reg_lambda = float(self.reg_lambda)
    self.nthread = int(self.nthread)
    self.base_score = float(self.base_score)
    self.scale_pos_weight = float(self.scale_pos_weight)
    self.objective = 'reg:linear'
    self.estimator = xgb.XGBRegressor(
        max_depth=self.max_depth,
        learning_rate=self.learning_rate,
        n_estimators=self.n_estimators,
        silent=self.silent,
        objective=self.objective,
        nthread=self.nthread,
        gamma=self.gamma,
        scale_pos_weight=self.scale_pos_weight,
        min_child_weight=self.min_child_weight,
        max_delta_step=self.max_delta_step,
        subsample=self.subsample,
        colsample_bytree=self.colsample_bytree,
        colsample_bylevel=self.colsample_bylevel,
        reg_alpha=self.reg_alpha,
        reg_lambda=self.reg_lambda,
        base_score=self.base_score,
        seed=self.seed
    )
    self.estimator.fit(X, y)
    return self
def models():
    extra_params_kaggle_cla = {'n_estimators': 1200, 'max_features': 30, 'criterion': 'entropy',
                               'min_samples_leaf': 2, 'min_samples_split': 2, 'max_depth': 30,
                               'n_jobs': nthread, 'random_state': seed}
    extra_params_kaggle_reg = {'n_estimators': 1200, 'max_features': 30, 'criterion': 'mse',
                               'min_samples_leaf': 2, 'min_samples_split': 2, 'max_depth': 30,
                               'n_jobs': nthread, 'random_state': seed}
    xgb_reg = {'objective': 'reg:linear', 'max_depth': 11, 'learning_rate': 0.01, 'subsample': .9,
               'n_estimators': 10000, 'colsample_bytree': 0.45, 'nthread': nthread, 'seed': seed}
    xgb_cla = {'objective': 'binary:logistic', 'max_depth': 11, 'learning_rate': 0.01, 'subsample': .9,
               'n_estimators': 10000, 'colsample_bytree': 0.45, 'nthread': nthread, 'seed': seed}
    # NN params
    nb_epoch = 3
    batch_size = 128
    esr = 402
    param1 = {
        'hidden_units': (256, 256),
        'activation': (advanced_activations.PReLU(), advanced_activations.PReLU(), core.activations.sigmoid),
        'dropout': (0., 0.), 'optimizer': RMSprop(), 'nb_epoch': nb_epoch,
    }
    param2 = {
        'hidden_units': (1024, 1024),
        'activation': (advanced_activations.PReLU(), advanced_activations.PReLU(), core.activations.sigmoid),
        'dropout': (0., 0.), 'optimizer': RMSprop(), 'nb_epoch': nb_epoch,
    }
    clfs = [
        (D2, XGBClassifier(**xgb_cla)),
        (D11, XGBClassifier(**xgb_cla)),
        (D2, XGBRegressor(**xgb_reg)),
        (D11, XGBRegressor(**xgb_reg)),
        (D2, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
        (D11, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
        (D2, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),
        (D11, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),
        # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
        # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
        # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
        #
        # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param2)),
        # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param2)),
        # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param2))
    ]
    for clf in clfs:
        yield clf
def tune_xgb_params_segment_by_grid(estimator_cls: Type[Union[xgb.XGBClassifier, xgb.XGBRegressor]],
                                    label: np.ndarray,
                                    metric_sklearn: str,
                                    n_jobs: int,
                                    param_grid: dict,
                                    params: dict,
                                    strat_folds: StratifiedKFold,
                                    train: np.ndarray,
                                    verbosity_level: int = 10) -> Tuple[dict, float]:
    """
    Grid search over a segment of XGBoost parameters.

    :param estimator_cls:
        The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's GridSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html
        for the options this can take - e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs:
        The number of jobs to run simultaneously.
    :param param_grid:
        A dictionary of the grid of parameters to be searched over - e.g.
        {'colsample_bytree': np.arange(0.5, 0.9, 0.1)} to search the values [0.5, 0.6, 0.7, 0.8].
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A StratifiedKFold object to cross validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :param verbosity_level:
        An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :return:
        A tuple of the best values found for the searched parameters and the corresponding best score.
    """
    params_copy = clean_params_for_sk(params)
    grid = GridSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_jobs=n_jobs,
        param_grid=param_grid,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    grid.fit(train, label)
    best_score = grid.best_score_
    # Massage the score to be in line with what xgboost reports
    if metric_sklearn == 'neg_mean_squared_error':
        best_score = abs(best_score) ** 0.5
    elif metric_sklearn == 'neg_log_loss':
        best_score = abs(best_score)
    return {k: grid.best_params_[k] for k in param_grid.keys()}, best_score
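A hedged usage sketch of the segment search above; the data, parameter values and fold setup are illustrative assumptions, and the behaviour of clean_params_for_sk on these keys is assumed to be a pass-through of sklearn-style names.

import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

train = np.random.RandomState(0).rand(200, 10)        # illustrative feature matrix
label = np.random.RandomState(1).randint(0, 2, 200)   # illustrative binary labels
params = {'max_depth': 6, 'learning_rate': 0.05, 'n_estimators': 200}
param_grid = {'colsample_bytree': list(np.arange(0.5, 0.9, 0.1)),
              'subsample': [0.7, 0.8, 0.9]}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
best_params, best_score = tune_xgb_params_segment_by_grid(
    estimator_cls=xgb.XGBClassifier, label=label, metric_sklearn='neg_log_loss',
    n_jobs=4, param_grid=param_grid, params=params, strat_folds=folds, train=train,
    verbosity_level=1)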