def xgb_model(train_data, train_label, test_data, test_label):
    clf = xgb.XGBClassifier(max_depth=7,
                            min_child_weight=1,
                            learning_rate=0.1,
                            n_estimators=500,
                            silent=True,
                            objective='binary:logistic',
                            gamma=0,
                            max_delta_step=0,
                            subsample=1,
                            colsample_bytree=1,
                            colsample_bylevel=1,
                            reg_alpha=0,
                            reg_lambda=0,
                            scale_pos_weight=1,
                            seed=1,
                            missing=None)
    clf.fit(train_data, train_label, eval_metric='auc', verbose=True,
            eval_set=[(test_data, test_label)], early_stopping_rounds=100)
    y_pre = clf.predict(test_data)
    y_pro = clf.predict_proba(test_data)[:, 1]
    # print "AUC Score : %f" % metrics.roc_auc_score(test_label, y_pro)
    # print "Accuracy : %.4g" % metrics.accuracy_score(test_label, y_pre)
    return clf
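A minimal usage sketch of the function above, assuming xgboost is imported as xgb, an xgboost release compatible with the keyword arguments used above (older 0.x/1.x APIs), and sklearn available; the toy data and names below are illustrative only:

import numpy as np
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import train_test_split

X = np.random.rand(200, 8)              # toy features, illustrative only
y = np.random.randint(0, 2, 200)        # toy binary labels
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=1)
clf = xgb_model(X_tr, y_tr, X_te, y_te)
print("AUC: %.4f" % metrics.roc_auc_score(y_te, clf.predict_proba(X_te)[:, 1]))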
Example source code for the Python class XGBClassifier()
def __init__(self, a_clf=None, a_grid_search=False):
    """Class constructor.

    Args:
      a_clf (classifier or None): classifier to use or None for default
      a_grid_search (bool): use grid search for estimating hyper-parameters

    """
    classifier = a_clf
    self._gs = a_grid_search
    if a_clf is None:
        classifier = XGBClassifier(max_depth=MAX_DEPTH,
                                   n_estimators=NTREES,
                                   learning_rate=ALPHA,
                                   objective="multi:softprob")
    self._clf = classifier
    # latest version of XGBoost cannot deal with non-sparse feature vectors
    self._model = Pipeline([("vect", DictVectorizer()),
                            ("clf", classifier)])
def threshold_estimate(x, y):
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(x, y, test_size=0.1, random_state=0)
    weight = float(len(y_train[y_train == 0])) / float(len(y_train[y_train == 1]))
    # float dtype so the fractional class weight is not truncated to an integer
    w1 = np.ones(y_train.shape[0], dtype=float)
    w1[y_train == 1] = weight
    print("samples: %d %d %f" % (x_train.shape[0], x_test.shape[0], weight))
    estimator = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000, nthread=50)
    estimator.fit(x_train, y_train, sample_weight=w1)
    y_scores = estimator.predict_proba(x_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
    f1 = 2 * precision[2:] * recall[2:] / (precision[2:] + recall[2:])
    m_idx = np.argmax(f1)
    m_thresh = thresholds[2 + m_idx]
    print("%d %f %f" % (precision.shape[0], f1[m_idx], m_thresh))
    return m_thresh
# Estimate threshold for the classifier using inner-round cross validation
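A hedged sketch of how the estimated threshold might then be applied to new data, assuming the older sklearn/xgboost APIs used above are available; the arrays below are toy placeholders:

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(0)
x_known = rng.rand(300, 6)                       # toy labelled data, illustrative only
y_known = (rng.rand(300) < 0.3).astype(int)      # imbalanced toy labels
x_new = rng.rand(20, 6)                          # unseen samples

thresh = threshold_estimate(x_known, y_known)
clf = xgb.XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=100)
clf.fit(x_known, y_known)
y_hat = (clf.predict_proba(x_new)[:, 1] > thresh).astype(int)   # apply the tuned cutoff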
def test_model_detection(self):
    sklearn_model = LogisticRegression()
    pipeline_model = Pipeline([('log', sklearn_model)])
    xgb_model = XGBClassifier()
    nn_model = NNModel(100, 10)
    sklearn_opt = Optimizer(sklearn_model, [], lambda x: x)
    pipeline_opt = Optimizer(pipeline_model, [], lambda x: x)
    xgb_opt = Optimizer(xgb_model, [], lambda x: x)
    nn_opt = Optimizer(nn_model, [], lambda x: x)
    self.assertEqual(sklearn_opt.model_module, 'sklearn')
    self.assertEqual(pipeline_opt.model_module, 'pipeline')
    self.assertEqual(xgb_opt.model_module, 'xgboost')
    self.assertEqual(nn_opt.model_module, 'keras')
def objective(space):
    estimator = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        gamma=space['gamma'],
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree']
    )
    estimator.fit(
        x_train,
        y_train,
        eval_set=[(x_train, y_train), (x_val, y_val)],
        early_stopping_rounds=30,
        verbose=False,
        eval_metric='error'
    )
    score = accuracy_score(y_val, estimator.predict(x_val))
    return {'loss': 1 - score, 'status': STATUS_OK}
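A minimal sketch of driving this objective with hyperopt, assuming hyperopt is installed and that n_estimators, x_train, y_train, x_val and y_val are already defined at module level as the function above expects; the search ranges below are illustrative, not from the source:

from hyperopt import Trials, fmin, hp, tpe

space = {
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
    'gamma': hp.uniform('gamma', 0.0, 0.5),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
}
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials)
print(best)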
def tune_xgb_cv(params_untuned, scoring='roc_auc', n_jobs=4, cv=5):
    # global dtrain_whole
    global num_boost_round
    global params_sklearn
    # global x
    # global y
    for param_untuned in params_untuned:
        print '========== ', param_untuned, ' =============='
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring, n_jobs=n_jobs, cv=cv, verbose=10)
        grid_search.fit(x, y)
        df0 = pd.DataFrame(grid_search.cv_results_)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        # print df0
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        # print grid_search.cv_results_
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
        if len(params_untuned) == 1:
            return v
def tune_xgb_cv(params_untuned, scoring='roc_auc', n_jobs=1, cv=5):
    # global dtrain_whole
    global num_boost_round
    global params_sklearn
    # global x
    # global y
    for param_untuned in params_untuned:
        print '========== ', param_untuned, ' =============='
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring, n_jobs=n_jobs, cv=cv, verbose=10)
        grid_search.fit(x, y)
        df0 = pd.DataFrame(grid_search.cv_results_)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        # print df0
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        # print grid_search.cv_results_
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
def tune_xgb_cv(params_untuned, params_sklearn, scoring='roc_auc', n_jobs=4, cv=5, verbose=10):
    for param_untuned in params_untuned:
        print '========== ', param_untuned, ' =============='
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        # if(param_untuned.keys()[0] == 'n_estimators'):
        #     cv = 1
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring, n_jobs=n_jobs, cv=cv, verbose=verbose)
        grid_search.fit(x, y)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
    return estimator, params_sklearn
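A hedged sketch of driving this tuner with a sequence of one-parameter grids; the parameter values below are illustrative, and x, y and print_params are assumed to be defined at module level as the function above expects:

params = {'learning_rate': 0.1, 'n_estimators': 200, 'objective': 'binary:logistic', 'seed': 1}
grids = [
    {'max_depth': [3, 5, 7, 9]},
    {'min_child_weight': [1, 3, 5]},
    {'subsample': [0.7, 0.8, 0.9, 1.0]},
]
# each grid is tuned in turn and the best value is folded back into params
estimator, params = tune_xgb_cv(grids, params, scoring='roc_auc', cv=5)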
def prec_xgb(n_trees, max_depth, X_train, y_train, X_test, y_test, learning_rate=0.1):
    """
    Train an XGBoost classifier and report its test-set precision.
    """
    import xgboost as xgb
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = xgb.XGBClassifier(n_estimators=n_trees, max_depth=max_depth, objective='multi:softprob',
                            seed=0, silent=True, nthread=-1, learning_rate=learning_rate)
    eval_set = [(X_test, y_test)]
    clf.fit(X_train, y_train, eval_set=eval_set, eval_metric="merror")
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_xgb_{}={:.6f}%'.format(n_trees, prec * 100.0))
    return clf, y_pred
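A hedged usage sketch with toy multi-class data; shapes and values are illustrative only, and the module-level LOGGER used above is assumed to be configured:

import numpy as np

X_tr = np.random.rand(120, 4, 4)        # prec_xgb flattens the trailing dimensions itself
y_tr = np.random.randint(0, 3, 120)
X_te = np.random.rand(30, 4, 4)
y_te = np.random.randint(0, 3, 30)
clf, y_pred = prec_xgb(50, 4, X_tr, y_tr, X_te, y_te)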
def get_classifier(method='logistic_regression'):
    if 'logistic_regression' == method:
        return LogisticRegression(C=1e3,
                                  tol=0.01,
                                  multi_class='ovr',
                                  solver='liblinear',
                                  n_jobs=-1,
                                  random_state=123)
    if 'random_forest' == method:
        return RandomForestClassifier(n_estimators=250,
                                      bootstrap=False,
                                      n_jobs=-1,
                                      random_state=123)
    if 'gradient_boosting' == method:
        return xgb.XGBClassifier(max_depth=10,
                                 subsample=0.7,
                                 n_estimators=500,
                                 min_child_weight=0.05,
                                 colsample_bytree=0.3,
                                 learning_rate=0.1)
def try_params(n_iterations, params, get_predictions=False):
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print "n_estimators:", n_estimators
    pprint(params)
    clf = XGB(n_estimators=n_estimators, nthread=-1, **params)
    return train_and_eval_sklearn_classifier(clf, data)
def test_build_new_model_xgboost(self):
    xgb_model = XGBClassifier(max_depth=3)
    xgb_opt = Optimizer(xgb_model, [], lambda x: x)
    new_model = xgb_opt.build_new_model({'max_depth': 2})
    self.assertEqual(new_model.get_params()['max_depth'], 2)
def XGBoost(X, y):
    print("Starting XGBoost training")
    start_time = time.time()
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.1, random_state=1)
    clf = xgb.XGBClassifier(learning_rate=0.15, n_estimators=170, nthread=6, max_depth=8, seed=0, silent=True,
                            subsample=0.85, colsample_bytree=0.85)
    # fit on the training split so the held-out score is not computed on data the model has seen
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("XGBoost score: ", score, "(", (time.time() - start_time) / 60.0, "minutes )")
    return clf
def __init__(self, name, kwargs):
    import xgboost as xgb
    kwargs = kwargs.copy()
    if "random_state" in kwargs:
        kwargs["seed"] = kwargs["random_state"]
        kwargs.pop("random_state")
    super(GCXGBClassifier, self).__init__(name, xgb.XGBClassifier, kwargs)
def XGBOUT2(bp, all_samples, train_samp, Xcoords, Ycoords, Zcoords, k, threshold, nthread, bootstrap=True):
    '''Function that takes a CI test data-set and returns classification accuracy after Nearest-Neighbor Bootstrap'''
    num_samp = len(all_samples)
    if bootstrap:
        np.random.seed()
        random.seed()
        I = np.random.choice(num_samp, size=num_samp, replace=True)
        samples = all_samples[I, :]
    else:
        samples = all_samples
    # use the (possibly bootstrapped) samples rather than all_samples here
    Xtrain, Ytrain, Xtest, Ytest, CI_data = CI_sampler_conditional_kNN(samples[:, Xcoords], samples[:, Ycoords], samples[:, Zcoords], train_samp, k)
    model = xgb.XGBClassifier(nthread=nthread, learning_rate=0.02, n_estimators=bp['n_estimator'], max_depth=bp['max_depth'],
                              min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=bp['colsample_bytree'],
                              objective='binary:logistic', scale_pos_weight=1, seed=11)
    gbm = model.fit(Xtrain, Ytrain)
    pred = gbm.predict_proba(Xtest)
    pred_exact = gbm.predict(Xtest)
    acc1 = accuracy_score(Ytest, pred_exact)
    AUC1 = roc_auc_score(Ytest, pred[:, 1])
    del gbm
    gbm = model.fit(Xtrain[:, len(Xcoords):], Ytrain)
    pred = gbm.predict_proba(Xtest[:, len(Xcoords):])
    pred_exact = gbm.predict(Xtest[:, len(Xcoords):])
    acc2 = accuracy_score(Ytest, pred_exact)
    AUC2 = roc_auc_score(Ytest, pred[:, 1])
    del gbm
    if AUC1 > AUC2 + threshold:
        return [0.0, AUC1 - AUC2, AUC2 - 0.5, acc1 - acc2, acc2 - 0.5]
    else:
        return [1.0, AUC1 - AUC2, AUC2 - 0.5, acc1 - acc2, acc2 - 0.5]
def XGBOUT_Independence(bp, all_samples, train_samp, Xcoords, Ycoords, k, threshold, nthread, bootstrap=True):
    '''Function that takes a CI test data-set and returns classification accuracy after Nearest-Neighbor Bootstrap'''
    num_samp = len(all_samples)
    if bootstrap:
        np.random.seed()
        random.seed()
        I = np.random.choice(num_samp, size=num_samp, replace=True)
        samples = all_samples[I, :]
    else:
        samples = all_samples
    # use the (possibly bootstrapped) samples rather than all_samples here
    Xtrain, Ytrain, Xtest, Ytest, CI_data = CI_sampler_conditional_kNN(samples[:, Xcoords], samples[:, Ycoords], None, train_samp, k)
    s1, s2 = Xtrain.shape
    if s2 >= 4:
        model = xgb.XGBClassifier(nthread=nthread, learning_rate=0.02, n_estimators=bp['n_estimator'], max_depth=bp['max_depth'],
                                  min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=bp['colsample_bytree'],
                                  objective='binary:logistic', scale_pos_weight=1, seed=11)
    else:
        model = xgb.XGBClassifier()
    gbm = model.fit(Xtrain, Ytrain)
    pred = gbm.predict_proba(Xtest)
    pred_exact = gbm.predict(Xtest)
    acc1 = accuracy_score(Ytest, pred_exact)
    AUC1 = roc_auc_score(Ytest, pred[:, 1])
    del gbm
    if AUC1 > 0.5 + threshold:
        return [0.0, AUC1 - 0.5, acc1 - 0.5]
    else:
        return [1.0, AUC1 - 0.5, acc1 - 0.5]
def train_xgboost_classifier():
    return mp.ModelProperties(), xgboost.XGBClassifier()
# xgb_classification.py (project: jingjuSingingPhraseMatching, author: ronggong)
def buildEstimators(mode):
    if mode == 'train' or mode == 'cv':
        # best parameters obtained by GridSearchCV, best score: 1
        estimators = [('anova_filter', SelectKBest(f_classif, k='all')),
                      ('xgb', xgb.XGBClassifier(learning_rate=0.1, n_estimators=300, max_depth=3))]
        clf = Pipeline(estimators)
    elif mode == 'test':
        clf = pickle.load(open(join(classifier_path, "xgb_classifier.plk"), "rb"))
    return clf
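A hedged sketch of the surrounding workflow; classifier_path, join and pickle are module-level names of the original project, and the toy training arrays and the write step below are illustrative only:

import numpy as np

X_train = np.random.rand(50, 12)         # toy features, illustrative only
y_train = np.random.randint(0, 2, 50)

clf = buildEstimators('train')
clf.fit(X_train, y_train)
with open(join(classifier_path, "xgb_classifier.plk"), "wb") as f:
    pickle.dump(clf, f)                  # later restored via buildEstimators('test')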
def threshold_estimate_cv(x, y, k_fold):
    print "%d %d %d" % (y.shape[0], sum(y == 1), sum(y == 0))
    kf1 = StratifiedKFold(y, n_folds=k_fold, shuffle=True, random_state=0)
    threshold = np.zeros((k_fold), dtype="float32")
    cnt = 0
    for train_index, test_index in kf1:
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        weight = float(len(y_train[y_train == 0])) / float(len(y_train[y_train == 1]))
        # float dtype so the fractional class weight is not truncated to an integer
        w1 = np.ones(y_train.shape[0], dtype=float)
        w1[y_train == 1] = weight
        estimator = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000, nthread=50)
        estimator.fit(x_train, y_train, sample_weight=w1)
        y_scores = estimator.predict_proba(x_test)[:, 1]
        precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
        f1 = 2 * precision[2:] * recall[2:] / (precision[2:] + recall[2:])
        m_idx = np.argmax(f1)
        threshold[cnt] = thresholds[2 + m_idx]
        cnt += 1
        print("%d %f %f" % (precision.shape[0], f1[m_idx], thresholds[2 + m_idx]))
    return np.mean(threshold), threshold
# Cross validation using gradient tree boosting
def parametered_single(x_train, y_train, x_test, y_test, thresh_opt):
    print("samples: %d %d %d %d" % (x_train.shape[0], x_train.shape[1], x_test.shape[0], x_test.shape[1]))
    metrics = np.zeros((1, 5), dtype="float32")
    thresh = 0.5
    # estimate the threshold
    if thresh_opt == 1:
        thresh = threshold_estimate(x_train, y_train)
    clf = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=500, nthread=50)
    weight = float(sum(y_train < 1)) / float(sum(y_train == 1))
    # float dtype so the fractional class weight is not truncated to an integer
    w1 = np.ones(y_train.shape[0], dtype=float)
    w1[y_train == 1] = weight
    clf.fit(x_train, y_train, sample_weight=w1)
    prob = clf.predict_proba(x_test)
    yfit = (prob[:, 1] > thresh)
    precision, recall, f1, mcc = score_function(y_test, yfit)
    metrics = np.array((thresh, precision, recall, f1, mcc))
    print metrics
    importances = clf.feature_importances_
    indices1 = np.argsort(importances)[::-1]
    features1 = np.transpose(np.array((indices1, importances[indices1])))
    pred = np.transpose(np.array((y_test, yfit)))
    return metrics, pred, prob, features1
# Cross validation for PEP-Word
# model.py (project: 5th_place_solution_facebook_check_ins, author: aikinogard)
def xgb0(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        return df.drop(['time'], axis=1)
    logging.info("train xgb0 model")
    clf = xgb.XGBClassifier()
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
def xgb150opt(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        return df.drop(['time'], axis=1)
    logging.info("train xgb150opt model")
    clf = xgb.XGBClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, min_child_weight=3, subsample=0.667, colsample_bytree=1)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
def xgb150opt2(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        return df.drop(['time'], axis=1)
    logging.info("train xgb150opt2 model")
    clf = xgb.XGBClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, min_child_weight=1, subsample=0.85263, colsample_bytree=0.657894, reg_alpha=1.55556, reg_lambda=1.22222, gamma=0.3333333)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
# dsb_create_voxel_model_predictions.py (project: data-science-bowl-2017, author: tondonia)
def __init__(self, trainX, trainY):
    self.trainX = trainX
    self.trainY = trainY
    self.level0 = xgb.XGBClassifier(learning_rate=0.325,
                                    silent=True,
                                    objective="binary:logistic",
                                    nthread=-1,
                                    gamma=0.85,
                                    min_child_weight=5,
                                    max_delta_step=1,
                                    subsample=0.85,
                                    colsample_bytree=0.55,
                                    colsample_bylevel=1,
                                    reg_alpha=0.5,
                                    reg_lambda=1,
                                    scale_pos_weight=1,
                                    base_score=0.5,
                                    seed=0,
                                    missing=None,
                                    n_estimators=1920, max_depth=6)
    self.h_param_grid = {'max_depth': hp.quniform('max_depth', 1, 13, 1),
                         'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
                         'learning_rate': hp.quniform('learning_rate', 0.025, 0.5, 0.025),
                         'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
                         'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
                         'n_estimators': hp.quniform('n_estimators', 10, 200, 5),
                         }
    self.to_int_params = ['n_estimators', 'max_depth']
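A hedged sketch of how a grid like h_param_grid might be searched with hyperopt and the integer parameters coerced afterwards; the space below mirrors only two entries of the grid above, and score_fn is a placeholder objective, not the project's real one:

from hyperopt import Trials, fmin, hp, tpe

space = {'max_depth': hp.quniform('max_depth', 1, 13, 1),
         'n_estimators': hp.quniform('n_estimators', 10, 200, 5)}
to_int_params = ['n_estimators', 'max_depth']

def score_fn(params):
    # placeholder loss; a real objective would cross-validate the level0 model with these params
    return 1.0

best = fmin(fn=score_fn, space=space, algo=tpe.suggest, max_evals=10, trials=Trials())
for name in to_int_params:
    best[name] = int(best[name])   # hp.quniform returns floats, so cast the integer params back
print(best)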
# resnet_classifier.py (project: Brain_Tumor_Segmentation, author: KarthikRevanuru)
def train_xgboost():
    df = pd.read_csv('survival_data.csv', index_col=0)
    p = np.array([np.mean(np.load('training/%s_flair.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    q = np.array([np.mean(np.load('training/%s_t1.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    r = np.array([np.mean(np.load('training/%s_t1ce.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    s = np.array([np.mean(np.load('training/%s_t2.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])

    y = np.array([])
    t = 0
    z = np.array([])
    for ind in range(len(folder_names_train)):
        try:
            temp = df.get_value(str(folder_names_train[ind]), 'Class')
            y = np.append(y, temp)
            temp = df.get_value(str(folder_names_train[ind]), 'Age')
            z = np.append(z, np.array([temp]))
        except Exception as e:
            t += 1
            print (t, str(e), "Label Not found, deleting entry")
            y = np.append(y, 0)

    z = np.array([[v] for v in z])
    t = np.concatenate((p, q), axis=1)
    u = np.concatenate((r, s), axis=1)
    x = np.concatenate((t, u), axis=1)
    #print(x.shape)
    #print (x)
    #print (x.shape,z.shape)
    x = np.concatenate((x, z), axis=1)
    #print (x)
    #clf=linear_model.LogisticRegression(C=1e5)
    #clf = RandomForestClassifier()
    clf = xgb.XGBClassifier()
    clf.fit(x, y)
    return clf
def _get_model(self):
    if self._model == 'xgb':
        return XGBClassifier()
    if self._model == 'svc_rbf':
        return SVC()
    if self._model == 'svc_lin':
        return LinearSVC()
    return RFC()
def models():
    params = {'n_jobs': nthread, 'random_state': seed, 'class_weight': None}
    # extra = ensemble.ExtraTreesClassifier(n_estimators=1000, max_features='auto', criterion='entropy', min_samples_split=2, max_depth=None, min_samples_leaf=1, **params)
    # extra1 = ensemble.ExtraTreesClassifier(n_estimators=1000, max_features=60, criterion='gini', min_samples_split=4, max_depth=40, min_samples_leaf=2, **params)
    # rf = ensemble.RandomForestClassifier(n_estimators=1000, max_features='auto', criterion='gini', min_samples_split=2, max_depth=None, min_samples_leaf=1, **params)
    # rf1 = ensemble.RandomForestClassifier(n_estimators=1000, max_features=60, criterion='entropy', min_samples_split=4, max_depth=40, min_samples_leaf=2, **params)
    # xgb_binlog = XGBClassifier(objective="binary:logistic", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_reglog = XGBClassifier(objective="reg:logistic", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_poi = XGBClassifier(objective="count:poisson", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_reglin = XGBClassifier(objective="reg:linear", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    rf_params = {'n_estimators': 850, 'max_features': 60, 'criterion': 'entropy', 'min_samples_split': 4, 'max_depth': 40, 'min_samples_leaf': 2, 'n_jobs': -1}
    clfs = [
        # (D1, XGBRegressor(objective="reg:linear", max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        (D1, XGBClassifier(objective="binary:logistic", max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBClassifier(objective="binary:logistic", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBClassifier(objective="binary:logistic", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
    ]
    for clf in clfs:
        yield clf
def test_classifier(loop):  # noqa
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop):
            a = dxgb.XGBClassifier()
            X2 = da.from_array(X, 5)
            y2 = da.from_array(y, 5)
            a.fit(X2, y2)
            p1 = a.predict(X2)

    b = xgb.XGBClassifier()
    b.fit(X, y)
    np.testing.assert_array_almost_equal(a.feature_importances_,
                                         b.feature_importances_)
    assert_eq(p1, b.predict(X))
def fit(self, X, y=None):
    """Fit a gradient boosting classifier

    Parameters
    ----------
    X : array-like [n_samples, n_features]
        Feature Matrix. May be a dask.array or dask.dataframe
    y : array-like
        Labels

    Returns
    -------
    self : XGBClassifier

    Notes
    -----
    This differs from the XGBoost version in three ways

    1. The ``sample_weight``, ``eval_set``, ``eval_metric``,
       ``early_stopping_rounds`` and ``verbose`` fit kwargs are not
       supported.
    2. The labels are not automatically label-encoded
    3. The ``classes_`` and ``n_classes_`` attributes are not learned
    """
    client = default_client()
    xgb_options = self.get_xgb_params()
    self._Booster = train(client, xgb_options, X, y,
                          num_boost_round=self.n_estimators)
    return self