def process_fold(X_train, X_val, y_train, y_val, X_test):
    # XGBoost
    clf = OneVsRestClassifier(xgb.XGBClassifier(learning_rate=0.005, n_estimators=500))
    clf.fit(X_train, y_train)
    y_p_x = clf.predict_proba(X_val)
    y_p_x_tst = clf.predict_proba(X_test)
    # Keras
    y_p_k, y_p_k_tst = KerasClassifier(X_train, y_train, X_val, y_val, X_test)
    return (y_p_x + y_p_k) / 2.0, (y_p_x_tst + y_p_k_tst) / 2.0
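A minimal sketch of how process_fold might be driven by a cross-validation loop; X, y, X_test and the KerasClassifier helper are assumed to come from the surrounding script and are not defined in the snippet above:

# Hedged usage sketch: X, y, X_test and KerasClassifier are assumed to exist
# in the surrounding script; the averaging mirrors the return value above.
from sklearn.model_selection import KFold
import numpy as np

test_preds = []
for train_idx, val_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    val_pred, test_pred = process_fold(X[train_idx], X[val_idx],
                                       y[train_idx], y[val_idx], X_test)
    test_preds.append(test_pred)
final_test_pred = np.mean(test_preds, axis=0)  # average the per-fold test predictions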
Python XGBClassifier() usage examples
Source file: Stage4_KerasXGBoostMEUFsubmission.py (project: Yelp, author: alexander-rakhlin)
def train_xgb(X_train, y_train, X_test, y_test):
    n_trees = 1000
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = xgb.XGBClassifier(n_estimators=n_trees, max_depth=5, objective='multi:softprob',
                            seed=0, silent=True, nthread=-1, learning_rate=0.1)
    eval_set = [(X_test, y_test)]
    clf.fit(X_train, y_train, eval_set=eval_set, eval_metric="merror", early_stopping_rounds=10)
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_xgb_{}={:.6f}%'.format(n_trees, prec * 100.0))
    return clf, y_pred
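For context, a self-contained way to exercise train_xgb on synthetic data might look like the following; it assumes the module-level LOGGER used above and the older xgboost parameters (seed, silent) the snippet targets, and the shapes are purely illustrative:

import logging
import numpy as np

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)      # assumed module-level logger used by train_xgb

rng = np.random.RandomState(0)
X = rng.randn(300, 8, 8)                  # train_xgb flattens the trailing dimensions itself
y = rng.randint(0, 3, size=300)           # three classes to match 'multi:softprob'
clf, y_pred = train_xgb(X[:240], y[:240], X[240:], y[240:])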
def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              subsample,
              colsample_bytree,
              gamma,
              min_child_weight,
              silent=True,
              nthread=-1,
              seed=1234):
    clf = XGBClassifier(max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        silent=silent,
                        nthread=nthread,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        seed=seed,
                        objective="binary:logistic")
    clf.fit(x0, y0, eval_metric="logloss", eval_set=[(x1, y1)], early_stopping_rounds=25)
    ll = -log_loss(y1, clf.predict_proba(x1))
    return ll
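This objective is shaped for a Bayesian optimizer: it maximizes the negative validation log-loss over the global x0/y0/x1/y1 split assumed by the snippet. A hedged sketch of driving it with the bayes_opt package (1.x API), using purely illustrative bounds:

from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=xgboostcv,
    pbounds={
        'max_depth': (3, 10),
        'learning_rate': (0.01, 0.3),
        'n_estimators': (100, 1000),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0),
        'gamma': (0, 5),
        'min_child_weight': (1, 10),
    },
    random_state=1234,
)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)  # best (negative) log-loss and the parameters that produced it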
def xgboo():
    # Gradient Boosted Trees grid search
    model = XGBClassifier(seed=random_state, nthread=8)
    parameters = {'max_depth': [3, 6, 9], 'n_estimators': [50, 100, 200, 400]}
    grid = GridSearchCV(model, parameters, n_jobs=4, verbose=2)
    return grid
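Using the helper is then just fitting the returned grid; X, y and the module-level random_state are assumed to be defined elsewhere:

grid = xgboo()
grid.fit(X, y)                                # runs the 3 x 4 parameter grid with 4 parallel jobs
print(grid.best_params_, grid.best_score_)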
def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              # gamma,
              # min_child_weight,
              # max_delta_step,
              subsample,
              colsample_bytree,
              ratio=131.708,
              silent=True,
              nthread=-1,
              seed=42):
    return cross_val_score(XGBClassifier(max_depth=int(max_depth),
                                         learning_rate=learning_rate,
                                         n_estimators=int(n_estimators),
                                         silent=silent,
                                         nthread=nthread,
                                         # gamma=gamma,
                                         # min_child_weight=min_child_weight,
                                         # max_delta_step=max_delta_step,
                                         subsample=subsample,
                                         colsample_bytree=colsample_bytree,
                                         scale_pos_weight=ratio,
                                         seed=seed),
                           X,
                           y,
                           scoring='f1',
                           cv=5).mean()
def select_mdl(self, mdl_type, param):
    """
    Define the classifier and its parameters.
    :param mdl_type: which model to initialize ('xgb', 'lr' or 'rf')
    :param param: a dict of model parameters
    """
    if mdl_type == 'xgb':
        self.mdl = xgb.XGBClassifier(**param)
    elif mdl_type == 'lr':
        self.mdl = LogisticRegression(**param)
    elif mdl_type == 'rf':
        self.mdl = RandomForestClassifier(**param)
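A hedged sketch of how select_mdl might be called on an instance of its enclosing class; the class name ModelWrapper and the data names below are assumptions, not part of the original project:

wrapper = ModelWrapper()                                  # hypothetical enclosing class
wrapper.select_mdl('xgb', {'max_depth': 6, 'n_estimators': 300, 'learning_rate': 0.05})
wrapper.mdl.fit(X_train, y_train)
val_probs = wrapper.mdl.predict_proba(X_valid)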
def pipeline(self):
    # This is a property for serialization support with xgboost,
    # because we change self.clf after __init__.
    pipeline = [self.vec]
    if isinstance(self.clf, XGBClassifier):
        # Work around xgboost issue:
        # https://github.com/dmlc/xgboost/issues/1238#issuecomment-243872543
        pipeline.append(CSCTransformer())
    pipeline.append(self.clf)
    return make_pipeline(*pipeline)
def explain_predictions(self, docs, top=30):
    if not isinstance(self.clf, XGBClassifier):
        raise NotImplementedError
    booster = self.clf.booster()
    xgb_feature_names = {f: i for i, f in enumerate(booster.feature_names)}
    feature_names = get_feature_names(self.clf, self.vec,
                                      num_features=len(xgb_feature_names))
    feature_names.bias_name = '<BIAS>'
    X = self.vec.transform(docs)
    X = X.tocsc()
    dmatrix = DMatrix(X, missing=self.clf.missing)
    leaf_ids = booster.predict(dmatrix, pred_leaf=True)
    tree_dumps = booster.get_dump(with_stats=True)
    docs_weights = []
    for i, _leaf_ids in enumerate(leaf_ids):
        all_weights = _target_feature_weights(
            _leaf_ids, tree_dumps,
            feature_names=feature_names,
            xgb_feature_names=xgb_feature_names)[1]
        weights = np.zeros_like(all_weights)
        idx = X[i].nonzero()[1]
        bias_idx = feature_names.bias_idx
        weights[idx] = all_weights[idx]
        weights[bias_idx] = all_weights[bias_idx]
        docs_weights.append(weights)
    weights = np.mean(docs_weights, axis=0)
    feature_weights = get_top_features(
        feature_names=np.array(
            [_prettify_feature(f) for f in feature_names]),
        coef=weights,
        top=top)
    return Explanation(
        estimator=type(self.clf).__name__,
        targets=[TargetExplanation('y', feature_weights=feature_weights)],
    )
def get_attributes(obj):
    if isinstance(obj, TfidfVectorizer):
        return get_tfidf_attributes(obj)
    elif isinstance(obj, XGBClassifier):
        return pickle.dumps(obj)
    elif isinstance(obj, BaseEstimator):
        return {attr: getattr(obj, attr) for attr in dir(obj)
                if not attr.startswith('_') and attr.endswith('_')
                and attr not in skip_attributes}
    elif obj is not None:
        raise TypeError(type(obj))
def set_attributes(parent, field, attributes):
    obj = getattr(parent, field)
    if isinstance(obj, TfidfVectorizer):
        set_ifidf_attributes(obj, attributes)
    elif isinstance(obj, XGBClassifier):
        setattr(parent, field, pickle.loads(attributes))
    elif isinstance(obj, BaseEstimator):
        for k, v in attributes.items():
            try:
                setattr(obj, k, v)
            except AttributeError:
                raise AttributeError(
                    "can't set attribute {} on {}".format(k, obj))
    elif obj is not None:
        raise TypeError(type(obj))
Source file: xgradient_boosting.py (project: AutoML-Challenge, author: postech-mlg-exbrain)
def fit(self, X, y):
    import xgboost as xgb
    self.learning_rate = float(self.learning_rate)
    self.n_estimators = int(self.n_estimators)
    self.subsample = float(self.subsample)
    self.max_depth = int(self.max_depth)
    # (TODO) GB used at most half of the features; here we use all of them
    self.colsample_bylevel = float(self.colsample_bylevel)
    self.colsample_bytree = float(self.colsample_bytree)
    self.gamma = float(self.gamma)
    self.min_child_weight = int(self.min_child_weight)
    self.max_delta_step = int(self.max_delta_step)
    self.reg_alpha = float(self.reg_alpha)
    self.reg_lambda = float(self.reg_lambda)
    self.nthread = int(self.nthread)
    self.base_score = float(self.base_score)
    self.scale_pos_weight = float(self.scale_pos_weight)
    # We don't support multilabel, so we only need one objective function
    if len(numpy.unique(y)) == 2:
        # We probably have binary classification
        self.objective = 'binary:logistic'
    else:
        self.objective = 'multi:softprob'
    self.estimator = xgb.XGBClassifier(
        max_depth=self.max_depth,
        learning_rate=self.learning_rate,
        n_estimators=self.n_estimators,
        silent=self.silent,
        objective=self.objective,
        nthread=self.nthread,
        gamma=self.gamma,
        scale_pos_weight=self.scale_pos_weight,
        min_child_weight=self.min_child_weight,
        max_delta_step=self.max_delta_step,
        subsample=self.subsample,
        colsample_bytree=self.colsample_bytree,
        colsample_bylevel=self.colsample_bylevel,
        reg_alpha=self.reg_alpha,
        reg_lambda=self.reg_lambda,
        base_score=self.base_score,
        seed=self.seed
    )
    self.estimator.fit(X, y, eval_metric='auc')
    return self
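The objective switch in fit() is the part worth noting: binary targets get 'binary:logistic' (a single positive-class probability), anything else gets 'multi:softprob' (one probability column per class). A standalone illustration, independent of the wrapper class, assuming only numpy and xgboost:

import numpy as np
import xgboost as xgb

y = np.array([0, 1, 2, 1, 0, 2])
objective = 'binary:logistic' if len(np.unique(y)) == 2 else 'multi:softprob'
clf = xgb.XGBClassifier(n_estimators=20, max_depth=3, objective=objective)
clf.fit(np.random.RandomState(0).randn(6, 4), y)
print(clf.predict_proba(np.zeros((1, 4))).shape)   # (1, 3) in this multiclass case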
def parametered_cv(x, y, k_fold, k_fold1):
    print("samples: %d %d %d %d" % (x.shape[0], x.shape[1], k_fold, k_fold1))
    kf = StratifiedKFold(y, n_folds=k_fold, shuffle=True, random_state=0)
    index = []
    label = []
    yfit = []
    metrics = np.zeros((k_fold, 5), dtype="float32")
    thresholds = []
    predicted = np.array([[0, 0]])
    features1 = np.array([[0, 0]])
    thresh = 0.5
    cnt = 0
    print("Positive: %d Negative: %d" % (sum(y == 1), sum(y == 0)))
    for train_index, test_index in kf:
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(y_train.shape)
        print("%d %d %d %d" % (x_train.shape[0], x_train.shape[1], x_test.shape[0], x_test.shape[1]))
        if k_fold1 > 1:
            thresh, thresh_vec = threshold_estimate_cv(x_train, y_train, k_fold1)
        elif k_fold1 == 1:
            thresh = threshold_estimate(x_train, y_train)
        else:
            thresh = 0.5
        print("%d %f" % (x_train.shape[0], thresh))
        weight = float(len(y_train[y_train == 0])) / float(len(y_train[y_train == 1]))
        w1 = np.array([1] * y_train.shape[0])
        w1[y_train == 1] = weight
        weight1 = float(len(y_test[y_test == 0])) / float(len(y_test[y_test == 1]))
        clf = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000, nthread=50)
        clf.fit(x_train, y_train, sample_weight=w1)
        prob = clf.predict_proba(x_test)
        yfit1 = (prob[:, 1] > thresh)
        index = np.concatenate((index, test_index), axis=0)
        label = np.concatenate((label, y_test), axis=0)
        yfit = np.concatenate((yfit, yfit1), axis=0)
        precision, recall, f1, mcc = score_function(y_test, yfit1)
        metrics[cnt, :] = np.array((thresh, precision, recall, f1, mcc))
        print(metrics[cnt, :])
        cnt += 1
        predicted = np.concatenate((predicted, prob), axis=0)
        importances = clf.feature_importances_
        indices1 = np.argsort(importances)[::-1]
        feature_1 = np.transpose(np.array((indices1, importances[indices1])))
        features1 = np.concatenate((features1, feature_1), axis=0)
    pred = np.transpose(np.array((index, label, yfit)))
    aver_metrics = np.mean(metrics, axis=0)
    aver_metrics = np.reshape(aver_metrics, (1, metrics.shape[1]))
    metrics_1 = np.concatenate((metrics, aver_metrics), axis=0)
    print(aver_metrics)
    return metrics_1, pred, predicted[1:, ], features1[1:, ]
# Single run using gradient tree boosting
def online(X_org, y_org, test_x, test_uid):
    n_folds = 5
    verbose = True
    shuffle = False
    X = X_org
    y = y_org
    X_submission = test_x
    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]
    skf = list(StratifiedKFold(y, n_folds))
    clfs = [
        RandomForestClassifier().set_params(**INITIAL_PARAMS.get("RFC:one", {})),
        ExtraTreesClassifier().set_params(**INITIAL_PARAMS.get("ETC:one", {})),
        GradientBoostingClassifier().set_params(**INITIAL_PARAMS.get("GBC:one", {})),
        LogisticRegression().set_params(**INITIAL_PARAMS.get("LR:one", {})),
        xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:two", {})),
        xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:one", {})),
    ]
    print("Creating train and test sets for blending.")
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))
    for j, clf in enumerate(clfs):
        print(j, clf)
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print("Fold", i)
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
    print("Blending.")
    # clf = LogisticRegression(C=2, penalty='l2', class_weight='balanced', n_jobs=-1)
    clf = linear_model.RidgeCV(
        alphas=np.linspace(0, 200), cv=LM_CV_NUM)
    # clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=100)
    clf.fit(dataset_blend_train, y)
    # y_submission = clf.predict_proba(dataset_blend_test)[:, 1]
    print(clf.coef_, clf.intercept_)
    y_submission = clf.predict(dataset_blend_test)  # for RidgeCV
    print("Linear stretch of predictions to [0,1]")
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
    print("blend result")
    save_submission(os.path.join(consts.SUBMISSION_PATH,
                                 MODEL_NAME + '_' + strftime("%m_%d_%H_%M_%S", localtime()) + '.csv'),
                    test_uid, y_submission)
def online2(X_org, y_org, test_x, test_uid):
    n_folds = 5
    verbose = True
    shuffle = False
    X = X_org
    y = y_org
    X_submission = test_x
    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]
    skf = list(StratifiedKFold(y, n_folds))
    clfs = [
        RandomForestClassifier().set_params(**INITIAL_PARAMS.get("RFC:one", {})),
        ExtraTreesClassifier().set_params(**INITIAL_PARAMS.get("ETC:one", {})),
        GradientBoostingClassifier().set_params(**INITIAL_PARAMS.get("GBC:one", {})),
        LogisticRegression().set_params(**INITIAL_PARAMS.get("LR:one", {})),
        # xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:two", {})),
        # xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:one", {})),
    ]
    print("Creating train and test sets for blending.")
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))
    for j, clf in enumerate(clfs):
        print(j, clf)
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print("Fold", i)
            X_train = X[train]
            y_train = y[train]
            clf.fit(X_train, y_train)
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
        save_submission(os.path.join(consts.SUBMISSION_PATH,
                                     clf.__class__.__name__ + '_' + strftime("%m_%d_%H_%M_%S", localtime()) + '.csv'),
                        test_uid, dataset_blend_test[:, j])
def models():
    extra_params_kaggle_cla = {'n_estimators': 1200, 'max_features': 30, 'criterion': 'entropy',
                               'min_samples_leaf': 2, 'min_samples_split': 2, 'max_depth': 30,
                               'n_jobs': nthread, 'random_state': seed}
    extra_params_kaggle_reg = {'n_estimators': 1200, 'max_features': 30, 'criterion': 'mse',
                               'min_samples_leaf': 2, 'min_samples_split': 2, 'max_depth': 30,
                               'n_jobs': nthread, 'random_state': seed}
    xgb_reg = {'objective': 'reg:linear', 'max_depth': 11, 'learning_rate': 0.01, 'subsample': .9,
               'n_estimators': 10000, 'colsample_bytree': 0.45, 'nthread': nthread, 'seed': seed}
    xgb_cla = {'objective': 'binary:logistic', 'max_depth': 11, 'learning_rate': 0.01, 'subsample': .9,
               'n_estimators': 10000, 'colsample_bytree': 0.45, 'nthread': nthread, 'seed': seed}

    # NN params
    nb_epoch = 3
    batch_size = 128
    esr = 402

    param1 = {
        'hidden_units': (256, 256),
        'activation': (advanced_activations.PReLU(), advanced_activations.PReLU(), core.activations.sigmoid),
        'dropout': (0., 0.), 'optimizer': RMSprop(), 'nb_epoch': nb_epoch,
    }
    param2 = {
        'hidden_units': (1024, 1024),
        'activation': (advanced_activations.PReLU(), advanced_activations.PReLU(), core.activations.sigmoid),
        'dropout': (0., 0.), 'optimizer': RMSprop(), 'nb_epoch': nb_epoch,
    }

    clfs = [
        (D2, XGBClassifier(**xgb_cla)),
        (D11, XGBClassifier(**xgb_cla)),
        (D2, XGBRegressor(**xgb_reg)),
        (D11, XGBRegressor(**xgb_reg)),
        (D2, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
        (D11, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
        (D2, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),
        (D11, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),
        # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
        # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
        # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
        #
        # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param2)),
        # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param2)),
        # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param2)),
    ]
    for clf in clfs:
        yield clf
def tune_xgb_params_segment_by_grid(estimator_cls: Type[Union[xgb.XGBClassifier, xgb.XGBRegressor]],
                                    label: np.ndarray,
                                    metric_sklearn: str,
                                    n_jobs: int,
                                    param_grid: dict,
                                    params: dict,
                                    strat_folds: StratifiedKFold,
                                    train: np.ndarray,
                                    verbosity_level: int = 10) -> Tuple[dict, float]:
    """
    Grid search over a segment of XGBoost parameters.

    :param estimator_cls:
        The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's GridSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html
        for the options this can take - e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs:
        The number of jobs to run simultaneously.
    :param param_grid:
        A dictionary of the grid of parameters to be searched over - e.g. {'colsample_bytree': [0.5, 0.6, 0.7, 0.8]}
        to search those four values.
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A StratifiedKFold object to cross validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :param verbosity_level:
        An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :return:
        A dictionary of the tuned parameters (restricted to the keys of param_grid) and the best score found.
    """
    params_copy = clean_params_for_sk(params)
    grid = GridSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_jobs=n_jobs,
        param_grid=param_grid,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    grid.fit(train, label)
    best_score = grid.best_score_
    # Massage the score to be in line with what xgboost reports
    if metric_sklearn == 'neg_mean_squared_error':
        best_score = abs(best_score) ** 0.5
    elif metric_sklearn == 'neg_log_loss':
        best_score = abs(best_score)
    return {k: grid.best_params_[k] for k in param_grid.keys()}, best_score
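A hedged usage sketch for the grid-segment tuner; clean_params_for_sk and the base params dict come from the surrounding module, and X/y are illustrative stand-ins for the training data and labels:

from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
best_segment, best_score = tune_xgb_params_segment_by_grid(
    estimator_cls=xgb.XGBClassifier,
    label=y,
    metric_sklearn='neg_log_loss',
    n_jobs=4,
    param_grid={'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]},
    params=params,                       # base XGB params defined elsewhere in the module
    strat_folds=folds,
    train=X,
)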
def tune_xgb_params_randomized(estimator_cls,
                               label: np.ndarray,
                               metric_sklearn: str,
                               n_jobs: int,
                               params: dict,
                               strat_folds: StratifiedKFold,
                               train: np.ndarray,
                               n_iter: int = 20,
                               verbosity_level: int = 10,
                               **kwargs):
    """
    Randomized search over XGBoost parameter distributions.

    :param estimator_cls:
        The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's RandomizedSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html
        for the options this can take - e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs:
        The number of jobs to run simultaneously - defaults to the amount of CPUs available.
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A StratifiedKFold object to cross validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :param n_iter:
        An optional parameter to control the number of parameter settings that are sampled.
    :param verbosity_level:
        An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :param kwargs:
        Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly between 0.5 and 0.7 for
        colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
    :return:
        A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)
    param_distributions = {
        'colsample_bytree': uniform(kwargs.get('colsample_bytree_loc', 0.2), kwargs.get('colsample_bytree_scale', 0.8)),
        'gamma': uniform(kwargs.get('gamma_loc', 0), kwargs.get('gamma_scale', 0.9)),
        'max_depth': sp_randint(kwargs.get('max_depth_low', 2), kwargs.get('max_depth_high', 11)),
        'min_child_weight': sp_randint(kwargs.get('min_child_weight_low', 1), kwargs.get('min_child_weight_high', 11)),
        'reg_alpha': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        'reg_lambda': halfnorm(kwargs.get('reg_lambda_loc', 0), kwargs.get('reg_lambda_scale', 5)),
        'subsample': uniform(kwargs.get('subsample_loc', 0.2), kwargs.get('subsample_scale', 0.8))
    }
    rand_search = RandomizedSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_iter=n_iter,
        n_jobs=n_jobs,
        param_distributions=param_distributions,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    rand_search.fit(train, label)
    return rand_search.best_params_, [(rand_search.best_params_, rand_search.best_score_)]
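And a matching sketch for the randomized tuner, narrowing two of the sampled distributions through the keyword-argument convention described in its docstring; folds, X, y and params are the same assumed names as in the grid example above:

best_params, search_trace = tune_xgb_params_randomized(
    estimator_cls=xgb.XGBClassifier,
    label=y,
    metric_sklearn='neg_log_loss',
    n_jobs=4,
    params=params,
    strat_folds=folds,
    train=X,
    n_iter=40,
    colsample_bytree_loc=0.5, colsample_bytree_scale=0.2,   # sample colsample_bytree in [0.5, 0.7]
    subsample_loc=0.7, subsample_scale=0.3,                 # sample subsample in [0.7, 1.0]
)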