def run_grid_search(self):
    """
    Called by a derived class to start the grid-search process.
    """
    features, labels, cv_folds = self.getFeaturesLabel()
    dtrain_cv = xgb.DMatrix(features, label=labels, feature_names=features.columns)
    parameter_iterable = self.__get_param_iterable(self.__get_param_grid())
    kwargs = self.get_learning_params()
    for param in parameter_iterable:
        logging.info("used parameters: {}".format(param))
        bst = xgb.cv(param, dtrain_cv, folds=cv_folds, **kwargs)
        self.__add_to_resultset(param, bst)
    self.__disp_result()
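The double-underscore helpers are defined elsewhere in the class; a minimal sketch of plausible implementations, assuming sklearn's ParameterGrid expands the grid (the helper bodies and grid values here are illustrative, not the original source):

from sklearn.model_selection import ParameterGrid

def __get_param_grid(self):
    # Hypothetical grid; the real derived class supplies its own values.
    return {'max_depth': [4, 6, 8], 'eta': [0.05, 0.1], 'subsample': [0.7, 0.9]}

def __get_param_iterable(self, param_grid):
    # ParameterGrid yields one dict per parameter combination, which is
    # the shape xgb.cv expects for its params argument.
    return ParameterGrid(param_grid)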
def tune_xgb_cv(params_untuned, params_sklearn, scoring='roc_auc', n_jobs=4, cv=5, verbose=10):
    for param_untuned in params_untuned:
        print('========== ', param_untuned, ' ==============')
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        # if list(param_untuned.keys())[0] == 'n_estimators':
        #     cv = 1
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=verbose)
        grid_search.fit(x, y)  # x, y come from the enclosing module scope
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print(df)
        print('the best_params : ', grid_search.best_params_)
        print('the best_score  : ', grid_search.best_score_)
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
    return estimator, params_sklearn
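A sketch of how tune_xgb_cv might be driven; x and y are assumed to be module-level training data, and the starting values and grids below are illustrative:

params_sklearn = {'learning_rate': 0.1, 'n_estimators': 200, 'max_depth': 5,
                  'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8,
                  'objective': 'binary:logistic', 'nthread': 1, 'seed': 27}
params_untuned = [
    {'max_depth': list(range(3, 10, 2))},
    {'min_child_weight': list(range(1, 6, 2))},
    {'subsample': [i / 10.0 for i in range(6, 10)]},
]
# Each grid is searched in turn and the best value is folded back
# into params_sklearn before the next grid runs.
estimator, params_sklearn = tune_xgb_cv(params_untuned, params_sklearn)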
def predicted_vs_actual_sale_price(self, x_train, y_train, title_name):
    # Split off an extra validation set from the training data
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
0.3, 0.6, 1],
max_iter=50000, cv=10)
# lasso = RidgeCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
# 0.3, 0.6, 1], cv=10)
lasso.fit(x_train_split, y_train_split)
y_predicted = lasso.predict(X=x_test_split)
plt.figure(figsize=(10, 5))
plt.scatter(y_test_split, y_predicted, s=20)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
plt.tight_layout()
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
    # Split off an extra validation set from the training data
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
dtest_split = xgb.DMatrix(x_test_split)
res = xgb.cv(xgb_params, dtrain_split, num_boost_round=1000, nfold=4, seed=seed, stratified=False,
early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
best_nrounds = res.shape[0] - 1
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
y_predicted = gbdt.predict(dtest_split)
plt.figure(figsize=(10, 5))
plt.scatter(y_test_split, y_predicted, s=20)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
plt.tight_layout()
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['label'].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['label'], eval_metric='auc')
    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['label'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['label'], dtrain_predprob))
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
def clean_params_for_sk(params: dict) -> dict:
"""
Given a dictionary of XGB parameters, return a copy without parameters that will cause issues with scikit-learn's grid or
randomized search estimators.
:param params:
A dictionary of XGB parameters.
:return:
A copy of the same dictionary without the aforementioned problematic parameters.
"""
# In the xgb.cv call, nthread should be equal to the CPU count, but this causes a hang when
# called through GridSearchCV - parallelism should be achieved through its n_jobs parameter.
# See https://github.com/scikit-learn/scikit-learn/issues/6627 for more details.
params_copy = params.copy()
params_copy['nthread'] = 1
# In multiclass problems, this parameter is required for XGBoost, but is not a parameter of interest to be tuned.
    if 'num_class' in params_copy:
        del params_copy['num_class']
return params_copy
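A usage sketch; the parameter values below are illustrative:

from sklearn.model_selection import GridSearchCV
import xgboost as xgb

params = {'nthread': 8, 'num_class': 3, 'max_depth': 6, 'objective': 'multi:softprob'}
sk_params = clean_params_for_sk(params)   # nthread forced to 1, num_class dropped
search = GridSearchCV(xgb.XGBClassifier(**sk_params),
                      param_grid={'max_depth': [4, 6, 8]},
                      n_jobs=4)            # parallelism lives here, not in nthread
# search.fit(X, y)                        # X, y: placeholder training data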
def predict():
    saved = state.load('model')
    #saved = None
    if debug_mode:
        saved = None
    if saved is None:
        train, y, test, _ = data.get()
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0
        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
def predict():
    saved = state.load('model')
    #saved = None
    if saved is None:
        train, y, test, _ = data.get()
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0
        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
def predict():
    saved = state.load('model')
    #saved = None
    if debug_mode:
        saved = None
    if saved is None:
        train, y, test, _ = data.get()
        ftrain, ftest, _ = fea_1.get()
        ftrain2, ftest2, _ = fea_2.get()
        train = pd.concat([train, ftrain, ftrain2], axis=1)
        test = pd.concat([test, ftest, ftest2], axis=1)
        print(train.shape, test.shape)
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0
        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
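pconvert and prestore are assumed helpers that never appear in these snippets. Given that per-fold predictions are summed, averaged, and then passed through prestore before scoring with log loss, one plausible reading is that the blend is done in logit space. A sketch under that assumption (not the original implementation):

import numpy as np

def pconvert(p):
    # map probabilities to logits before averaging across folds/seeds
    p = np.clip(p, 1e-7, 1 - 1e-7)
    return np.log(p / (1 - p))

def prestore(v):
    # map averaged logits back to probabilities for scoring
    return 1 / (1 + np.exp(-v))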
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True, random_state=s + base_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'], eval_metric='auc')
    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob))
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
# Source: two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def rmse_cv(model, x_train, y_train):
rmse = np.sqrt(-cross_val_score(model, x_train, y_train, scoring='neg_mean_squared_error', cv=5))
return rmse
def run_cross_validation(self):
    features, labels, cv_folds = self.getFeaturesLabel()
    dtrain_cv = xgb.DMatrix(features, label=labels, feature_names=features.columns)
    self.set_xgb_parameters()
    # specify validation set to watch performance
    model = xgb.cv(self.xgb_params, dtrain_cv, folds=cv_folds, **self.xgb_learning_params)
    best_score = model[self.best_score_colname_in_cv].max()
    return best_score
def tune_n_estimators(alg, xgtrain, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=True, show_stdv=True)
        # alg.set_params(n_estimators=cvresult.shape[0])
        return cvresult.shape[0]
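A minimal call, assuming a sklearn-style classifier and an already-built DMatrix (all names and values below are placeholders):

from xgboost import XGBClassifier

clf = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5,
                    objective='binary:logistic')
# xgtrain = xgb.DMatrix(X, label=y)       # X, y: placeholder training data
best_n = tune_n_estimators(clf, xgtrain)  # CV-chosen number of boosting rounds
clf.set_params(n_estimators=best_n)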
def modelfit(alg, predictors, target, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(predictors.values, label=target.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics=['auc'], early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(predictors, target, eval_metric='auc')
    # Predict training set:
    dtrain_predictions = alg.predict(predictors)
    dtrain_predprob = alg.predict_proba(predictors)[:, 1]
    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(target.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(target, dtrain_predprob))
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
# examples of usage
# 1
def score(params):
logging.info("Training with params: ")
logging.info(params)
    # Delete 'n_estimators' because it's only a constructor param
    # when you're using XGB's sklearn API.
    # Instead, pass 'n_estimators' (the number of boosting rounds)
    # to xgb.cv() explicitly.
num_boost_round = int(params['n_estimators'])
del params['n_estimators']
dtrain = xgb.DMatrix(X_train, label=y_train)
    # As of version 0.6, XGBoost returns a dataframe of the form:
    #   boosting iter | mean_test_err | mean_test_std | mean_train_err | mean_train_std
    #   1             | ...           | ...           | ...            | ...
    #   2             | ...           | ...           | ...            | ...
    #   ...
    #   n_estimators  | ...           | ...           | ...            | ...
score_history = xgb.cv(params, dtrain, num_boost_round,
nfold=5, stratified=True,
early_stopping_rounds=250,
verbose_eval=500)
# Only use scores from the final boosting round since that's the one
# that performed the best.
mean_final_round = score_history.tail(1).iloc[0, 0]
std_final_round = score_history.tail(1).iloc[0, 1]
logging.info("\tMean Score: {0}\n".format(mean_final_round))
logging.info("\tStd Dev: {0}\n\n".format(std_final_round))
# score() needs to return the loss (1 - score)
# since optimize() should be finding the minimum, and AUC
# naturally finds the maximum.
loss = 1 - mean_final_round
return {'loss': loss, 'status': STATUS_OK}
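score() is shaped as a hyperopt objective (it returns a dict with 'loss' and 'status': STATUS_OK); a sketch of the optimize() driver it implies, with an illustrative search space:

import numpy as np
from hyperopt import Trials, fmin, hp, tpe

def optimize(max_evals=100):
    # Search space is illustrative; score() consumes 'n_estimators'
    # itself and forwards the rest straight to xgb.cv.
    space = {
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.loguniform('eta', np.log(0.01), np.log(0.3)),
        'max_depth': hp.choice('max_depth', [3, 4, 5, 6, 7, 8, 9, 10]),
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
    }
    trials = Trials()
    best = fmin(score, space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    return best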
def regression_with_xgboost(x_train, y_train, X_test, Y_test, features=None, use_cv=True, use_sklearn=False, xgb_params=None):
    train_data = xgb.DMatrix(x_train, label=y_train, missing=float('nan'))
    test_data = xgb.DMatrix(X_test, Y_test, missing=float('nan'))
    evallist = [(test_data, 'eval'), (train_data, 'train')]
    #if xgb_params is None:
    #    xgb_params = get_default_xgboost_params()
    if not use_cv:
        num_rounds = 10
    else:
        cvresult = xgb.cv(xgb_params, train_data, num_boost_round=100, nfold=5,
                          metrics={'rmse'}, verbose_eval=True)
        print(cvresult)
        num_rounds = len(cvresult)
    gbdt = None
    if use_sklearn:
        #gbdt = xgboost.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='reg:linear', nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=0, missing=None)
        xgb_params['n_estimators'] = num_rounds
        gbdt = xgboost.XGBRegressor(**xgb_params)
        gbdt.fit(x_train, y_train)
        y_pred = gbdt.predict(X_test)
        return gbdt, y_pred
    else:
        #gbdt = xgb.train(xgb_params, train_data, num_rounds, evallist, verbose_eval=True, early_stopping_rounds=5)
        gbdt = xgb.train(xgb_params, train_data, num_rounds, evallist, verbose_eval=True)
        ceate_feature_map_for_feature_importance(features)
        show_feature_importance(gbdt, feature_names=features)
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float("nan")))
        return XGBoostModel(gbdt), y_pred
# Source: Stock_Prediction_Model_XgBoost.py (project: StockRecommendSystem, author: doncat99)
def GBM(self, argsDict):
    max_depth = argsDict["max_depth"] + 10
    subsample = argsDict["subsample"] * 0.1 + 0.5
    #n_estimators = argsDict['n_estimators'] * 5 + 50
    learning_rate = argsDict["learning_rate"] * 0.02 + 0.12
    #gamma = argsDict["gamma"] * 0.1
    #min_child_weight = argsDict["min_child_weight"] + 1
    print("max_depth:" + str(max_depth), "learning_rate:" + str(learning_rate), "subsample:" + str(subsample))
    params = {
        "max_depth": max_depth,
        #"gamma": gamma,
        'subsample': subsample,
        'learning_rate': learning_rate,
        #'min_child_weight': min_child_weight,
        'objective': "multi:softmax",
        'num_class': 7,
        "eval_metric": 'merror',
        'silent': False,
        # 'gpu_id': 1,
        # 'max_bin': 16,
        # 'tree_method': "gpu_exact",
        # 'updater': 'grow_gpu',
        # 'n_gpus': -1,
        # 'predictor': "gpu_predictor",
    }
    num_round = 1
    model = xgb.train(params, self.train, num_round, self.watchlist, feval=Xg_iter_precision)
    cv_res = xgb.cv(params, self.train, num_round, nfold=5, feval=Xg_iter_precision)
    #print(cv_res.head())
    cv_rec = cv_res.tail(1)['test-precision_4_5_6-mean'].values
    predicted = model.predict(self.test)
    scoring = precision_score(self.test_y, predicted, average='micro', labels=[4, 5, 6])
    print('precision is ', scoring)
    print('cv_precision_4_5_6', cv_rec[0])
    return -cv_rec[0]
def tune_num_estimators(metric: str,
label: np.ndarray,
params: dict,
strat_folds: StratifiedKFold,
train) -> Tuple[int, float]:
"""
Uses xgboost's cross-validation method to tune the number of estimators and returns that along with the best CV score
achieved.
:param metric:
Evaluation metric that is monitored during cross-validation - e.g. 'logloss' or 'rmse'.
:param label:
An array-like containing the labels of the classification or regression problem.
:param params:
A dictionary of XGB parameters.
:param strat_folds:
A StratifiedKFold object to cross validate the parameters.
:param train:
An array-like containing the training input samples.
:return:
A tuple containing the tuned number of estimators along with the best CV score achieved.
"""
eval_hist = xgb.cv(
dtrain=xgb.DMatrix(train, label=label),
early_stopping_rounds=50,
folds=strat_folds,
metrics=metric,
num_boost_round=10000,
params=params,
verbose_eval=True
)
num_trees = eval_hist.shape[0]
best_score = eval_hist.values[num_trees - 1, 0]
return num_trees, best_score
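A usage sketch; X and y are placeholders for the feature matrix and labels, and the parameter values are illustrative:

from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
params = {'eta': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'nthread': 1}
n_trees, best = tune_num_estimators('logloss', y, params, folds, X)
params['n_estimators'] = n_trees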
def build(self):
train, y, test, _ = data_src.get()
xgb_params = dict(
max_depth = 5,
learning_rate = 0.005,
subsample = 0.7,
gamma = 5,
alpha = 0.01,
#colsample_bytree = 0.8,
objective = 'binary:logistic',
eval_metric = 'logloss',
seed = 1,
silent = 1
)
idx = (test.smoke > 0).values * (test.smoke < 1).values
print('values to restore:', np.sum(idx))
xtrain = pd.concat([train, test[~idx]])
ytrain = xtrain['smoke']
xtrain.drop('smoke', axis=1, inplace=True)
print(xtrain.shape, ytrain.shape, test[idx].shape)
dtrain = xgb.DMatrix(xtrain.values, ytrain.values)
dpred = xgb.DMatrix(test[idx].drop('smoke', axis=1).values)
cv = xgb.cv(params=xgb_params,
dtrain=dtrain,
num_boost_round=10000,
early_stopping_rounds=50,
nfold=10,
seed=1,
metrics='error',
stratified=True)
print('smoke num_boost_rounds =', len(cv))
bst = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=len(cv))
    test.loc[idx, 'smoke'] = bst.predict(dpred)
test['smoke'] = (test['smoke'] > 0.5) * 1
return train, y, test, None