def predict_proba(self, X):
    try:
        rows = X.shape[0]
    except AttributeError:
        rows = len(X)
    X1 = self.build_matrix(X)
    if self.k_models is not None and len(self.k_models) < 2:
        predictions = self.bst.predict(X1)
    else:
        dtest = xgb.DMatrix(X)
        predictions = None
        for gbdt in self.k_models:
            predsnew = gbdt.predict(dtest, ntree_limit=(gbdt.best_iteration + 1) * self.num_parallel_tree)
            if predictions is None:
                predictions = predsnew
            else:
                predictions += predsnew
        predictions /= float(len(self.k_models))
    predictions = np.array(predictions)
    if self.objective == 'multi:softprob':
        return predictions.reshape(rows, self.num_class)
    return np.vstack([1 - predictions, predictions]).T
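The last line expands a vector of positive-class scores into a two-column probability matrix. A minimal sketch on toy data (not part of the snippet above) showing just that step:

import numpy as np

p = np.array([0.1, 0.8, 0.4])     # positive-class scores from bst.predict
proba = np.vstack([1 - p, p]).T   # columns: [P(class 0), P(class 1)]
# proba -> [[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]]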
def fit(self, X, y, Xg, Xt=None, yt=None, Xgt=None, load_model=None, save_model=None):
    print(X.shape, y.shape)
    num_round = self.params['num_round']
    early_stopping_rounds = self.params['early_stopping_rounds']
    dtrain = xgb.DMatrix(X, y)
    dtrain.set_group(Xg)
    if Xt is not None:
        dvalid = xgb.DMatrix(Xt, yt)
        dvalid.set_group(Xgt)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        bst = xgb.train(self.params, dtrain, num_round, evals=watchlist,
                        early_stopping_rounds=early_stopping_rounds, verbose_eval=1,
                        xgb_model=load_model, maximize=True)
    else:
        watchlist = [(dtrain, 'train')]
        bst = xgb.train(self.params, dtrain, num_round, evals=watchlist,
                        verbose_eval=1, xgb_model=load_model)
    self.bst = bst
    if save_model is not None:
        bst.save_model(save_model)
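This fit method trains a ranking model: set_group tells the DMatrix how many consecutive rows belong to each query. A self-contained sketch with toy data (all names and values here are illustrative):

import numpy as np
import xgboost as xgb

X = np.random.rand(6, 3)
y = np.array([0, 1, 0, 2, 1, 0])  # graded relevance labels
dtrain = xgb.DMatrix(X, label=y)
dtrain.set_group([4, 2])          # group sizes must sum to the row count
bst = xgb.train({'objective': 'rank:pairwise', 'eta': 0.1}, dtrain, num_boost_round=5)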
# two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split,
                              y_train_split, y_test_split, title_name):
    # Split the training data into an extra set of test
    # x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
    dtest_split = xgb.DMatrix(x_test_split)
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
    gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
    y_predicted = gbdt.predict(dtest_split)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
    plt.xlabel('Actual y')
    plt.ylabel('Predicted y')
    plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
    plt.tight_layout()
def predict_with_gbm(X, y, model):
    """
    Args:
        X: feature matrix for the test set
        y: DataFrame with per-element response columns (nMut, length, N)
        model: dict holding the trained k-fold GBM boosters and their metadata
    Returns:
        averaged per-element predictions over the k folds
    """
    assert model['model_name'] == 'GBM', \
        'Wrong model name in model info: {}. Need GBM.'.format(model['model_name'])
    testData = xgb.DMatrix(data=X, label=y.nMut.values, feature_names=model['feature_names'])
    testData.set_base_margin(np.array(np.log(y.length + 1 / y.N) + np.log(y.N)))
    kfold = model['kfold']
    pred = np.zeros(y.shape[0])
    for k in range(1, kfold + 1):
        model['model'][k].set_param(model['params'])  # bypass a bug of dumping without max_delta_step
        pred += model['model'][k].predict(testData)
    pred = pred / kfold
    return pred
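set_base_margin supplies a per-row offset that is added to the raw (pre-link) prediction, so with a log link, log(exposure) acts as a classical offset term. A sketch on synthetic data, not tied to the model dict above:

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(0)
X = rng.rand(100, 3)
exposure = rng.randint(1, 10, size=100).astype(float)
y_counts = rng.poisson(exposure * 0.5)
dtrain = xgb.DMatrix(X, label=y_counts)
dtrain.set_base_margin(np.log(exposure))  # per-row offset on the log scale
bst = xgb.train({'objective': 'count:poisson'}, dtrain, num_boost_round=10)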
def run_grid_search(self):
    """
    This method is called by the derived class to start the grid-search process
    """
    features, labels, cv_folds = self.getFeaturesLabel()
    dtrain_cv = xgb.DMatrix(features, label=labels, feature_names=features.columns)
    parameter_iterable = self.__get_param_iterable(self.__get_param_grid())
    kwargs = self.get_learning_params()
    for param in parameter_iterable:
        logging.info("used parameters: {}".format(param))
        bst = xgb.cv(param, dtrain_cv, folds=cv_folds, **kwargs)
        self.__add_to_resultset(param, bst)
    self.__disp_result()
    return
def predict(self, X):
    '''
    transform ASLib scenario data
    Arguments
    ---------
    X: numpy.array
        instance feature matrix
    Returns
    -------
    numpy.array of binary predictions (0/1), thresholded at 0.5
    '''
    preds = np.array(self.model.predict(xgb.DMatrix(X)))
    preds[preds < 0.5] = 0
    preds[preds >= 0.5] = 1
    return preds
def crate_pre_train_model(x_, y_):
    # the shared random_state keeps the two splits aligned row-for-row
    (x_train, x_test) = train_test_split(x_, test_size=0.1, random_state=1)
    (y_train, y_test) = train_test_split(y_, test_size=0.1, random_state=1)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)
    evallist = [(dtrain, 'train'), (dtest, 'eval')]
    param = {'objective': 'reg:linear', 'max_depth': 3}
    param['nthread'] = 64
    #param['min_child_weight'] = 15
    #param['subsample'] = 1
    #param['num_class'] = 7
    plst = list(param.items())  # materialize for Python 3, where items() is a view
    num_round = 5000
    bst = xgb.train(plst, dtrain, num_round,
                    evallist, early_stopping_rounds=100,
                    #obj=logregobj,
                    feval=evalerror)
    return bst
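The call above assumes a custom metric named evalerror. One plausible definition, mirroring the signature xgb.train expects for feval (an assumption, not the project's original code):

import numpy as np

def evalerror(preds, dtrain):
    # feval receives raw predictions plus the evaluation DMatrix
    # and returns a (metric_name, value) pair
    labels = dtrain.get_label()
    return 'error', float(np.sum((preds > 0.0) != labels)) / len(labels)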
# %% main
def as_dmatrix(self):
    path = self.dmatrix_cache_path
    # xgb is not try/except friendly here
    if os.path.exists(path):
        dm = xgb.DMatrix(path, feature_names=self.feature_names,
                         feature_types=(self.feature_types if FTYPES else None))
    else:
        logging.info('Cache miss on dmatrix. Building and caching.')
        dm = self._as_dmatrix()
        dm.save_binary(path)
    # We add on weights (if any) after the fact, to avoid proliferation of big
    # serialized dmatrix files.
    if self.weight_mode != 'none':
        weights = self.get_weights()
        dm.set_weight(weights)
    return dm
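The caching works because save_binary writes a buffer that the DMatrix constructor can load straight from a path. A minimal round-trip sketch (the file name is illustrative):

import numpy as np
import xgboost as xgb

dm = xgb.DMatrix(np.random.rand(10, 4), label=np.random.randint(0, 2, 10))
dm.save_binary('train.buffer')            # serialize once
dm_cached = xgb.DMatrix('train.buffer')   # later runs load the binary cache
assert dm_cached.num_row() == dm.num_row()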
def predict_test_prob(bst):
    df_all = loadCSV('data/first_merge/test_join_v9.csv')
    df_sta_lgbm = loadCSV('data/stacking/prob_lgbm_test.csv')
    df_all = pd.merge(df_all, df_sta_lgbm, how='left', on='instanceID')
    del df_sta_lgbm
    instanceID = df_all.instanceID.values
    feature_all = df_all.drop(['label', 'clickTime', 'instanceID',
                               'residence', 'appCategory'], axis=1).values
    del df_all
    dtest = xgb.DMatrix(feature_all)
    prob = bst.predict(dtest)
    output = pd.DataFrame({'instanceID': instanceID, 'prob': prob})
    output.to_csv('result/submission2.csv', index=False)
def xgb_train(train_config, X_train, y_train, X_test, y_test):
    import xgboost as xgb
    LOGGER.info("X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    param = train_config["param"]
    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_test = xgb.DMatrix(X_test, label=y_test)
    num_round = int(train_config["num_round"])
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    try:
        bst = xgb.train(param, xg_train, num_round, watchlist)
    except KeyboardInterrupt:
        LOGGER.info("Canceled by user's Ctrl-C action")
        return
    y_pred = np.argmax(bst.predict(xg_test), axis=1)
    acc = 100. * np.sum(y_pred == y_test) / len(y_test)
    LOGGER.info("accuracy={}%".format(acc))
def data_pre_process(train_path, test_path, label, drop_list=None):
    train_dataset = pandas.read_csv(train_path)
    if drop_list:
        train_dataset = train_dataset.drop(drop_list, axis=1)
    y_train = train_dataset[label].astype(int)
    print(y_train.dtypes)
    X_train = train_dataset.drop(label, axis=1)
    test_dataset = pandas.read_csv(test_path)
    if drop_list:
        test_dataset = test_dataset.drop(drop_list, axis=1)
    y_test = test_dataset[label].astype(int)
    print(y_test.dtypes)
    X_test = test_dataset.drop(label, axis=1)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    return dtrain, dtest
def train_relatedness_classifier(trainX, trainY):
    xg_train = xgb.DMatrix(trainX, label=trainY)
    # setup parameters for xgboost
    param = {}
    # use binary logistic classification
    param['objective'] = 'binary:logistic'
    # learning rate
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['nthread'] = 20
    num_round = 1000
    relatedness_classifier = xgb.train(param, xg_train, num_round)
    return relatedness_classifier
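A hypothetical call with toy data (shapes and names are assumptions), showing that binary:logistic yields one probability in [0, 1] per row:

import numpy as np
import xgboost as xgb

trainX = np.random.rand(200, 10)
trainY = np.random.randint(0, 2, size=200)
clf = train_relatedness_classifier(trainX, trainY)
probs = clf.predict(xgb.DMatrix(trainX))  # one probability per row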
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
    # Split the training data into an extra set of test
    x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
    dtest_split = xgb.DMatrix(x_test_split)
    res = xgb.cv(xgb_params, dtrain_split, num_boost_round=1000, nfold=4, seed=seed, stratified=False,
                 early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
    best_nrounds = res.shape[0] - 1
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
    gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
    y_predicted = gbdt.predict(dtest_split)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
    plt.tight_layout()
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    if not HAS_XGBOOST:
        return
    if not HAS_SKLEARN:
        return
    scikit_data = load_boston()
    dtrain = xgboost.DMatrix(scikit_data.data, label=scikit_data.target,
                             feature_names=scikit_data.feature_names)
    xgb_model = xgboost.train({}, dtrain, 1)
    # Save the data and the model
    self.scikit_data = scikit_data
    self.xgb_model = xgb_model
    self.feature_names = self.scikit_data.feature_names
def regression_with_xgboost_no_cv(x_train, y_train, X_test, Y_test, features=None, xgb_params=None, num_rounds=10):
    train_data = xgb.DMatrix(x_train, label=y_train, missing=float('nan'))
    test_data = xgb.DMatrix(X_test, Y_test, missing=float('nan'))
    evallist = [(train_data, 'train'), (test_data, 'eval')]
    if xgb_params is None:
        xgb_params = get_default_xgboost_params()
        print("xgb_params not found")
    print("XGBoost, using param", xgb_params)
    gbdt = xgb.train(xgb_params, train_data, num_rounds, evallist, verbose_eval=True, early_stopping_rounds=5)
    isgbtree = xgb_params["booster"] == "gbtree"
    if isgbtree:
        ceate_feature_map_for_feature_importance(features)
        show_feature_importance(gbdt)
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float('nan')), ntree_limit=gbdt.best_ntree_limit)
    else:
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float('nan')))
    return XGBoostModel(gbdt), y_pred
# s12_run_xgboost_only_train_create.py (project: KAGGLE_AVITO_2016, author: ZFTurbo)
def run_train_with_model(train, features, model_path):
    start_time = time.time()
    gbm = xgb.Booster()
    gbm.load_model(model_path)
    print("Validating...")
    check = gbm.predict(xgb.DMatrix(train[features]))
    score = roc_auc_score(train['isDuplicate'].values, check)
    validation_df = pd.DataFrame({'itemID_1': train['itemID_1'].values, 'itemID_2': train['itemID_2'].values,
                                  'isDuplicate': train['isDuplicate'].values, 'probability': check})
    print('AUC score value: {:.6f}'.format(score))
    imp = get_importance(gbm, features)
    print('Importance array: ', imp)
    print('Prediction time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return validation_df, score
def run_test_with_model(train, test, features, model_path):
    start_time = time.time()
    gbm = xgb.Booster()
    gbm.load_model(model_path)
    print("Validating...")
    check = gbm.predict(xgb.DMatrix(train[features]))
    score = roc_auc_score(train['isDuplicate'].values, check)
    validation_df = pd.DataFrame({'isDuplicate': train['isDuplicate'].values, 'probability': check})
    # print(validation_df)
    print('AUC score value: {:.6f}'.format(score))
    # score1 = roc_auc_score(validation_df['isDuplicate'].values, validation_df['probability'])
    # print('AUC score check value: {:.6f}'.format(score1))
    imp = get_importance(gbm, features)
    print('Importance array: ', imp)
    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]))
    print('Prediction time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), validation_df, score
def ExtGBDTEnsemblePredict(sub_clf_num, predict_x):
    """
    Average the predictions of the sub-classifiers in the ensemble.
    :param sub_clf_num: number of sub-classifiers
    :param predict_x: features of the samples to predict
    :return: score: ndarray, averaged prediction scores
    """
    total_score = np.zeros(len(predict_x))  # accumulates the score of every sub-classifier
    predict_X = xgb.DMatrix(predict_x)  # build the DMatrix once, outside the loop
    for i in range(sub_clf_num):
        model_file = '../model/model' + str(i)
        bst = pickle.load(open(model_file, 'rb'))  # pickled models must be read in binary mode
        predict_y = bst.predict(predict_X)
        total_score += predict_y
    score = total_score / sub_clf_num
    return score
def ExtGBDT(train_x, train_y, test_x, test_y):
    """ Ext-GBDT """
    num_round = 100
    param = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eta': 0.03, 'max_depth': 3, 'eval_metric': 'auc',
             'silent': 1, 'min_child_weight': 0.1, 'subsample': 0.7, 'colsample_bytree': 0.8, 'nthread': 4,
             'max_delta_step': 0}
    train_X = xgb.DMatrix(train_x, train_y)
    test_X = xgb.DMatrix(test_x)
    bst = xgb.train(param, train_X, num_round)
    pred = bst.predict(test_X)
    predict_y = [0 if p < 0.5 else 1 for p in pred]
    auc = evaluate_auc(pred, test_y)
    evaluate(predict_y, test_y)
    return auc
def runXGB(train_X, train_y, seed_val=123):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.05
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 22
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 2
    param['subsample'] = 0.9
    param['colsample_bytree'] = 0.9
    param['seed'] = seed_val
    num_rounds = 115
    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    model = xgb.train(plst, xgtrain, num_rounds)
    return model
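A hypothetical usage with toy data (22 classes to match param['num_class']); depending on the xgboost version, multi:softprob output may come back flat and need reshaping to (n_rows, num_class), which is the same reshape seen in predict_proba above:

import numpy as np
import xgboost as xgb

train_X = np.random.rand(500, 8)
train_y = np.random.randint(0, 22, size=500)
model = runXGB(train_X, train_y)
probs = model.predict(xgb.DMatrix(train_X))
probs = np.asarray(probs).reshape(train_X.shape[0], 22)  # no-op if already 2-D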
def cross_validate(train):
    # separate training and validation set
    X_train, X_valid = split_train_validation(train)
    scores = []
    preds = []
    for i in range(len(X_train)):
        # convert X_train, X_valid etc. to xgboost matrices
        dtrain = xgb.DMatrix(X_train[i][['phone_brand', 'device_model', 'timestamp']],
                             label=X_train[i]['group'], missing=np.nan)
        dvalid = xgb.DMatrix(X_valid[i][['phone_brand', 'device_model', 'timestamp']],
                             label=X_valid[i]['group'], missing=np.nan)
        # predict with xgboost; num_boost_round and early_stopping_rounds are
        # arguments of xgb.train, not entries of the parameter dict, so they
        # must be passed to the call explicitly to take effect
        parameters = {'max_depth': 4, 'eta': 0.1, 'silent': 1, 'subsample': 0.8, 'colsample_bytree': 0.8,
                      'objective': 'multi:softprob', 'booster': 'gbtree',
                      'num_class': 12, 'eval_metric': 'mlogloss'}
        bst = xgb.train(parameters, dtrain, num_boost_round=1000,
                        evals=[(dvalid, 'valid')], early_stopping_rounds=50)
        pred = bst.predict(dvalid)
        scores.append(log_loss(X_valid[i]['group'].tolist(), pred))
        pred = pd.DataFrame(pred, index=X_valid[i].index, columns=target_encoder.classes_)
        preds.append(pred)
    return scores, preds
def test_basic(c, s, a, b):
    dtrain = xgb.DMatrix(df, label=labels)
    bst = xgb.train(param, dtrain)
    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)
    dbst = yield dxgb._train(c, param, ddf, dlabels)
    dbst = yield dxgb._train(c, param, ddf, dlabels)  # we can do this twice
    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)
    correct = (result > 0.5) == labels
    dcorrect = (dresult > 0.5) == labels
    assert dcorrect.sum() >= correct.sum()
    predictions = dxgb.predict(c, dbst, ddf)
    assert isinstance(predictions, dd.Series)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, pd.Series)
    assert ((predictions > 0.5) != labels).sum() < 2
def test_dmatrix_kwargs(c, s, a, b):
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb._train(c, param, dX, dy, {"missing": 0.0})
    # Distributed model matches local model with dmatrix kwargs
    dtrain = xgb.DMatrix(X, label=y, missing=0.0)
    bst = xgb.train(param, dtrain)
    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)
    assert np.abs(result - dresult).sum() < 0.02
    # Distributed model gives bad predictions without dmatrix kwargs
    dtrain_incompat = xgb.DMatrix(X, label=y)
    dresult_incompat = dbst.predict(dtrain_incompat)
    assert np.abs(result - dresult_incompat).sum() > 0.02
def test_numpy(c, s, a, b):
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb._train(c, param, dX, dy)
    dbst = yield dxgb._train(c, param, dX, dy)  # we can do this twice
    dtrain = xgb.DMatrix(X, label=y)
    bst = xgb.train(param, dtrain)
    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)
    correct = (result > 0.5) == y
    dcorrect = (dresult > 0.5) == y
    assert dcorrect.sum() >= correct.sum()
    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, np.ndarray)
    assert ((predictions > 0.5) != labels).sum() < 2
def test_synchronous_api(loop):  # noqa
    dtrain = xgb.DMatrix(df, label=labels)
    bst = xgb.train(param, dtrain)
    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            dbst = dxgb.train(c, param, ddf, dlabels)
            result = bst.predict(dtrain)
            dresult = dbst.predict(dtrain)
            correct = (result > 0.5) == labels
            dcorrect = (dresult > 0.5) == labels
            assert dcorrect.sum() >= correct.sum()
def split_build_valid():
    train_user['is_valid'] = np.random.choice([0, 1], size=len(train_user),
                                              p=[1-valid_size, valid_size])
    valid_n = train_user['is_valid'].sum()
    build_n = (train_user.shape[0] - valid_n)
    print('build user:{}, valid user:{}'.format(build_n, valid_n))
    valid_user = train_user[train_user['is_valid']==1].user_id
    is_valid = X_train.user_id.isin(valid_user)
    dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid])
    dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]
    print('FINAL SHAPE')
    print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()),
                                                     (dvalid.num_row(), dvalid.num_col())))
    return dbuild, dvalid, watchlist
#==============================================================================
def split_build_valid():
    train_user['is_valid'] = np.random.choice([0, 1], size=len(train_user),
                                              p=[1-valid_size, valid_size])
    valid_n = train_user['is_valid'].sum()
    build_n = (train_user.shape[0] - valid_n)
    print('build user:{}, valid user:{}'.format(build_n, valid_n))
    valid_user = train_user[train_user['is_valid']==1].user_id
    is_valid = X_train.user_id.isin(valid_user)
    dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid])
    dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]
    label = dbuild.get_label()
    scale_pos_weight = float(np.sum(label == 0)) / np.sum(label == 1)
    print('scale_pos_weight', scale_pos_weight)
    print('FINAL SHAPE')
    print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()),
                                                     (dvalid.num_row(), dvalid.num_col())))
    return dbuild, dvalid, watchlist, scale_pos_weight
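A self-contained sketch (toy data; all names are illustrative) of how the returned negative/positive ratio is typically fed back into training, since scale_pos_weight is a plain booster parameter:

import numpy as np
import xgboost as xgb

label = np.random.randint(0, 2, size=1000)
X = np.random.rand(1000, 5)
dbuild = xgb.DMatrix(X, label=label)
scale_pos_weight = float(np.sum(label == 0)) / np.sum(label == 1)
params = {'objective': 'binary:logistic', 'scale_pos_weight': scale_pos_weight}
bst = xgb.train(params, dbuild, num_boost_round=10)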