def split_build_valid():
    # Relies on globals: train_user, X_train, y_train, valid_size
    # (and numpy/xgboost imported as np/xgb).
    train_user['is_valid'] = np.random.choice([0, 1], size=len(train_user),
                                              p=[1 - valid_size, valid_size])
    valid_n = train_user['is_valid'].sum()
    build_n = train_user.shape[0] - valid_n
    print('build user:{}, valid user:{}'.format(build_n, valid_n))
    valid_user = train_user[train_user['is_valid'] == 1].user_id
    is_valid = X_train.user_id.isin(valid_user)
    dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), label=y_train[~is_valid])
    dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]
    # Ratio of negatives to positives, used to rebalance the loss for rare positives.
    label = dbuild.get_label()
    scale_pos_weight = float(np.sum(label == 0)) / np.sum(label == 1)
    print('scale_pos_weight', scale_pos_weight)
    print('FINAL SHAPE')
    print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()),
                                                     (dvalid.num_row(), dvalid.num_col())))
    return dbuild, dvalid, watchlist, scale_pos_weight
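A minimal usage sketch (not from the original source; parameter values are hypothetical): the returned scale_pos_weight plugs straight into the training parameters to reweight the rare positive class.

dbuild, dvalid, watchlist, scale_pos_weight = split_build_valid()
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'scale_pos_weight': scale_pos_weight,  # neg/pos ratio computed above
    'eta': 0.1,                            # hypothetical learning rate
}
bst = xgb.train(params, dbuild, num_boost_round=1000,
                evals=watchlist, early_stopping_rounds=50)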
def split_build_valid():
    train_user['is_valid'] = np.random.choice([0, 1], size=len(train_user),
                                              p=[1 - valid_size, valid_size])
    valid_n = train_user['is_valid'].sum()
    build_n = train_user.shape[0] - valid_n
    print('build user:{}, valid user:{}'.format(build_n, valid_n))
    valid_user = train_user[train_user['is_valid'] == 1].user_id
    is_valid = X_train.user_id.isin(valid_user)
    dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), label=y_train[~is_valid])
    dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]
    print('FINAL SHAPE')
    print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()),
                                                     (dvalid.num_row(), dvalid.num_col())))
    return dbuild, dvalid, watchlist
#==============================================================================
def fit(self, X, y, x_val=None, y_val=None):
    dtrain = xgb.DMatrix(X, label=y)
    if x_val is not None:
        dtest = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dtest, 'validation')]
        self.clf = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds,
                             evals=watchlist,
                             verbose_eval=self.verbose)
    else:
        # No validation set: xgb.train requires a non-empty evals list for
        # early stopping, so train for the full num_round instead.
        self.clf = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round)
    return
def fit(self, X, y, x_val=None, y_val=None):
    dtrain = xgb.DMatrix(X, label=y)
    if x_val is not None:
        dtest = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dtest, 'validation')]
        self.xgb = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds,
                             evals=watchlist,
                             verbose_eval=self.verbose)
    else:
        # Early stopping needs an evals set; without one, run all rounds.
        self.xgb = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round,
                             verbose_eval=self.verbose)
    return
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        # Use xgb.cv to pick the boosting-round count before the final fit.
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['label'].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['label'], eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['label'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['label'], dtrain_predprob))

    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
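A hypothetical invocation of modelfit (file name, columns, and hyperparameters are placeholders; the imports match what the function body uses):

import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn import metrics
from xgboost import XGBClassifier

df = pd.read_csv('train.csv')  # placeholder file with a 'label' column
predictors = [c for c in df.columns if c != 'label']
clf = XGBClassifier(n_estimators=1000, max_depth=5, learning_rate=0.1,
                    objective='binary:logistic')
modelfit(clf, df, predictors)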
def fit(self,
        train_fs, train_labels,
        valid_fs, valid_labels):
    rank_k = self.config.getint('RANK', 'rank_k')
    train_DMatrix = xgb.DMatrix(train_fs, label=train_labels)
    # Every query group has exactly rank_k documents; use integer division.
    train_DMatrix.set_group([rank_k] * (len(train_labels) // rank_k))
    valid_DMatrix = xgb.DMatrix(valid_fs, label=valid_labels)
    valid_DMatrix.set_group([rank_k] * (len(valid_labels) // rank_k))
    watchlist = [(train_DMatrix, 'train'), (valid_DMatrix, 'valid')]
    # self.__lock()
    self.model = xgb.train(self.params,
                           train_DMatrix,
                           self.params['num_round'],
                           watchlist,
                           early_stopping_rounds=self.params['early_stop'],
                           verbose_eval=self.params['verbose_eval'])
    LogUtil.log('INFO', 'best_ntree_limit=%d' % self.model.best_ntree_limit)
    # self.__unlock()
    valid_preds = self.model.predict(valid_DMatrix, ntree_limit=self.model.best_ntree_limit)
    return valid_preds
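set_group above assumes every query contributes exactly rank_k rows; a sketch for variable-size groups derived from a per-row query-id array (helper name and qids layout are assumptions, not from the original source):

from itertools import groupby

def set_groups_from_qids(dmatrix, qids):
    # qids: one query id per row, with rows of the same query stored contiguously.
    # groupby preserves appearance order, unlike np.unique.
    group_sizes = [len(list(g)) for _, g in groupby(qids)]
    dmatrix.set_group(group_sizes)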
def xgb_train(train_config, X_train, y_train, X_test, y_test):
    import xgboost as xgb
    LOGGER.info("X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    param = train_config["param"]
    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_test = xgb.DMatrix(X_test, label=y_test)
    num_round = int(train_config["num_round"])
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    try:
        bst = xgb.train(param, xg_train, num_round, watchlist)
    except KeyboardInterrupt:
        LOGGER.info("Canceled by user's Ctrl-C action")
        return
    # argmax over per-class probabilities, so param should use a softprob objective.
    y_pred = np.argmax(bst.predict(xg_test), axis=1)
    acc = 100. * np.sum(y_pred == y_test) / len(y_test)
    LOGGER.info("accuracy={}%".format(acc))
def xgboost_make_submission(retrain=False):
    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'
    if os.path.exists('./cache/bstmodel.bin') and not retrain:
        bst = xgb.Booster({'nthread': 4})
        bst.load_model('./cache/bstmodel.bin')
    else:
        bst = xgboost_train()
    sub_user_index, sub_training_data = make_test_set(sub_start_date, sub_end_date)
    sub_training_data = xgb.DMatrix(sub_training_data.values)
    y = bst.predict(sub_training_data)
    sub_user_index['label'] = y
    # Keep (user, sku) pairs scoring above the decision threshold, one sku per user.
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    dt = datetime.datetime.now()
    sdt = str(dt.date()) + str(dt.hour) + str(dt.minute) + str(dt.second)
    pred.to_csv('./sub/submission_%s.csv' % sdt, index=False, index_label=False)
# P = get_sku_ids_in_P()
def xgboost_test_offline():
    bst = xgboost_train(True)
    P = get_sku_ids_in_P()
    labels = get_labels('2016-04-11', '2016-04-16')
    sub_user_index, sub_training_data = make_test_set('2016-04-11', '2016-04-16')
    sub_training_data = xgb.DMatrix(sub_training_data.values)
    y = bst.predict(sub_training_data)
    sub_user_index['label'] = y
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    # pred = sub_user_index
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    # pred = pred[pred['sku_id'].isin(P)]
    labels = labels[labels['label'] == 1]
    labels['user_id'] = labels['user_id'].astype(int)
    labels = labels[['user_id', 'sku_id']]
    labels = labels[labels['sku_id'].isin(P)]
    eval.eval(pred, labels)
def Features(my, prodShift):
    Xtrain, Ytrain, Xvalid, Yvalid = [], [], [], []
    keys = []
    for u in my.Users:
        for m in my.MccList:
            for month in range(15 + prodShift):
                if month < 13 + prodShift:
                    continue
                f = my.Features(u, m, month)
                # Log-transform the target to dampen heavy-tailed amounts.
                ans = math.log(1.0 + my.Answers[u + '_' + m][month])
                if month == 14 + prodShift:
                    if u not in my.ValidUsers:
                        continue
                    Xvalid.append(f)
                    Yvalid.append(ans)
                    keys.append([u, m])
                else:
                    Xtrain.append(f)
                    Ytrain.append(ans)
    Xtrain, Ytrain, Xvalid, Yvalid = map(np.asarray, [Xtrain, Ytrain, Xvalid, Yvalid])
    return xgboost.DMatrix(Xtrain, Ytrain), xgboost.DMatrix(Xvalid, Yvalid), Yvalid, keys
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits),
                  score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        # Re-seed the fold split each pass so every seed sees different folds.
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True,
                                              random_state=s + base_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits),
                  score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def build_matrix(self, X, opt_y=None, weighting=None):
    if opt_y is None:
        if weighting is None:
            return xgb.DMatrix(csr_matrix(X), missing=-999.0)
        else:
            # Rescale weights so they sum to the number of rows.
            sumtotal = float(X.shape[0])
            sumweights = np.sum(weighting)
            for s in range(0, len(weighting)):
                weighting[s] *= sumtotal / sumweights
            return xgb.DMatrix(csr_matrix(X), missing=-999.0, weight=weighting)
    else:
        if weighting is None:
            return xgb.DMatrix(csr_matrix(X), label=np.array(opt_y), missing=-999.0)
        else:
            sumtotal = float(X.shape[0])
            sumweights = np.sum(weighting)
            for s in range(0, len(weighting)):
                weighting[s] *= sumtotal / sumweights
            return xgb.DMatrix(csr_matrix(X), label=np.array(opt_y), missing=-999.0, weight=weighting)
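The per-element loops above rescale weights so they sum to the row count, keeping the effective sample size constant; a vectorized equivalent, assuming weighting is array-like (a sketch, not from the original source):

import numpy as np

def normalize_weights(weighting, n_rows):
    # Scale weights to sum to n_rows while preserving their relative ratios.
    w = np.asarray(weighting, dtype=float)
    return w * (n_rows / w.sum())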
def predict(self, X):
    if self.k_models is not None and len(self.k_models) < 2:
        X1 = self.build_matrix(X)
        return self.bst.predict(X1)
    else:
        # Average predictions over the k cross-validation models.
        dtest = xgb.DMatrix(X)
        preds = [0.0 for _ in range(X.shape[0])]
        for gbdt in self.k_models:
            predsnew = gbdt.predict(dtest, ntree_limit=(gbdt.best_iteration + 1) * self.num_parallel_tree)
            for g in range(0, predsnew.shape[0]):
                preds[g] += predsnew[g]
        for g in range(0, len(preds)):
            preds[g] /= float(len(self.k_models))
        return preds
def predict(self, Xt, Xg, load_model=None):
    print("load_model", load_model)
    dtest = xgb.DMatrix(Xt)
    dtest.set_group(Xg)
    if load_model and self.bst is None:
        self.bst = xgb.Booster(self.params, model_file=load_model)
    return self.bst.predict(dtest)
def fit(self, X, y, Xt=None, yt=None,
        load_model=None, save_model=None,
        obj=None, feval=None, print_fscore=True, evalx=None):
    print(X.shape, y.shape)
    num_round = self.params.get('num_round', 100)
    early_stopping_rounds = self.params.get('early_stopping_rounds', None)
    maximize = self.params.get('maximize', False)
    dtrain = xgb.DMatrix(X, y)
    vb = self.params.get('verbose_eval', 1)
    if Xt is not None:
        dvalid = xgb.DMatrix(Xt, yt)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        bst = xgb.train(self.params, dtrain, num_round, evals=watchlist,
                        early_stopping_rounds=early_stopping_rounds, verbose_eval=vb,
                        xgb_model=load_model, obj=obj, feval=feval, maximize=maximize)
    else:
        watchlist = [(dtrain, 'train')]
        bst = xgb.train(self.params, dtrain, num_round, evals=watchlist,
                        verbose_eval=vb, xgb_model=load_model, obj=obj, feval=feval)
    self.bst = bst
    if save_model is not None:
        bst.save_model(save_model)
    fscore = self.feature_importance()
    if print_fscore:
        print("Feature Importance:")
        for i in fscore:
            print(i)
    if Xt is not None and evalx is not None:
        yp = self.predict(Xt)
        score = evalx(yt, yp)
        print(score)
        return score
    return 0
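fit forwards obj and feval to xgb.train; a minimal custom metric following xgboost's feval convention of (preds, dtrain) -> (name, value) (the RMSLE choice is illustrative, not from the original source):

import numpy as np

def rmsle_feval(preds, dtrain):
    # Root mean squared log error; clip negatives so log1p stays defined.
    labels = dtrain.get_label()
    err = np.sqrt(np.mean((np.log1p(np.clip(preds, 0, None)) - np.log1p(labels)) ** 2))
    return 'rmsle', float(err)

# e.g. model.fit(X, y, Xt, yt, feval=rmsle_feval)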
def run_cross_validation(self):
    features, labels, cv_folds = self.getFeaturesLabel()
    dtrain_cv = xgb.DMatrix(features, label=labels, feature_names=features.columns)
    self.set_xgb_parameters()
    # specify validation sets to watch performance
    model = xgb.cv(self.xgb_params, dtrain_cv, folds=cv_folds, **self.xgb_learning_params)
    best_score = model[self.best_score_colname_in_cv].max()
    return best_score