python类DMatrix()的实例源码

102_xgb_holdout_None_813_3.py 文件源码 项目:Instacart 作者: KazukiOnodera 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def split_build_valid():
    """Randomly split users into build/valid sets and wrap them as DMatrices.

    Relies on module globals: train_user, X_train, y_train, valid_size.
    Returns (dbuild, dvalid, watchlist, scale_pos_weight).
    """
    train_user['is_valid'] = np.random.choice([0, 1], size=len(train_user),
                                              p=[1 - valid_size, valid_size])
    n_valid = train_user['is_valid'].sum()
    n_build = train_user.shape[0] - n_valid
    print('build user:{}, valid user:{}'.format(n_build, n_valid))

    valid_users = train_user[train_user['is_valid'] == 1].user_id
    in_valid = X_train.user_id.isin(valid_users)

    # user_id only drives the split; drop it from the feature matrices.
    dbuild = xgb.DMatrix(X_train[~in_valid].drop('user_id', axis=1),
                         label=y_train[~in_valid])
    dvalid = xgb.DMatrix(X_train[in_valid].drop('user_id', axis=1),
                         label=y_train[in_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]

    # Class-imbalance ratio (#negatives / #positives) on the build set.
    build_labels = dbuild.get_label()
    scale_pos_weight = float(np.sum(build_labels == 0)) / np.sum(build_labels == 1)
    print('scale_pos_weight', scale_pos_weight)

    print('FINAL SHAPE')
    print('dbuild.shape:{}  dvalid.shape:{}\n'.format(
        (dbuild.num_row(), dbuild.num_col()),
        (dvalid.num_row(), dvalid.num_col())))

    return dbuild, dvalid, watchlist, scale_pos_weight
901_reorder_base.py 文件源码 项目:Instacart 作者: KazukiOnodera 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def split_build_valid():
    """Split train users at random into build/valid and return DMatrices.

    Relies on module globals train_user, X_train, y_train and valid_size.
    Returns (dbuild, dvalid, watchlist).
    """
    train_user['is_valid'] = np.random.choice([0, 1], size=len(train_user),
                                              p=[1 - valid_size, valid_size])
    valid_n = train_user['is_valid'].sum()
    build_n = train_user.shape[0] - valid_n
    print('build user:{}, valid user:{}'.format(build_n, valid_n))

    valid_user = train_user[train_user['is_valid'] == 1].user_id
    is_valid = X_train.user_id.isin(valid_user)

    def _to_dmatrix(mask):
        # The id column is only needed for the user-level split.
        return xgb.DMatrix(X_train[mask].drop('user_id', axis=1),
                           label=y_train[mask])

    dbuild = _to_dmatrix(~is_valid)
    dvalid = _to_dmatrix(is_valid)
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]

    print('FINAL SHAPE')
    print('dbuild.shape:{}  dvalid.shape:{}\n'.format(
        (dbuild.num_row(), dbuild.num_col()),
        (dvalid.num_row(), dvalid.num_col())))

    return dbuild, dvalid, watchlist

#==============================================================================
902_reorder.py 文件源码 项目:Instacart 作者: KazukiOnodera 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def split_build_valid():
    """Draw a random user-level build/valid split and return xgboost matrices.

    Uses module globals: train_user, X_train, y_train, valid_size.
    Returns (dbuild, dvalid, watchlist).
    """
    is_valid_flag = np.random.choice([0, 1], size=len(train_user),
                                     p=[1 - valid_size, valid_size])
    train_user['is_valid'] = is_valid_flag
    valid_n = train_user['is_valid'].sum()
    build_n = train_user.shape[0] - valid_n
    print('build user:{}, valid user:{}'.format(build_n, valid_n))

    valid_user = train_user[train_user['is_valid'] == 1].user_id
    row_in_valid = X_train.user_id.isin(valid_user)

    features_build = X_train[~row_in_valid].drop('user_id', axis=1)
    features_valid = X_train[row_in_valid].drop('user_id', axis=1)
    dbuild = xgb.DMatrix(features_build, label=y_train[~row_in_valid])
    dvalid = xgb.DMatrix(features_valid, label=y_train[row_in_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]

    print('FINAL SHAPE')
    print('dbuild.shape:{}  dvalid.shape:{}\n'.format(
        (dbuild.num_row(), dbuild.num_col()),
        (dvalid.num_row(), dvalid.num_col())))

    return dbuild, dvalid, watchlist

#==============================================================================
wrap_xgb.py 文件源码 项目:gestalt 作者: mpearmain 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def fit(self, X, y, x_val=None, y_val=None):
    """Train the wrapped xgboost booster on (X, y).

    Args:
        X, y: training features and labels.
        x_val, y_val: optional validation data; when given, training is
            watched on both sets and early stopping is enabled.

    Side effect: stores the trained booster in self.clf.
    """
    dtrain = xgb.DMatrix(X, label=y)
    if x_val is not None:
        dtest = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dtest, 'validation')]
        self.clf = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds,
                             evals=watchlist,
                             verbose_eval=self.verbose)
    else:
        # Consistency fix: honor self.verbose here too (the sibling
        # wrapper in this project passes verbose_eval in both branches).
        # NOTE(review): early_stopping_rounds has no effect without an
        # evals watchlist — confirm self.early_stopping_rounds is None
        # for callers that hit this branch.
        self.clf = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds,
                             verbose_eval=self.verbose)
    return
wrap_xgb.py 文件源码 项目:gestalt 作者: mpearmain 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def fit(self, X, y, x_val=None, y_val=None):
    """Fit the wrapped xgboost model; early-stops on (x_val, y_val) when given.

    Stores the trained booster in self.xgb.
    """
    dtrain = xgb.DMatrix(X, label=y)
    # Shared keyword arguments for both branches.
    train_kwargs = dict(params=self.params,
                        dtrain=dtrain,
                        num_boost_round=self.num_round,
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose_eval=self.verbose)
    if x_val is not None:
        dtest = xgb.DMatrix(x_val, label=y_val)
        train_kwargs['evals'] = [(dtrain, 'train'), (dtest, 'validation')]
    self.xgb = xgb.train(**train_kwargs)
    return
xgb_train.py 文件源码 项目:jdata 作者: learn2Pro 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an XGBClassifier on dtrain[predictors] vs dtrain['label'].

    When useTrainCV is True, first tunes n_estimators with xgb.cv (AUC,
    early stopping), then refits, reports train accuracy/AUC and plots
    feature importances.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['label'].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds, show_progress=False)
        # Use the CV-selected number of boosting rounds.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['label'], eval_metric='auc')

    # Predict on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Model report.
    # BUG FIX: metrics were computed against a non-existent 'Disbursed'
    # column (copy-paste from a tutorial); the label column used for
    # training above is 'label'. Also use print() calls (py2/py3 safe).
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['label'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['label'], dtrain_predprob))

    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
model.py 文件源码 项目:zhihu-machine-learning-challenge-2017 作者: HouJP 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def fit(self,
        train_fs, train_labels,
        valid_fs, valid_labels):
    """Train a ranking booster; rows form fixed-size groups of rank_k.

    Args:
        train_fs, train_labels: training features / labels.
        valid_fs, valid_labels: validation features / labels.

    Returns:
        Validation predictions at the best boosting iteration.
    """
    rank_k = self.config.getint('RANK', 'rank_k')

    train_DMatrix = xgb.DMatrix(train_fs, label=train_labels)
    # BUG FIX: use floor division so the group count stays an int under
    # Python 3 ('/' would yield a float and break list multiplication).
    train_DMatrix.set_group([rank_k] * (len(train_labels) // rank_k))
    valid_DMatrix = xgb.DMatrix(valid_fs, label=valid_labels)
    valid_DMatrix.set_group([rank_k] * (len(valid_labels) // rank_k))

    watchlist = [(train_DMatrix, 'train'), (valid_DMatrix, 'valid')]
    # self.__lock()
    self.model = xgb.train(self.params,
                           train_DMatrix,
                           self.params['num_round'],
                           watchlist,
                           early_stopping_rounds=self.params['early_stop'],
                           verbose_eval=self.params['verbose_eval'])
    LogUtil.log('INFO', 'best_ntree_limit=%d' % self.model.best_ntree_limit)
    # self.__unlock()
    valid_preds = self.model.predict(valid_DMatrix, ntree_limit=self.model.best_ntree_limit)
    return valid_preds
est_utils.py 文件源码 项目:gcforest 作者: w821881341 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def xgb_train(train_config, X_train, y_train, X_test, y_test):
    """Train an xgboost multi-class model per train_config and log test accuracy."""
    import xgboost as xgb
    LOGGER.info("X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    rounds = int(train_config["num_round"])
    watchlist = [(dtrain, 'train'), (dtest, 'test')]
    try:
        booster = xgb.train(train_config["param"], dtrain, rounds, watchlist)
    except KeyboardInterrupt:
        LOGGER.info("Canceld by user's Ctrl-C action")
        return
    # Class = argmax over the per-class probability rows.
    y_pred = np.argmax(booster.predict(dtest), axis=1)
    acc = 100. * np.sum(y_pred == y_test) / len(y_test)
    LOGGER.info("accuracy={}%".format(acc))
train.py 文件源码 项目:JData 作者: Xls1994 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def xgboost_make_submission(retrain = False):
    """Predict purchase probabilities for the submission window and write a CSV.

    Loads a cached booster from ./cache/bstmodel.bin unless retrain is True,
    keeps (user_id, sku_id) rows with probability >= 0.03 (first sku per
    user) and writes ./sub/submission_<timestamp>.csv.
    """
    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'
    if os.path.exists('./cache/bstmodel.bin') and not retrain:
        # BUG FIX: the xgboost parameter is 'nthread' — the misspelled
        # 'ntheard' key was silently ignored.
        bst = xgb.Booster({'nthread': 4})
        bst.load_model('./cache/bstmodel.bin')
    else:
        bst = xgboost_train()
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date, )
    dsub = xgb.DMatrix(sub_trainning_data.values)
    sub_user_index['label'] = bst.predict(dsub)
    # Keep likely purchases only; one sku per user.
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    dt = datetime.datetime.now()
    sdt = str(dt.date()) + str(dt.hour) + str(dt.minute) + str(dt.second)
    pred.to_csv('./sub/submission_%s.csv' % sdt, index=False, index_label=False)
    # P = get_sku_ids_in_P()
train.py 文件源码 项目:JData 作者: Xls1994 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def xgboost_test_offline():
    """Offline evaluation: predict on 2016-04-11..16 and score against labels."""
    bst = xgboost_train(True)
    candidate_skus = get_sku_ids_in_P()
    labels = get_labels('2016-04-11','2016-04-16')
    sub_user_index, sub_trainning_data = make_test_set('2016-04-11', '2016-04-16', )
    sub_user_index['label'] = bst.predict(xgb.DMatrix(sub_trainning_data.values))
    # Keep predictions above the decision threshold, one sku per user.
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']].groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    # Ground truth: positive labels restricted to the candidate sku set.
    labels = labels[labels['label'] == 1]
    labels['user_id'] = labels['user_id'].astype(int)
    labels = labels[['user_id', 'sku_id']]
    labels = labels[labels['sku_id'].isin(candidate_skus)]
    eval.eval(pred, labels)
main.py 文件源码 项目:Sberbank 作者: dimaquick 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def Features(my, prodShift):
    """Build train/valid DMatrices of log-transformed answers per (user, mcc).

    Month 14+prodShift (validation users only) goes to the validation set;
    month 13+prodShift goes to training. (Python 2 code: uses xrange.)

    Returns (dtrain, dvalid, Yvalid, keys) where keys lists the [user, mcc]
    pair for each validation row.
    """
    Xtrain, Ytrain, Xvalid, Yvalid = [], [], [], []
    keys = []
    valid_month = 14 + prodShift
    for u in my.Users:
        for m in my.MccList:
            # Idiom fix: only months 13+prodShift and 14+prodShift carry
            # data, so iterate them directly instead of looping from 0 and
            # skipping everything below 13+prodShift.
            for month in xrange(13 + prodShift, 15 + prodShift):
                f = my.Features(u, m, month)
                # log1p of the observed answer smooths the heavy tail.
                ans = math.log(1.0 + my.Answers[u + '_' + m][month])
                if month == valid_month:
                    if u not in my.ValidUsers: continue
                    Xvalid.append(f)
                    Yvalid.append(ans)
                    keys.append([u, m])
                else:
                    Xtrain.append(f)
                    Ytrain.append(ans)
    Xtrain, Ytrain, Xvalid, Yvalid = map(np.asarray, [Xtrain, Ytrain, Xvalid, Yvalid])
    return xgboost.DMatrix(Xtrain, Ytrain), xgboost.DMatrix(Xvalid, Yvalid), Yvalid, keys
predict_2017_06_28_1.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    """Seed-bagged stratified-CV xgboost.

    Accumulates converted out-of-fold predictions into v[cname] and averaged
    test predictions into z[cname]; prints per-fold and overall log-loss.
    """
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for seed_offset in range(N_seeds):
        xgb_params['seed'] = base_seed + seed_offset
        for fold, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            evals = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, evals,
                            early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], fold + 1, skf.n_splits), score, now())
            scores.append(score)

    # Average over all bagged models.
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
predict_2017_07_01_4.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    """Seed-bagged stratified-CV xgboost with a reseeded splitter per bag.

    Out-of-fold predictions accumulate into v[cname], averaged test
    predictions into z[cname]; per-fold and overall log-loss are printed.
    """
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)
    for seed_offset in range(N_seeds):
        seed = base_seed + seed_offset
        xgb_params['seed'] = seed
        # A fresh splitter per seed so each bag sees different folds.
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True, random_state=seed)
        for fold, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            evals = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, evals,
                            early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], fold + 1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
predict_2017_06_27_3.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    """Run N_seeds bags of N_splits-fold stratified CV with xgboost.

    v[cname] collects converted out-of-fold predictions; z[cname] collects
    averaged converted test predictions. Log-loss is reported per fold and
    for the combined validation predictions.
    """
    v[cname], z[cname] = 0, 0
    fold_scores = []
    splitter = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (idx_fit, idx_val) in enumerate(splitter.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[idx_fit], y[idx_fit])
            dvalid = xgb.DMatrix(train2.ix[idx_val], y[idx_val])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[idx_val, cname] += pconvert(p)
            score = metrics.log_loss(y[idx_val], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, splitter.n_splits), score, now())
            fold_scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(fold_scores)
    print(cv, cv.mean(), cv.std())
XGBoostClassifier.py 文件源码 项目:ensemble_amazon 作者: kaz-Anova 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def build_matrix(self, X, opt_y=None, weighting=None):
    """Wrap X (plus optional labels/weights) in an xgb.DMatrix (missing=-999.0).

    Weights, when given, are rescaled so they sum to the number of rows.

    BUG FIX: the original compared arguments with '== None'; on a numpy
    array that is an elementwise comparison and raises on truth-testing.
    Use 'is None' / 'is not None' for identity checks.
    """
    kwargs = {'missing': -999.0}
    if opt_y is not None:
        kwargs['label'] = np.array(opt_y)
    if weighting is not None:
        # Rescale so sum(weights) == n_rows (vectorized; the original
        # element-by-element loop was O(n) Python-level work).
        w = np.asarray(weighting, dtype=float)
        kwargs['weight'] = w * (float(X.shape[0]) / np.sum(w))
    return xgb.DMatrix(csr_matrix(X), **kwargs)
XGBoostClassifier.py 文件源码 项目:ensemble_amazon 作者: kaz-Anova 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def predict(self, X):
    """Predict with the single fitted booster, or average the k CV boosters.

    BUG FIXES: identity comparison uses 'is not None'; the preds
    initializer iterated an int ('for k in X.shape[0]' raises TypeError —
    it needed range(...)); the averaged predictions are now returned (the
    original else-branch fell off the end and returned None).
    """
    if self.k_models is not None and len(self.k_models) < 2:
        X1 = self.build_matrix(X)
        return self.bst.predict(X1)
    else:
        # NOTE(review): this branch assumes self.k_models is a non-empty
        # list of boosters — confirm fit() always populates it.
        dtest = xgb.DMatrix(X)
        preds = [0.0] * X.shape[0]
        for gbdt in self.k_models:
            predsnew = gbdt.predict(dtest, ntree_limit=(gbdt.best_iteration + 1) * self.num_parallel_tree)
            for g in range(predsnew.shape[0]):
                preds[g] += predsnew[g]
        n_models = float(len(self.k_models))
        return [p / n_models for p in preds]
xgb_rank.py 文件源码 项目:kaggle-review 作者: daxiongshu 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def predict(self, Xt, Xg, load_model=None):
    """Predict on grouped (ranking) test data; lazily loads a saved booster."""
    print("load_model", load_model)
    dtest = xgb.DMatrix(Xt)
    dtest.set_group(Xg)
    # Restore a persisted model only when none is held yet.
    if self.bst is None and load_model:
        self.bst = xgb.Booster(self.params, model_file=load_model)
    return self.bst.predict(dtest)
xgb_model.py 文件源码 项目:kaggle-review 作者: daxiongshu 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def fit(self,X,y,Xt=None,yt=None,
        load_model=None,save_model=None,
        obj=None,feval=None,print_fscore=True,evalx=None):
        """Train an xgboost booster on (X, y), optionally validating on (Xt, yt).

        Training options (num_round, early_stopping_rounds, maximize,
        verbose_eval) are read from self.params with defaults.

        Args:
            X, y: training features / labels.
            Xt, yt: optional validation features / labels; enables the
                validation watchlist and early stopping.
            load_model: path of an existing model to continue training from.
            save_model: path to save the trained booster to.
            obj, feval: custom objective / evaluation callables for xgb.train.
            print_fscore: whether to print feature importances after training.
            evalx: metric callable evalx(y_true, y_pred); when given together
                with Xt, its validation score is computed and returned.

        Returns:
            The evalx validation score when both Xt and evalx are given,
            otherwise 0. Side effect: stores the booster in self.bst.
        """
        print(X.shape,y.shape)

        # Training options with fall-back defaults.
        num_round = self.params.get('num_round',100)
        early_stopping_rounds = self.params.get('early_stopping_rounds',None)
        maximize = self.params.get('maximize',False)
        dtrain = xgb.DMatrix(X, y)
        vb = self.params.get('verbose_eval',1)
        if Xt is not None:
            # With validation data: watch both sets and allow early stopping.
            dvalid = xgb.DMatrix(Xt, yt)
            watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
            bst = xgb.train(self.params, dtrain, num_round, evals = watchlist,
                early_stopping_rounds=early_stopping_rounds,verbose_eval=vb,
                xgb_model=load_model,obj=obj,feval=feval,maximize=maximize)
        else:
            # Without validation data: train for the full round budget.
            watchlist = [(dtrain, 'train')]
            bst = xgb.train(self.params, dtrain, num_round, evals = watchlist,
                verbose_eval=vb,xgb_model=load_model,obj=obj,feval=feval)
        self.bst = bst
        if save_model is not None:
            bst.save_model(save_model)            

        fscore = self.feature_importance()
        if print_fscore:
            print("Feature Importance:")
            for i in fscore:
                print(i) 
        if Xt is not None and evalx is not None:
            # Score the validation predictions with the supplied metric.
            yp = self.predict(Xt)
            score = evalx(yt,yp)
            print(score)
            return score
        return 0
xgbbasemodel.py 文件源码 项目:Supply-demand-forecasting 作者: LevinJ 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def run_croos_validation(self):
    """Run xgb.cv over the predefined folds and return the best CV score.

    (Method name typo kept for caller compatibility.)
    """
    features, labels, cv_folds = self.getFeaturesLabel()
    dtrain_cv = xgb.DMatrix(features, label=labels, feature_names=features.columns)
    self.set_xgb_parameters()

    # Cross-validate over the supplied fold definitions.
    cv_result = xgb.cv(self.xgb_params, dtrain_cv, folds=cv_folds, **self.xgb_learning_params)
    best_score = cv_result[self.best_score_colname_in_cv].max()
    return best_score


问题


面经


文章

微信
公众号

扫码关注公众号