def restore_missing(df, N_splits=10):
    """Impute missing 'smoke' values in *df* in place with an XGBoost classifier.

    Missing values are marked with -1. The 'active' and 'alco' flags are
    filled with constants (1 and 0 respectively); rows where 'smoke' == -1
    are predicted from the remaining columns by a cross-validated XGBoost
    binary classifier, then written back as 0/1.

    Args:
        df: pandas DataFrame containing 'active', 'alco' and 'smoke'
            columns, with -1 marking a missing value. Modified in place.
        N_splits: number of CV folds used when picking the boosting-round
            count (was previously accepted but ignored — bug fix).

    Returns:
        None; *df* is mutated in place.
    """
    xgb_params = dict(
        max_depth=5,
        learning_rate=0.005,
        gamma=1,
        alpha=0.01,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        # NOTE(review): 'silent' was removed in xgboost >= 1.0 (use
        # 'verbosity': 0 there) — confirm the installed xgboost version.
        silent=1,
    )

    # Fill the two auxiliary binary flags with fixed values.
    # Bug fix: '.ix' was removed in pandas 1.0 — use '.loc'.
    df.loc[df.active == -1, 'active'] = 1
    df.loc[df.alco == -1, 'alco'] = 0

    label = 'smoke'
    print('before', label, '{{{', df[label].value_counts(), '}}}')

    # Rows with a known label form the training set.
    xtrain = df[df[label] > -1].copy()
    ytrain = xtrain[label].astype('int32').values
    xtrain = xtrain.drop(label, axis=1)

    # Rows carrying the missing marker are the prediction set.
    xpred = df[df[label] == -1].copy()
    ypred = xpred[label] * 0  # zero-filled Series aligned to xpred's index
    xpred = xpred.drop(label, axis=1)

    dpred = xgb.DMatrix(xpred)
    dtrain = xgb.DMatrix(xtrain, label=ytrain)

    # Cross-validate to choose the number of boosting rounds.
    # Bug fix: nfold was hard-coded to 10, silently ignoring N_splits.
    cv = xgb.cv(params=xgb_params,
                dtrain=dtrain,
                num_boost_round=10000,
                early_stopping_rounds=100,
                nfold=N_splits,
                metrics='error',
                stratified=True)
    print(label, 'num_boost_rounds =', len(cv))

    # Retrain on the full training set with the CV-selected round count.
    bst = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=len(cv))
    ypred += bst.predict(dpred)

    # Threshold the predicted probabilities at 0.5 and write back as 0/1.
    df.loc[df[label] == -1, label] = (ypred > 0.5) * 1
    print('restored', label, '{{{', df[label].value_counts(), '}}}')
# (removed web-scrape artifacts: "评论列表" [comment list] / "文章目录" [article TOC] —
#  non-Python boilerplate that would raise a SyntaxError)