def restore_missing(df, N_splits=10):
    """Impute missing 'smoke' values in *df* in place with an XGBoost classifier.

    Missing values are marked with -1. The 'active' and 'alco' flags are
    filled with constants (1 and 0 respectively); rows where 'smoke' == -1
    are predicted from the remaining columns by a cross-validated XGBoost
    binary classifier, then written back as 0/1.

    Args:
        df: pandas DataFrame containing 'active', 'alco' and 'smoke'
            columns, with -1 marking a missing value. Modified in place.
        N_splits: number of CV folds used when picking the boosting-round
            count (was previously accepted but ignored — bug fix).

    Returns:
        None; *df* is mutated in place.
    """
    xgb_params = dict(
        max_depth=5,
        learning_rate=0.005,
        gamma=1,
        alpha=0.01,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        # NOTE(review): 'silent' was removed in xgboost >= 1.0 (use
        # 'verbosity': 0 there) — confirm the installed xgboost version.
        silent=1,
    )

    # Fill the two auxiliary binary flags with fixed values.
    # Bug fix: '.ix' was removed in pandas 1.0 — use '.loc'.
    df.loc[df.active == -1, 'active'] = 1
    df.loc[df.alco == -1, 'alco'] = 0

    label = 'smoke'
    print('before', label, '{{{', df[label].value_counts(), '}}}')

    # Rows with a known label form the training set.
    xtrain = df[df[label] > -1].copy()
    ytrain = xtrain[label].astype('int32').values
    xtrain = xtrain.drop(label, axis=1)

    # Rows carrying the missing marker are the prediction set.
    xpred = df[df[label] == -1].copy()
    ypred = xpred[label] * 0  # zero-filled Series aligned to xpred's index
    xpred = xpred.drop(label, axis=1)

    dpred = xgb.DMatrix(xpred)
    dtrain = xgb.DMatrix(xtrain, label=ytrain)

    # Cross-validate to choose the number of boosting rounds.
    # Bug fix: nfold was hard-coded to 10, silently ignoring N_splits.
    cv = xgb.cv(params=xgb_params,
                dtrain=dtrain,
                num_boost_round=10000,
                early_stopping_rounds=100,
                nfold=N_splits,
                metrics='error',
                stratified=True)
    print(label, 'num_boost_rounds =', len(cv))

    # Retrain on the full training set with the CV-selected round count.
    bst = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=len(cv))
    ypred += bst.predict(dpred)

    # Threshold the predicted probabilities at 0.5 and write back as 0/1.
    df.loc[df[label] == -1, label] = (ypred > 0.5) * 1
    print('restored', label, '{{{', df[label].value_counts(), '}}}')
# (removed web-scrape artifacts: "评论列表" [comment list] / "文章目录" [article TOC] —
#  non-Python boilerplate that would raise a SyntaxError)