def build(self):
    """Restore uncertain ``smoke`` values in the test set with XGBoost.

    Rows of ``test`` whose ``smoke`` value lies strictly between 0 and 1
    are treated as values to restore. A binary classifier is fitted on
    ``train`` plus the remaining ``test`` rows, the number of boosting
    rounds is chosen by 10-fold stratified cross-validation with early
    stopping, and the predictions are written back into ``test``.
    Finally the whole ``smoke`` column of ``test`` is binarised at 0.5.

    Note that ``test`` (as returned by ``data_src.get()``) is modified
    in place.

    Returns:
        The tuple ``(train, y, test, None)``.
    """
    train, y, test, _ = data_src.get()

    # NOTE(review): `silent` is kept as in the original; newer xgboost
    # releases use `verbosity` instead -- confirm against the pinned version.
    xgb_params = dict(
        max_depth=5,
        learning_rate=0.005,
        subsample=0.7,
        gamma=5,
        alpha=0.01,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1,
    )

    # Boolean mask of test rows whose `smoke` is strictly inside (0, 1).
    smoke = test['smoke']
    idx = ((smoke > 0) & (smoke < 1)).values
    print('values to restore:', np.sum(idx))

    if idx.any():
        # Train on every row that is not flagged for restoration.
        known = pd.concat([train, test[~idx]])
        ytrain = known['smoke']
        xtrain = known.drop('smoke', axis=1)
        print(xtrain.shape, ytrain.shape, test[idx].shape)

        dtrain = xgb.DMatrix(xtrain.values, ytrain.values)
        dpred = xgb.DMatrix(test[idx].drop('smoke', axis=1).values)

        # NOTE(review): CV is scored with metrics='error' although the
        # params set eval_metric='logloss' -- confirm this is intended.
        cv = xgb.cv(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=10000,
            early_stopping_rounds=50,
            nfold=10,
            seed=1,
            metrics='error',
            stratified=True,
        )
        # The length of the CV result is used as the number of rounds
        # for the final model.
        num_rounds = len(cv)
        print('smoke num_boost_rounds =', num_rounds)

        bst = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=num_rounds,
        )
        # `.ix` was removed from pandas; `.loc` accepts the boolean mask
        # together with the column label.
        test.loc[idx, 'smoke'] = bst.predict(dpred)

    # Binarise the whole column (predicted probabilities included) to 0/1.
    test['smoke'] = (test['smoke'] > 0.5) * 1
    return train, y, test, None
评论列表
文章目录