import gc
import os
import sys

import numpy as np
import xgboost as xgb
from sklearn import metrics, model_selection


def xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params):
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # skf.split yields positional indices, so use .iloc
            # (the deprecated .ix accessor has been removed from pandas)
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname2] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname2] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
        z[cname2] /= N_splits
    vloss = [metrics.log_loss(y, prestore(v[cname + str(i)])) for i in range(N_seeds)]
    print('validation loss: ', vloss, np.mean(vloss), np.std(vloss))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())

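# pconvert, prestore and now are project helpers that do not appear in this
# listing. A minimal usage sketch, assuming pconvert/prestore are a
# logit/expit pair (a common choice for averaging fold predictions in logit
# space) -- the real helpers may differ:
from datetime import datetime

import pandas as pd
from scipy.special import expit, logit

def pconvert(p):
    # to logit space before accumulating across folds/seeds
    return logit(p)

def prestore(p):
    # back to probability space for scoring
    return expit(p)

def now():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# v gathers out-of-fold predictions, z gathers test predictions; both need a
# default RangeIndex so that v.loc[ival, ...] lines up with the positional
# indices produced by StratifiedKFold:
# v = pd.DataFrame(index=range(len(train2)))
# z = pd.DataFrame(index=range(len(test2)))
# xgb_common(train2, y, test2, v, z, N_seeds=4, N_splits=9, cname='xgb',
#            xgb_params=dict(objective='binary:logistic', eval_metric='logloss'))
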
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed=1234, batch_size=128):
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11, test_size=1 / num_splits)
    scores = list()
    for n, (itrain, ival) in enumerate(ss.split(train3, y)):
        xtrain, xval = train3[itrain], train3[ival]
        ytrain, yval = y[itrain], y[ival]
        model = build_model()
        model.fit(
            xtrain, ytrain,
            batch_size=batch_size,
            epochs=10000,
            validation_data=(xval, yval),
            verbose=0,
            callbacks=build_keras_fit_callbacks(model_path),
            shuffle=True,
        )
        # reload the best checkpoint saved by the callbacks
        model.load_weights(model_path)
        p = model.predict(xval).ravel()  # flatten (n, 1) output for log_loss
        v.loc[ival, cname] += pconvert(p)
        score = metrics.log_loss(y[ival], p)
        print(cname, 'fold %d: ' % (n + 1), score, now())
        scores.append(score)
        z[cname] += pconvert(model.predict(test3)).ravel()
        del model
        for i in range(3):
            gc.collect(i)
    os.remove(model_path)
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits

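# build_model and build_keras_fit_callbacks come from the surrounding project
# and are not part of this listing. A plausible minimal sketch, under the
# assumption (implied by model.load_weights above) that the callbacks
# checkpoint the best weights to model_path:
from keras import callbacks
from keras.layers import Dense, Dropout
from keras.models import Sequential

def build_keras_fit_callbacks(model_path):
    return [
        callbacks.EarlyStopping(monitor='val_loss', patience=20),
        callbacks.ModelCheckpoint(model_path, monitor='val_loss',
                                  save_best_only=True, save_weights_only=True),
    ]

def build_model(input_dim=16):  # input_dim is a placeholder, not from the source
    model = Sequential([
        Dense(64, activation='relu', input_dim=input_dim),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model
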
def restore_missing(df, N_splits=10):
    xgb_params = dict(
        max_depth=5,
        learning_rate=0.005,
        gamma=1,
        alpha=0.01,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1,  # newer xgboost releases replace this with verbosity=0
    )
    # {'gamma': 0.0, 'seed': 1, 'eval_metric': 'logloss', 'objective': 'binary:logistic', 'subsample': 0.6, 'min_child_weight': 1, 'colsample_bytree': 0.9, 'silent': 1, 'n_estimators': 10000, 'reg_alpha': 0.05, 'learning_rate': 0.005, 'max_depth': 2}
    # constant imputation for 'active' and 'alco' (missing values are coded -1)
    df.loc[df.active == -1, 'active'] = 1
    df.loc[df.alco == -1, 'alco'] = 0
    # model-based imputation for 'smoke'
    label = 'smoke'
    print('before', label, '{{{', df[label].value_counts(), '}}}')
    xtrain = df[df[label] > -1].copy()
    ytrain = xtrain[label].astype('int32').values
    xtrain = xtrain.drop(label, axis=1)
    #print(label, ytrain.value_counts())
    xpred = df[df[label] == -1].copy()
    ypred = xpred[label] * 0
    xpred = xpred.drop(label, axis=1)
    dpred = xgb.DMatrix(xpred)
    dtrain = xgb.DMatrix(xtrain, label=ytrain)
    cv = xgb.cv(params=xgb_params,
                dtrain=dtrain,
                num_boost_round=10000,
                early_stopping_rounds=100,
                nfold=N_splits,  # was hard-coded to 10, ignoring the parameter
                metrics='error',
                stratified=True)
    print(label, 'num_boost_rounds =', len(cv))
    bst = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=len(cv))
    ypred += bst.predict(dpred)
    df.loc[df[label] == -1, label] = (ypred > 0.5) * 1
    print('restored', label, '{{{', df[label].value_counts(), '}}}')

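# Usage sketch (an assumption based on the sentinel convention above): the
# dataframe encodes unknown 'active'/'alco'/'smoke' values as -1 and is
# imputed in place.
# df = pd.read_csv('../data/train.csv')  # hypothetical path
# restore_missing(df)
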
def xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params):
    # variant of xgb_common that also averages the per-seed columns into a
    # single ensemble column named cname
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname2] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname2] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
        z[cname2] /= N_splits
        z[cname] += z[cname2] / N_seeds
        v[cname] += v[cname2] / N_seeds
    vloss = [metrics.log_loss(y, prestore(v[cname + str(i)])) for i in range(N_seeds)]
    print('validation loss: ', vloss, np.mean(vloss), np.std(vloss))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())

def xgb1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 9
    N_seeds = 4
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth=5,
        learning_rate=0.02,
        alpha=0.01,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1,
    )
    dtest = xgb.DMatrix(test2)  # hoisted out of the fold loop; it never changes
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())

def xgb2(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 9
    N_seeds = 4
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth=4,
        learning_rate=0.02,
        subsample=0.7,
        alpha=0.015,
        # colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1,
    )
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())

def xgb3(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 9
    N_seeds = 4
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth=4,
        learning_rate=0.02,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1,
    )
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())

def xgb2(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    N_splits = 9
    N_seeds = 4
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
    dtrain = xgb.DMatrix(train2, y)

    def step_xgb(params):
        cv = xgb.cv(params=params,
                    dtrain=dtrain,
                    num_boost_round=10000,
                    early_stopping_rounds=100,
                    nfold=10,
                    seed=params['seed'])
        # last row, first column: the final test-metric mean for this run
        score = cv.iloc[-1, 0]
        print(cname, score, len(cv), params)
        return dict(loss=score, status=STATUS_OK)

    space_xgb = dict(
        max_depth=hp.choice('max_depth', range(2, 8)),
        subsample=hp.quniform('subsample', 0.6, 1, 0.05),
        colsample_bytree=hp.quniform('colsample_bytree', 0.6, 1, 0.05),
        learning_rate=hp.quniform('learning_rate', 0.005, 0.03, 0.005),
        min_child_weight=hp.quniform('min_child_weight', 1, 6, 1),
        gamma=hp.quniform('gamma', 0.5, 10, 0.05),
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1,
    )
    trs = load_state(cname + '_trials')
    if trs is None:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:' % (len(tr.trials)), space_eval(space_xgb, tr.argmin))
    # run five more TPE evaluations, checkpointing the trials after each one
    for n in range(5):
        best = fmin(step_xgb, space_xgb, algo=tpe.suggest, max_evals=len(tr.trials) + 1, trials=tr)
        save_state(cname + '_trials', (tr, space_xgb))
    xgb_params = space_eval(space_xgb, best)
    print(xgb_params)
    xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params)

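# load_state and save_state persist the hyperopt Trials object between runs;
# their real implementation is not part of this listing. A minimal
# pickle-based sketch under that assumption:
import pickle

def save_state(name, obj, state_dir='../data/working/'):
    with open(os.path.join(state_dir, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f)

def load_state(name, state_dir='../data/working/'):
    path = os.path.join(state_dir, name + '.pkl')
    if not os.path.exists(path):
        return None
    with open(path, 'rb') as f:
        return pickle.load(f)
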