def rf1(train2, y, test2, v, z):
    """RandomForest level-1 model (3 seeds x 5 ShuffleSplit folds).

    Accumulates pconvert-transformed out-of-fold predictions into v[cname]
    and seed/fold-averaged test predictions into z[cname]; prints per-fold
    log loss. v/z are DataFrames collecting stacked features by column.
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)  # pconvert: project transform — assumed monotone, confirm
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds  # test preds summed once per fold per seed
    v[cname] /= num_seeds  # assumes each val index seen ~once per seed (ShuffleSplit may overlap) — TODO confirm
# Example sources using python's ShuffleSplit() class
def et1(train2, y, test2, v, z):
    """ExtraTrees level-1 model (3 seeds x 5 ShuffleSplit folds).

    Accumulates pconvert-transformed out-of-fold predictions into v[cname]
    and seed/fold-averaged test predictions into z[cname]; prints per-fold
    log loss.
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                                random_state=seed,
                                                n_estimators=2500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds  # test preds summed once per fold per seed
    v[cname] /= num_seeds
def rf1(train2, y, test2, v, z):
    """RandomForest level-1 model (3 seeds x 5 ShuffleSplit folds).

    This variant stores RAW probabilities (no pconvert) in both v[cname]
    and z[cname], keeping the train/test meta-features on the same scale.
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += p
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += reg.predict_proba(test2)[:, 1]
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds  # test preds summed once per fold per seed
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    """ExtraTrees level-1 model (3 seeds x 5 ShuffleSplit folds).

    Accumulates pconvert-transformed out-of-fold predictions into v[cname]
    and seed/fold-averaged test predictions into z[cname].
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                                random_state=seed,
                                                n_estimators=2500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def rf1(train2, y, test2, v, z):
    """RandomForest level-1 model (3 seeds x 5 ShuffleSplit folds).

    Stores raw probabilities in both v[cname] and z[cname].

    FIX: the original accumulated raw `p` into v but np.log1p(p) into z,
    putting the train and test meta-features on different scales; z now
    uses the same raw probabilities as v.
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += p
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            # raw probabilities, matching the transform applied to v above
            z[cname] += reg.predict_proba(test2)[:, 1]
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234, batch_size = 128):
    """Shared cross-validated Keras training loop for the neural-net models.

    Trains build_model() on each ShuffleSplit fold with early-stopping-style
    checkpointing (build_keras_fit_callbacks), restores the checkpoint, and
    accumulates pconvert-transformed validation predictions into v[cname]
    and fold-averaged test predictions into z[cname].

    `batch_size` added as a backward-compatible keyword (default 128, the
    previously hard-coded value) for consistency with the other variants.
    """
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11, test_size=1/num_splits)
    scores = list()
    for n, (itrain, ival) in enumerate(ss.split(train3, y)):
        xtrain, xval = train3[itrain], train3[ival]
        ytrain, yval = y[itrain], y[ival]
        model = build_model()
        model.fit(
            xtrain, ytrain,
            batch_size=batch_size,
            epochs=10000,  # effectively unbounded; callbacks stop/checkpoint training
            validation_data=(xval, yval),
            verbose=0,
            callbacks=build_keras_fit_callbacks(model_path),
            shuffle=True
        )
        model.load_weights(model_path)  # restore best checkpointed weights
        p = model.predict(xval)
        v.loc[ival, cname] += pconvert(p).ravel()
        score = metrics.log_loss(y[ival], p)
        print(cname, 'fold %d: '%(n+1), score, now())
        scores.append(score)
        z[cname] += pconvert(model.predict(test3)).ravel()
        del model
        for i in range(3):  # free Keras/TF memory between folds
            gc.collect(i)
    os.remove(model_path)
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234, batch_size = 128):
    """Cross-validated Keras training loop shared by the neural-net models.

    For each ShuffleSplit fold: build a fresh model, fit with checkpointing
    callbacks, reload the checkpoint, then add pconvert-transformed
    validation predictions to v[cname] and test predictions to z[cname]
    (averaged over folds at the end). Prints per-fold log loss.
    """
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    splitter = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11,
                                            test_size=1/num_splits)
    scores = []
    for fold, (idx_tr, idx_va) in enumerate(splitter.split(train3, y)):
        x_tr, x_va = train3[idx_tr], train3[idx_va]
        y_tr, y_va = y[idx_tr], y[idx_va]
        model = build_model()
        model.fit(x_tr, y_tr,
                  batch_size=batch_size,
                  epochs=10000,
                  validation_data=(x_va, y_va),
                  verbose=0,
                  callbacks=build_keras_fit_callbacks(model_path),
                  shuffle=True)
        model.load_weights(model_path)  # restore best checkpointed weights
        p = model.predict(x_va)
        v.loc[idx_va, cname] += pconvert(p).ravel()
        fold_score = metrics.log_loss(y[idx_va], p)
        print(cname, 'fold %d: '%(fold+1), fold_score, now())
        scores.append(fold_score)
        z[cname] += pconvert(model.predict(test3)).ravel()
        del model
        for gen in range(3):
            gc.collect(gen)
    os.remove(model_path)
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits
def rf1(train2, y, test2, v, z):
    """RandomForest level-1 model (7 seeds x 17 ShuffleSplit folds).

    Accumulates pconvert-transformed out-of-fold predictions into v[cname]
    and seed/fold-averaged test predictions into z[cname].
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 7
    num_splits = 17
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    """ExtraTrees level-1 model (7 seeds x 17 ShuffleSplit folds).

    Accumulates pconvert-transformed out-of-fold predictions into v[cname]
    and seed/fold-averaged test predictions into z[cname].
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 7
    num_splits = 17
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=7,
                                                random_state=seed,
                                                n_estimators=1500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234, batch_size = 128):
    """Generic K-fold Keras trainer used by the neural-net level-1 models.

    Fits a fresh model per ShuffleSplit fold, reloads the checkpoint written
    by the fit callbacks, and accumulates pconvert-transformed predictions:
    validation rows into v[cname], test predictions (fold-averaged) into
    z[cname]. Per-fold log loss is printed for monitoring.
    """
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    cv_iter = model_selection.ShuffleSplit(n_splits=num_splits,
                                           random_state=11,
                                           test_size=1/num_splits)
    scores = []
    for step, (tr_idx, va_idx) in enumerate(cv_iter.split(train3, y)):
        net = build_model()
        net.fit(train3[tr_idx], y[tr_idx],
                batch_size=batch_size,
                epochs=10000,
                validation_data=(train3[va_idx], y[va_idx]),
                verbose=0,
                callbacks=build_keras_fit_callbacks(model_path),
                shuffle=True)
        net.load_weights(model_path)  # best weights saved during fit
        preds = net.predict(train3[va_idx])
        v.loc[va_idx, cname] += pconvert(preds).ravel()
        ll = metrics.log_loss(y[va_idx], preds)
        print(cname, 'fold %d: '%(step+1), ll, now())
        scores.append(ll)
        z[cname] += pconvert(net.predict(test3)).ravel()
        del net
        for generation in range(3):
            gc.collect(generation)
    os.remove(model_path)
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits
def et1(train2, y, test2, v, z):
    """ExtraTrees level-1 model (3 seeds x 5 ShuffleSplit folds).

    This variant stores RAW probabilities (no pconvert) in both v[cname]
    and z[cname], keeping train/test meta-features on the same scale.
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                                random_state=seed,
                                                n_estimators=2500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += p
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += reg.predict_proba(test2)[:, 1]
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def rf1(train2, y, test2, v, z):
    """RandomForest level-1 model (3 seeds x 7 ShuffleSplit folds).

    Accumulates pconvert-transformed out-of-fold predictions into v[cname]
    and seed/fold-averaged test predictions into z[cname].
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 7
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    """ExtraTrees level-1 model (3 seeds x 7 ShuffleSplit folds).

    Accumulates pconvert-transformed out-of-fold predictions into v[cname]
    and seed/fold-averaged test predictions into z[cname].
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 7
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=11,
                                                random_state=seed,
                                                n_estimators=1500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234, batch_size = 128):
    """Shared ShuffleSplit training driver for the Keras level-1 models.

    Each fold trains a fresh build_model() instance, restores the weights
    checkpointed by build_keras_fit_callbacks, then accumulates
    pconvert-transformed predictions: validation rows into v[cname] and
    test predictions into z[cname] (divided by num_splits at the end).
    """
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    folds = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11,
                                         test_size=1/num_splits)
    scores = []
    for k, (train_rows, val_rows) in enumerate(folds.split(train3, y)):
        x_fit, x_hold = train3[train_rows], train3[val_rows]
        y_fit, y_hold = y[train_rows], y[val_rows]
        model = build_model()
        model.fit(x_fit, y_fit,
                  batch_size=batch_size,
                  epochs=10000,
                  validation_data=(x_hold, y_hold),
                  verbose=0,
                  callbacks=build_keras_fit_callbacks(model_path),
                  shuffle=True)
        model.load_weights(model_path)  # roll back to the checkpointed best epoch
        p = model.predict(x_hold)
        v.loc[val_rows, cname] += pconvert(p).ravel()
        loss = metrics.log_loss(y[val_rows], p)
        print(cname, 'fold %d: '%(k+1), loss, now())
        scores.append(loss)
        z[cname] += pconvert(model.predict(test3)).ravel()
        del model
        for g in range(3):
            gc.collect(g)
    os.remove(model_path)
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits
def rf1(train2, y, test2, v, z):
    """RandomForest level-1 model (7 seeds x 17 ShuffleSplit folds).

    Accumulates pconvert-transformed out-of-fold predictions into v[cname]
    and seed/fold-averaged test predictions into z[cname].
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 7
    num_splits = 17
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234, batch_size = 128):
    """Shared cross-validated Keras training loop for the neural-net models.

    Trains build_model() per ShuffleSplit fold, restores the checkpoint
    written by build_keras_fit_callbacks, and accumulates
    pconvert-transformed validation predictions into v[cname] and
    fold-averaged test predictions into z[cname].

    `batch_size` added as a backward-compatible keyword (default 128, the
    previously hard-coded value) for consistency with the other variants.
    """
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11, test_size=1/num_splits)
    scores = list()
    for n, (itrain, ival) in enumerate(ss.split(train3, y)):
        xtrain, xval = train3[itrain], train3[ival]
        ytrain, yval = y[itrain], y[ival]
        model = build_model()
        model.fit(
            xtrain, ytrain,
            batch_size=batch_size,
            epochs=10000,  # effectively unbounded; callbacks stop/checkpoint training
            validation_data=(xval, yval),
            verbose=0,
            callbacks=build_keras_fit_callbacks(model_path),
            shuffle=True
        )
        model.load_weights(model_path)  # restore best checkpointed weights
        p = model.predict(xval)
        v.loc[ival, cname] += pconvert(p).ravel()
        score = metrics.log_loss(y[ival], p)
        print(cname, 'fold %d: '%(n+1), score, now())
        scores.append(score)
        z[cname] += pconvert(model.predict(test3)).ravel()
        del model
        for i in range(3):  # free Keras/TF memory between folds
            gc.collect(i)
    os.remove(model_path)
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits
def rf1(train2, y, test2, v, z):
    """RandomForest level-1 model (3 seeds x 5 ShuffleSplit folds).

    Accumulates pconvert-transformed out-of-fold predictions into v[cname]
    and seed/fold-averaged test predictions into z[cname].
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    """ExtraTrees level-1 model (3 seeds x 5 ShuffleSplit folds).

    Accumulates pconvert-transformed out-of-fold predictions into v[cname]
    and seed/fold-averaged test predictions into z[cname].
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                                random_state=seed,
                                                n_estimators=2500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def rf1(train2, y, test2, v, z):
    """RandomForest level-1 model (3 seeds x 7 ShuffleSplit folds).

    Accumulates pconvert-transformed out-of-fold predictions into v[cname]
    and seed/fold-averaged test predictions into z[cname].
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 7
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    """ExtraTrees level-1 model (3 seeds x 7 ShuffleSplit folds).

    Accumulates pconvert-transformed out-of-fold predictions into v[cname]
    and seed/fold-averaged test predictions into z[cname].
    """
    cname = sys._getframe().f_code.co_name  # stacking column named after this function
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 7
    base_seed = 13
    # NOTE: removed a dead unseeded ShuffleSplit assignment here; it was
    # always overwritten by the seeded splitter created in the loop below.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=11,
                                                random_state=seed,
                                                n_estimators=1500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds