def validate(model, X, y, nb_epoch=25, batch_size=128,
             stop_early=True, folds=10, test_size=None, shuffle=True, verbose=True):
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')
    total_score = []
    if test_size is None:
        # Hold out 25% for a single fold, otherwise 1/folds per split.
        test_size = 0.25 if folds == 1 else 1. / folds
    kf = ShuffleSplit(n_splits=folds, test_size=test_size)
    for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
        shuffle_weights(model)  # re-initialise the weights between folds
        print("FOLD:", fold)
        print("-" * 40)
        model.reset_states()
        callbacks = [early_stopping] if stop_early else []
        hist = model.fit(X[train_index], y[train_index],
                         epochs=nb_epoch,  # Keras 2 name; pass nb_epoch= on Keras 1
                         batch_size=batch_size, shuffle=shuffle,
                         validation_data=(X[test_index], y[test_index]),
                         callbacks=callbacks, verbose=verbose)
        total_score.append(hist.history["val_acc"][-1])
    return np.mean(total_score)
Python ShuffleSplit() example source code
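# Before the project-specific snippets, a minimal self-contained sketch of the
# ShuffleSplit API they all build on (indices only; not tied to any project):
import numpy as np
from sklearn.model_selection import ShuffleSplit

X_demo = np.arange(20).reshape(10, 2)
ss_demo = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
for train_idx, test_idx in ss_demo.split(X_demo):
    # Each split is an independent random permutation; test sets may overlap
    # across splits, unlike KFold.
    print(train_idx, test_idx)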
def train_test_split(inpath, train, test, split, random_seed):
    """
    RuCor does not ship with a train/test split, so split the documents randomly.

    Args:
        inpath: path to the data folder
        train: path to the train folder
        test: path to the test folder
        split: float, fraction of documents held out as the test set
        random_seed: seed for the random splitter

    Returns:
        None
    """
    print('Start train-test splitting ...')
    z = os.listdir(inpath)
    doc_split = ShuffleSplit(n_splits=1, test_size=split, random_state=random_seed)
    for train_indices, test_indices in doc_split.split(z):
        train_set = [z[i] for i in sorted(train_indices)]
        test_set = [z[i] for i in sorted(test_indices)]
    for x in train_set:
        build_data.move(os.path.join(inpath, x), os.path.join(train, x))
    for x in test_set:
        build_data.move(os.path.join(inpath, x), os.path.join(test, x))
    print('End train-test splitting.')
def TestPerformance(self, df = None):
    # If no dataframe is provided, use the currently learned one
    if df is None:
        D = self.D
    else:
        D = self.S.transform(df.copy())
    # Get features from the data frame
    A = self._ExtractFeat(D)
    # Get the target values and their corresponding column names
    y, _ = self._ExtractTarg(D)
    # Score on the full data and on a single random train/test split
    ss = ShuffleSplit(n_splits = 1)
    trn, tst = next(ss.split(A))
    s1 = self.R.score(A, y)            # all samples
    s2 = self.R.score(A[tst], y[tst])  # held-out samples
    s3 = self.R.score(A[trn], y[trn])  # training samples
    print('C-V:\t' + str(s1) + '\nTst:\t' + str(s2) + '\nTrn:\t' + str(s3))
def Train(self, C, A, Y, SF):
    '''
    Train the classifier using the sample matrix A and target matrix Y
    '''
    C.fit(A, Y)
    YH = np.zeros(Y.shape, dtype = object)
    # Predict in chunks to avoid running out of memory
    for i in np.array_split(np.arange(A.shape[0]), 32):
        YH[i] = C.predict(A[i])
    s1 = SF(Y, YH)
    print('All:{:8.6f}'.format(s1))
'''
ss = ShuffleSplit(random_state = 1151) #Use a fixed state so training can be repeated later
trn, tst = next(ss.split(A, Y)) #Make train/test split
mi = [8] * 1 #Maximum number of iterations at each iter
YH = np.zeros((A.shape[0]), dtype = np.object)
for mic in mi: #Chunk size to split dataset for CV results
#C.SetMaxIter(mic) #Set the maximum number of iterations to run
#C.fit(A[trn], Y[trn]) #Perform training iterations
'''
def test_safe_split_with_precomputed_kernel():
clf = SVC()
clfp = SVC(kernel="precomputed")
X, y = iris.data, iris.target
K = np.dot(X, X.T)
cv = ShuffleSplit(test_size=0.25, random_state=0)
tr, te = list(cv.split(X))[0]
X_tr, y_tr = _safe_split(clf, X, y, tr)
K_tr, y_tr2 = _safe_split(clfp, K, y, tr)
assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T))
X_te, y_te = _safe_split(clf, X, y, te, tr)
K_te, y_te2 = _safe_split(clfp, K, y, te, tr)
assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
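# The slicing _safe_split performs on a precomputed Gram matrix reduces to
# plain 2-D fancy indexing; a hand-rolled equivalent for illustration:
import numpy as np

def slice_kernel(K, rows, cols):
    # K[i, j] = k(x_i, x_j); the train block is K[tr, tr], the test block K[te, tr]
    return K[np.ix_(rows, cols)]

# slice_kernel(K, tr, tr) matches K_tr above, slice_kernel(K, te, tr) matches K_te.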
def split_testing_data_r(y):
    # ShuffleSplit only looks at the number of samples, so passing y as X is fine
    ss = ShuffleSplit(n_splits=1, test_size=0.2)
    tri, tei = next(ss.split(y))
    return tri, tei
def get_cv_method(method, **kwargs):
if method == 'kfold':
return KFold(**kwargs)
elif method == 'skfold':
return StratifiedKFold(**kwargs)
elif method == 'loo':
return LeaveOneOut()
elif method == 'shuffle_split':
return ShuffleSplit(**kwargs)
elif method == 'split':
return TrainTestSplit(**kwargs)
elif method == 's_shuffle_split':
return StratifiedShuffleSplit(**kwargs)
elif method == 'time_series':
return TimeSeriesSplit(**kwargs)
else:
raise AttributeError('Invalid CV method - %s!' % method)
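# Example dispatch (hypothetical call; kwargs are forwarded verbatim to the
# underlying splitter). Note that TrainTestSplit is not a scikit-learn class,
# so the 'split' branch presumably refers to a project-local wrapper:
cv = get_cv_method('shuffle_split', n_splits=5, test_size=0.2, random_state=0)
for train_idx, test_idx in cv.split(list(range(100))):
    pass  # five independent 80/20 splits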
def cross_validation(self):
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=20)
    scores = cross_val_score(self.clf, self.training_data, self.training_target, cv=cv, scoring='f1_macro')
    print(scores)
    print("f1_macro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def test_learning_curve_comprehensive(self):
"""
Test learning curve with all parameters with visual unit test.
"""
try:
visualizer = LearningCurveVisualizer(LinearSVC(random_state=0), train_sizes=np.linspace(.1, 1.0, 5),
cv=ShuffleSplit(n_splits=100, test_size=0.2, random_state=0),
n_jobs=4)
visualizer.fit(X, y)
visualizer.poof()
except Exception as e:
self.fail("error during learning curve: {}".format(e))
self.assert_images_similar(visualizer)
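# The visual tests above go through yellowbrick's LearningCurveVisualizer,
# which wraps sklearn's learning_curve; the equivalent direct call with the
# same ShuffleSplit cv would look like this (sketch, reusing the module-level
# X and y from the test fixtures):
import numpy as np
from sklearn.model_selection import ShuffleSplit, learning_curve
from sklearn.svm import LinearSVC

sizes, train_scores, test_scores = learning_curve(
    LinearSVC(random_state=0), X, y,
    train_sizes=np.linspace(.1, 1.0, 5),
    cv=ShuffleSplit(n_splits=100, test_size=0.2, random_state=0),
    n_jobs=4)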
def test_learning_curve_model_cv_only(self):
"""
Test learning curve with inputting model and cv only.
"""
try:
visualizer = LearningCurveVisualizer(LinearSVC(),
cv=ShuffleSplit(n_splits=100, test_size=0.2, random_state=0))
visualizer.fit(X, y)
visualizer.poof()
except Exception as e:
self.fail("error during learning curve: {}".format(e))
def test_learning_curve_model_trainsize_cv_only(self):
"""
Test learning curve with inputting model, training size, and cv only.
"""
try:
visualizer = LearningCurveVisualizer(LinearSVC(),
train_sizes=np.linspace(.1, 1.0, 5),
cv=ShuffleSplit(n_splits=100, test_size=0.2, random_state=0))
visualizer.fit(X, y)
visualizer.poof()
except Exception as e:
self.fail("error during learning curve: {}".format(e))
def test_learning_curve_bad_trainsize(self):
"""
Test learning curve with bad input for training size.
"""
with self.assertRaises(YellowbrickError):
visualizer = LearningCurveVisualizer(LinearSVC(),
train_sizes=10000,
cv=ShuffleSplit(n_splits=100, test_size=0.2, random_state=0))
visualizer.fit(X, y)
visualizer.poof()
def get_cv(self, X, y):
unique_event_ids = np.unique(y[:, 0])
event_cv = ShuffleSplit(
n_splits=self.n_cv, test_size=self.cv_test_size,
random_state=self.random_state)
for train_event_is, test_event_is in event_cv.split(unique_event_ids):
train_is = np.where(
np.in1d(y[:, 0], unique_event_ids[train_event_is]))[0]
test_is = np.where(
np.in1d(y[:, 0], unique_event_ids[test_event_is]))[0]
yield train_is, test_is
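# get_cv above splits on unique event ids so that all rows of an event stay on
# the same side of the split; scikit-learn's GroupShuffleSplit expresses the
# same idea directly (sketch; y[:, 0] holds the event id as above, and the
# n_splits/test_size values are placeholders):
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(n_splits=8, test_size=0.2, random_state=57)
for train_is, test_is in gss.split(X, y, groups=y[:, 0]):
    ...  # no event id appears on both sides of a split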
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234, batch_size = 128):
v[cname], z[cname] = 0, 0
np.random.seed(seed)
build_model().summary(line_length=120)
model_path = '../data/working/' + cname + '_keras_model.h5'
ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11, test_size=1/num_splits)
scores = list()
for n, (itrain, ival) in enumerate(ss.split(train3, y)):
xtrain, xval = train3[itrain], train3[ival]
ytrain, yval = y[itrain], y[ival]
model = build_model()
model.fit(
xtrain, ytrain,
batch_size = batch_size,
epochs = 10000,
validation_data = (xval, yval),
verbose = 0,
callbacks = build_keras_fit_callbacks(model_path),
shuffle = True
)
model.load_weights(model_path)
p = model.predict(xval)
v.loc[ival, cname] += pconvert(p).ravel()
score = metrics.log_loss(y[ival], p)
print(cname, 'fold %d: '%(n+1), score, now())
scores.append(score)
z[cname] += pconvert(model.predict(test3)).ravel()
del model
for i in range(3): gc.collect(i)
os.remove(model_path)
cv=np.array(scores)
print(cv, cv.mean(), cv.std())
z[cname] /= num_splits
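# build_keras_fit_callbacks is a project helper that is not shown on this
# page; given the model.load_weights(model_path) call above, it presumably
# returns at least a ModelCheckpoint plus early stopping. A minimal sketch
# (an assumption, not the project's actual code):
from keras.callbacks import EarlyStopping, ModelCheckpoint

def build_keras_fit_callbacks(model_path):
    return [
        EarlyStopping(monitor='val_loss', patience=20),                       # cut the 10000-epoch budget short
        ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True)  # keep the best weights on disk
    ]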
def rf1(train2, y, test2, v, z):
cname = sys._getframe().f_code.co_name
v[cname], z[cname] = 0, 0
scores = list()
num_seeds = 7
num_splits = 7
base_seed = 13
for seed in range(base_seed, base_seed + num_seeds):
ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
for n, (itrain, ival) in enumerate(ss.split(train2, y)):
reg = ensemble.RandomForestClassifier(max_depth=9,
random_state=seed,
n_estimators=500,
n_jobs=-2)
reg.fit(train2[itrain], y[itrain])
p = reg.predict_proba(train2[ival])[:,1]
v.loc[ival, cname] += pconvert(p)
score = metrics.log_loss(y[ival], p)
print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
scores.append(score)
z[cname] += pconvert(reg.predict_proba(test2)[:,1])
cv=np.array(scores)
print(cv, cv.mean(), cv.std())
z[cname] /= num_splits * num_seeds
v[cname] /= num_seeds
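# rf1, et1 and their variants below all follow the same out-of-fold averaging
# pattern; a stripped-down version for reference (pconvert and now are project
# helpers not shown here, so raw probabilities and no timestamps are used):
from sklearn import ensemble, model_selection

def oof_average(train2, y, test2, v, z, cname, num_seeds=3, num_splits=5, base_seed=13):
    v[cname], z[cname] = 0, 0
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for itrain, ival in ss.split(train2, y):
            clf = ensemble.RandomForestClassifier(n_estimators=100, random_state=seed, n_jobs=-2)
            clf.fit(train2[itrain], y[itrain])
            v.loc[ival, cname] += clf.predict_proba(train2[ival])[:, 1]
            z[cname] += clf.predict_proba(test2)[:, 1]
    z[cname] /= num_splits * num_seeds  # every test row is predicted on every fold
    v[cname] /= num_seeds               # as above: assumes ~one validation hit per seed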
def et1(train2, y, test2, v, z):
cname = sys._getframe().f_code.co_name
v[cname], z[cname] = 0, 0
scores = list()
num_seeds = 7
num_splits = 7
base_seed = 13
for seed in range(base_seed, base_seed + num_seeds):
ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
for n, (itrain, ival) in enumerate(ss.split(train2, y)):
reg = ensemble.ExtraTreesClassifier(max_depth=6,
random_state=seed,
n_estimators=500,
n_jobs=-2)
reg.fit(train2[itrain], y[itrain])
p = reg.predict_proba(train2[ival])[:,1]
v.loc[ival, cname] += pconvert(p)
score = metrics.log_loss(y[ival], p)
print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
scores.append(score)
z[cname] += pconvert(reg.predict_proba(test2)[:,1])
cv=np.array(scores)
print(cv, cv.mean(), cv.std())
z[cname] /= num_splits * num_seeds
v[cname] /= num_seeds
def rf1(train2, y, test2, v, z):
cname = sys._getframe().f_code.co_name
v[cname], z[cname] = 0, 0
scores = list()
num_seeds = 1
num_splits = 3
base_seed = 13
for seed in range(base_seed, base_seed + num_seeds):
ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
for n, (itrain, ival) in enumerate(ss.split(train2, y)):
reg = ensemble.RandomForestClassifier(max_depth=9,
random_state=seed,
n_estimators=500,
n_jobs=-2)
reg.fit(train2[itrain], y[itrain])
p = reg.predict_proba(train2[ival])[:,1]
v.loc[ival, cname] += pconvert(p)
score = metrics.log_loss(y[ival], p)
print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
scores.append(score)
z[cname] += pconvert(reg.predict_proba(test2)[:,1])
cv=np.array(scores)
print(cv, cv.mean(), cv.std())
z[cname] /= num_splits * num_seeds
v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
cname = sys._getframe().f_code.co_name
v[cname], z[cname] = 0, 0
scores = list()
num_seeds = 1
num_splits = 3
base_seed = 13
for seed in range(base_seed, base_seed + num_seeds):
ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
for n, (itrain, ival) in enumerate(ss.split(train2, y)):
reg = ensemble.ExtraTreesClassifier(max_depth=7,
random_state=seed,
n_estimators=1500,
n_jobs=-2)
reg.fit(train2[itrain], y[itrain])
p = reg.predict_proba(train2[ival])[:,1]
v.loc[ival, cname] += pconvert(p)
score = metrics.log_loss(y[ival], p)
print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
scores.append(score)
z[cname] += pconvert(reg.predict_proba(test2)[:,1])
cv=np.array(scores)
print(cv, cv.mean(), cv.std())
z[cname] /= num_splits * num_seeds
v[cname] /= num_seeds
def rf1(train2, y, test2, v, z):
cname = sys._getframe().f_code.co_name
v[cname], z[cname] = 0, 0
scores = list()
num_seeds = 3
num_splits = 7
base_seed = 13
for seed in range(base_seed, base_seed + num_seeds):
ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
for n, (itrain, ival) in enumerate(ss.split(train2, y)):
reg = ensemble.RandomForestClassifier(max_depth=9,
random_state=seed,
n_estimators=500,
n_jobs=-2)
reg.fit(train2[itrain], y[itrain])
p = reg.predict_proba(train2[ival])[:,1]
v.loc[ival, cname] += pconvert(p)
score = metrics.log_loss(y[ival], p)
print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
scores.append(score)
z[cname] += pconvert(reg.predict_proba(test2)[:,1])
cv=np.array(scores)
print(cv, cv.mean(), cv.std())
z[cname] /= num_splits * num_seeds
v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
cname = sys._getframe().f_code.co_name
v[cname], z[cname] = 0, 0
scores = list()
num_seeds = 3
num_splits = 7
base_seed = 13
for seed in range(base_seed, base_seed + num_seeds):
ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
for n, (itrain, ival) in enumerate(ss.split(train2, y)):
reg = ensemble.ExtraTreesClassifier(max_depth=11,
random_state=seed,
n_estimators=1500,
n_jobs=-2)
reg.fit(train2[itrain], y[itrain])
p = reg.predict_proba(train2[ival])[:,1]
v.loc[ival, cname] += pconvert(p)
score = metrics.log_loss(y[ival], p)
print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
scores.append(score)
z[cname] += pconvert(reg.predict_proba(test2)[:,1])
cv=np.array(scores)
print(cv, cv.mean(), cv.std())
z[cname] /= num_splits * num_seeds
v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
cname = sys._getframe().f_code.co_name
v[cname], z[cname] = 0, 0
scores = list()
num_seeds = 2
num_splits = 7
base_seed = 13
for seed in range(base_seed, base_seed + num_seeds):
ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
for n, (itrain, ival) in enumerate(ss.split(train2, y)):
reg = ensemble.ExtraTreesClassifier(max_depth=11,
random_state=seed,
n_estimators=2000,
n_jobs=-2)
reg.fit(train2[itrain], y[itrain])
p = reg.predict_proba(train2[ival])[:,1]
v.loc[ival, cname] += pconvert(p)
score = metrics.log_loss(y[ival], p)
print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
scores.append(score)
z[cname] += pconvert(reg.predict_proba(test2)[:,1])
cv=np.array(scores)
print(cv, cv.mean(), cv.std())
z[cname] /= num_splits * num_seeds
v[cname] /= num_seeds
def rf1(train2, y, test2, v, z):
cname = sys._getframe().f_code.co_name
v[cname], z[cname] = 0, 0
scores = list()
num_seeds = 3
num_splits = 5
base_seed = 13
for seed in range(base_seed, base_seed + num_seeds):
ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
for n, (itrain, ival) in enumerate(ss.split(train2, y)):
reg = ensemble.RandomForestClassifier(max_depth=9,
random_state=seed,
n_estimators=500,
n_jobs=-2)
reg.fit(train2[itrain], y[itrain])
p = reg.predict_proba(train2[ival])[:,1]
v.loc[ival, cname] += pconvert(p)
score = metrics.log_loss(y[ival], p)
print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
scores.append(score)
z[cname] += pconvert(reg.predict_proba(test2)[:,1])
cv=np.array(scores)
print(cv, cv.mean(), cv.std())
z[cname] /= num_splits * num_seeds
v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
cname = sys._getframe().f_code.co_name
v[cname], z[cname] = 0, 0
scores = list()
num_seeds = 3
num_splits = 5
base_seed = 13
for seed in range(base_seed, base_seed + num_seeds):
ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
for n, (itrain, ival) in enumerate(ss.split(train2, y)):
reg = ensemble.ExtraTreesClassifier(max_depth=15,
random_state=seed,
n_estimators=2500,
n_jobs=-2)
reg.fit(train2[itrain], y[itrain])
p = reg.predict_proba(train2[ival])[:,1]
v.loc[ival, cname] += p
score = metrics.log_loss(y[ival], p)
print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
scores.append(score)
z[cname] += reg.predict_proba(test2)[:,1]
cv=np.array(scores)
print(cv, cv.mean(), cv.std())
z[cname] /= num_splits * num_seeds
v[cname] /= num_seeds