def cross_validation():
    M = read_dataset()
    n_fold = 10
    rating_idx = np.array(M.nonzero()).T
    # shuffle=True is required for random_state to take effect
    # (scikit-learn >= 0.24 raises a ValueError otherwise).
    kf = KFold(n_splits=n_fold, shuffle=True, random_state=0)

    with tf.Session() as sess:
        model = VAEMF(sess, num_user, num_item,
                      hidden_encoder_dim=hidden_encoder_dim,
                      hidden_decoder_dim=hidden_decoder_dim,
                      latent_dim=latent_dim, output_dim=output_dim,
                      learning_rate=learning_rate, batch_size=batch_size,
                      reg_param=reg_param)

        for i, (train_idx, test_idx) in enumerate(kf.split(rating_idx)):
            print("{0}/{1} Fold start| Train size={2}, Test size={3}".format(
                i + 1, n_fold, train_idx.size, test_idx.size))
            model.train(M, train_idx=train_idx, test_idx=test_idx, n_steps=n_steps)
Example source code for the Python KFold() class
# Same as above, but the model is built with one_hot enabled.
def cross_validation():
    M = read_dataset()
    n_fold = 10
    rating_idx = np.array(M.nonzero()).T
    # shuffle=True is required for random_state to take effect.
    kf = KFold(n_splits=n_fold, shuffle=True, random_state=0)

    with tf.Session() as sess:
        model = VAEMF(sess, num_user, num_item,
                      hidden_encoder_dim=hidden_encoder_dim,
                      hidden_decoder_dim=hidden_decoder_dim,
                      latent_dim=latent_dim, output_dim=output_dim,
                      learning_rate=learning_rate, batch_size=batch_size,
                      reg_param=reg_param, one_hot=one_hot)

        for i, (train_idx, test_idx) in enumerate(kf.split(rating_idx)):
            print("{0}/{1} Fold start| Train size={2}, Test size={3}".format(
                i + 1, n_fold, train_idx.size, test_idx.size))
            model.train(M, train_idx=train_idx, test_idx=test_idx, n_steps=n_steps)
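Both variants above split the indices of the observed ratings rather than users or items, so each fold holds out a random subset of known matrix entries. A minimal self-contained sketch of that pattern, assuming only numpy and scikit-learn (the toy matrix and fold count are illustrative, not from the original project):

import numpy as np
from sklearn.model_selection import KFold

# Toy 4x3 rating matrix; zeros are unobserved entries.
M = np.array([[5, 0, 3],
              [4, 0, 0],
              [0, 2, 1],
              [1, 0, 4]])

# One (row, col) pair per observed rating, shape (n_ratings, 2).
rating_idx = np.array(M.nonzero()).T

kf = KFold(n_splits=3, shuffle=True, random_state=0)
for fold, (train_idx, test_idx) in enumerate(kf.split(rating_idx), start=1):
    held_out = rating_idx[test_idx]  # entries hidden from training this fold
    print("Fold %d holds out entries: %s" % (fold, held_out.tolist()))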
def kfold_train(self, n_splits=3):
    logger.info('train classifier using kFold')
    kf = KFold(n_splits=n_splits, shuffle=True)
    scores = []
    precisions = []
    recalls = []
    for train_index, test_index in kf.split(self.data):
        train_text = self.data.iloc[train_index]['text'].values
        train_y = self.data.iloc[train_index]['class'].values
        test_text = self.data.iloc[test_index]['text'].values
        test_y = self.data.iloc[test_index]['class'].values
        self.cls.train(train_text, train_y)
        predictions = self.cls.predict(test_text)
        self.confusion += confusion_matrix(test_y, predictions)
        scores.append(f1_score(test_y, predictions, pos_label='geography'))
        recalls.append(recall_score(test_y, predictions, pos_label='geography'))
        precisions.append(precision_score(test_y, predictions, pos_label='geography'))
    self.score = sum(scores) / len(scores)
    self.precision = sum(precisions) / len(precisions)
    self.recall = sum(recalls) / len(recalls)
    return self.cls
def __init__(self, name, X, y, task, test_size=None, cv=None, random_state=42):
    self.name = name
    self.X = X
    self.y = y
    self.task = task
    self.random_state = random_state
    if test_size is not None:
        self.test_size = test_size
        self.validation_method = "train_test_split"
        self.X_train, self.X_test, self.y_train, self.y_test = \
            model_selection.train_test_split(self.X, self.y, test_size=test_size,
                                             random_state=random_state)
    elif cv is not None:
        self.validation_method = "cv"
        if task == "regression":
            # shuffle=True so that random_state actually takes effect
            # (scikit-learn raises otherwise).
            self.kfold = model_selection.KFold(n_splits=cv, shuffle=True,
                                               random_state=random_state)
        elif task == "classification":
            self.kfold = model_selection.StratifiedKFold(n_splits=cv, shuffle=True,
                                                         random_state=random_state)
def set_kfold(self, no_folds=10, fold_id=0):
    inst = KFold(n_splits=no_folds, shuffle=True, random_state=125)
    self.fold_id = fold_id
    self.KFolds = list(inst.split(np.arange(self.no_samples)))
    self.train_idx, self.test_idx = self.KFolds[fold_id]
    self.no_samples_train = self.train_idx.shape[0]
    self.no_samples_test = self.test_idx.shape[0]
    self.print_ext('Data ready. no_samples_train:', self.no_samples_train,
                   'no_samples_test:', self.no_samples_test)
    if self.train_batch_size == 0:
        self.train_batch_size = self.no_samples_train
    if self.test_batch_size == 0:
        self.test_batch_size = self.no_samples_test
    self.train_batch_size = min(self.train_batch_size, self.no_samples_train)
    self.test_batch_size = min(self.test_batch_size, self.no_samples_test)
# In the following function, each sample is cropped/sliced before batching
# to improve performance.
# Source: a25_unet_training_v2_on_boxes.py (project: KAGGLE_CERVICAL_CANCER_2017, author: ZFTurbo)
def run_cross_validation_create_models_unet2(nfolds=5):
    from sklearn.model_selection import KFold
    files_full = glob.glob(INPUT_PATH + "*/*.png")
    files = []
    for f in files_full:
        # Keep only the mask images.
        if '_mask' not in f:
            continue
        files.append(f)
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=66)
    num_fold = 0
    sum_score = 0
    for train_index, test_index in kf.split(range(len(files))):
        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(train_index))
        print('Split valid: ', len(test_index))
        # Only fold 2 is actually trained; all other folds are skipped.
        if num_fold != 2:
            continue
        score = train_single_model(num_fold, train_index, test_index, files)
        sum_score += score
    # Note: only one fold contributes to sum_score, so dividing by nfolds
    # underestimates the average.
    print('Avg loss: {}'.format(sum_score / nfolds))
# Source: a25_unet_training_v1_on_my_segmentation.py (project: KAGGLE_CERVICAL_CANCER_2017, author: ZFTurbo)
def run_cross_validation_create_models_unet1(nfolds=5):
    from sklearn.model_selection import KFold
    files_full = glob.glob(INPUT_PATH + "*/*.png")
    files = []
    for f in files_full:
        # Keep only the original images (masks are filtered out).
        if '_mask' in f:
            continue
        files.append(f)
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=66)
    num_fold = 0
    sum_score = 0
    for train_index, test_index in kf.split(range(len(files))):
        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(train_index))
        print('Split valid: ', len(test_index))
        score = train_single_model(num_fold, train_index, test_index, files)
        sum_score += score
    print('Avg loss: {}'.format(sum_score / nfolds))
def model_cross_valid(X, Y):
    seed = 7
    # shuffle=True so that random_state takes effect.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

    def build_model(model_name):
        model = model_name()
        return model

    scoring = 'neg_mean_squared_error'
    # TODO: add random forest, boosting, LSTM, and GBDT models.
    for model_name in [LinearRegression, ElasticNet]:
        # for model_name in [LinearRegression, Ridge, Lasso, ElasticNet,
        #                    KNeighborsRegressor, DecisionTreeRegressor, SVR,
        #                    RandomForestRegressor, AdaBoostRegressor,
        #                    GradientBoostingRegressor]:
        model = build_model(model_name)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        print(model_name, results.mean())
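A hypothetical invocation of model_cross_valid on a small built-in regression dataset; load_diabetes is only an illustrative stand-in for whatever data the original project used:

from sklearn.datasets import load_diabetes

X, Y = load_diabetes(return_X_y=True)
# Prints each model class and its mean negative MSE across the 10 folds.
model_cross_valid(X, Y)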
def cross_validate(classifier, n_folds=5):
    '''Custom cross-validation module I always use.'''
    train_X = classifier['train_X']
    train_y = classifier['train_y']
    model = classifier['model']
    score = 0.0
    kf = KFold(n_splits=n_folds)
    for train_index, test_index in kf.split(train_X):
        X_train, X_test = train_X[train_index], train_X[test_index]
        y_train, y_test = train_y[train_index], train_y[test_index]
        clf = model.fit(X_train, y_train)
        pred = clf.predict_proba(X_test)[:, 1]
        # print('cross', roc_auc_score(y_test, pred))
        score = score + roc_auc_score(y_test, pred)
    return score / n_folds
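The function expects a plain dict bundling the training data and an estimator that exposes predict_proba. A hypothetical usage sketch (the random data and the LogisticRegression choice are illustrative assumptions):

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
classifier = {
    'train_X': rng.randn(100, 5),            # features
    'train_y': rng.randint(0, 2, size=100),  # binary labels
    'model': LogisticRegression(),           # any estimator with predict_proba
}
mean_auc = cross_validate(classifier, n_folds=5)
print('Mean ROC AUC across folds: %.3f' % mean_auc)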
def fit(self, X, y):
    self.base_models_ = [list() for x in self.base_models]
    self.meta_model_ = clone(self.meta_model)
    kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=15)

    # Train cloned base models, then create the out-of-fold predictions
    # that are needed to train the cloned meta-model.
    out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
    for i, model in enumerate(self.base_models):
        for train_index, holdout_index in kfold.split(X, y):
            instance = clone(model)
            self.base_models_[i].append(instance)
            instance.fit(X[train_index], y[train_index])
            y_pred = instance.predict(X[holdout_index])
            out_of_fold_predictions[holdout_index, i] = y_pred

    # Now train the cloned meta-model using the out-of-fold predictions as new features.
    self.meta_model_.fit(out_of_fold_predictions, y)
    return self
# Do the predictions of all base models on the test data and use the averaged
# predictions as meta-features for the final prediction, which is done by the meta-model.
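The comment above describes the companion predict step of this stacking pattern. A minimal sketch of what such a method could look like, assuming self.base_models_ holds the per-fold fitted clones created in fit (this is an illustrative reconstruction, not code from the original source):

def predict(self, X):
    # For each base model, average the predictions of its per-fold clones,
    # yielding one meta-feature column per base model.
    meta_features = np.column_stack([
        np.column_stack([instance.predict(X) for instance in fold_instances]).mean(axis=1)
        for fold_instances in self.base_models_
    ])
    # The meta-model maps the averaged base predictions to the final output.
    return self.meta_model_.predict(meta_features)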
def train_cross_validation(args, sess, model, phi_xs_train, ys_train):
    kf = KFold(n_splits=args.K)
    w_best = None
    validation_loss = 0
    for train_index, validation_index in kf.split(phi_xs_train):
        sess.run(tf.global_variables_initializer())
        model.fit(sess, phi_xs_train[train_index], ys_train[train_index],
                  epoch=args.epoch, batch_size=args.batch_size)
        loss = model.eval(sess, phi_xs_train[validation_index], ys_train[validation_index])
        logging.info('Validation loss = %f' % loss)
        validation_loss += loss
        model.reset(sess)
    return validation_loss / float(args.K)
def evaluate(self, individual):
    # print(" *** evaluate *** ")
    # model = individual.createNetwork()
    # return random.random(),
    random.seed(42)
    # Perform KFold cross-validation.
    kf = KFold(n_splits=3)
    scores = []
    for train, test in kf.split(self.X):  # train, test are indices
        X_train, X_test = self.X[train], self.X[test]
        y_train, y_test = self.y[train], self.y[test]
        model = individual.createNetwork()
        # nb_epoch is the Keras 1 argument name; Keras 2 renamed it to epochs.
        model.fit(X_train, y_train,
                  batch_size=Config.batch_size, nb_epoch=Config.epochs, verbose=0)
        yy_test = model.predict(X_test)
        scores.append(error(y_test, yy_test))
    fitness = np.mean(scores)
    return fitness,
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
    """K-Folds cross-validation iterator.

    Parameters
    ----------
    k : int, default 5
    stratify : bool, default False
    shuffle : bool, default True
    seed : int, default 33

    Yields
    ------
    X_train, y_train, X_test, y_test, train_index, test_index
    """
    # random_state is only valid when shuffle=True (scikit-learn raises otherwise).
    random_state = seed if shuffle else None
    if stratify:
        kf = StratifiedKFold(n_splits=k, random_state=random_state, shuffle=shuffle)
    else:
        kf = KFold(n_splits=k, random_state=random_state, shuffle=shuffle)
    for train_index, test_index in kf.split(self.X_train, self.y_train):
        X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
        X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
        yield X_train, y_train, X_test, y_test, train_index, test_index
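Because kfold is a generator, it is consumed fold by fold. A hypothetical usage sketch, where the dataset object ds and some_estimator are placeholder assumptions, not names from the original source:

fold_scores = []
for X_tr, y_tr, X_te, y_te, tr_idx, te_idx in ds.kfold(k=5, stratify=True):
    est = some_estimator()  # hypothetical: any sklearn-style estimator
    est.fit(X_tr, y_tr)
    fold_scores.append(est.score(X_te, y_te))
print('Mean CV accuracy:', sum(fold_scores) / len(fold_scores))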
def do_kfold(proc_images, proc_labels, split=10):
    trainimages = []
    trainlabels = []
    testimages = []
    testlabels = []
    # Shuffle images and labels with the same random permutation.
    rand_idx = random.sample(range(0, len(proc_images)), len(proc_images))
    proc_images = proc_images[rand_idx]
    proc_labels = proc_labels[rand_idx]
    kf = KFold(n_splits=split)
    for train_index, test_index in kf.split(proc_images):
        x_train, x_test = proc_images[train_index], proc_images[test_index]
        y_train, y_test = proc_labels[train_index], proc_labels[test_index]
        trainimages.append(x_train)
        testimages.append(x_test)
        trainlabels.append(y_train)
        testlabels.append(y_test)
    np.save("trainimages.npy", trainimages)
    np.save("testimages.npy", testimages)
    np.save("trainlabels.npy", trainlabels)
    np.save("testlabels.npy", testlabels)
    return (trainimages, testimages, trainlabels, testlabels)
# Source: cnn_expression_bn_dropout.py (project: convneuralnetwork, author: clutariomark)
# do_kfold is identical to the version shown above.
# Source: cnn_expression_batchnorm.py (project: convneuralnetwork, author: clutariomark)
# do_kfold is identical to the version shown above.
# Source: TwitterClassificationProject.py (project: twitter-text-classification, author: FurkanArslan)
def testClassificationQuality(self):
    score = 0
    n_splits = 10
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    tweetClassification = TweetClassification()
    for ind_train, ind_test in kfold.split(self.tweets):
        dataTest = self.tweets[ind_test]
        dataTrain = self.tweets[ind_train]
        targetTest = self.target[ind_test]
        targetTrain = self.target[ind_train]
        tweetClassification.fit(dataTrain, targetTrain)
        score += tweetClassification.score(dataTest, targetTest)
    return score / n_splits
# Source: BenchmarkTests.py (project: twitter-text-classification, author: FurkanArslan)
def evaluate_cross_validation(self, clf, data, target, cluster):
    score = 0
    kfold = KFold(n_splits=cluster, shuffle=True, random_state=0)
    for ind_train, ind_test in kfold.split(data):
        dataTest = data[ind_test]
        dataTrain = data[ind_train]
        targetTest = target[ind_test]
        targetTrain = target[ind_train]
        clf.fit(dataTrain, targetTrain)
        score += clf.score(dataTest, targetTest)
    print('-' * 30)
    # Divide by the actual number of folds rather than a hardcoded 10.
    print("Mean score: %0.3f" % (score / cluster))
    print('-' * 30)
    return score / cluster
def test_cross_val_predict():
    # Make sure it works in cross_val_predict for multiclass.
    X, y = load_iris(return_X_y=True)
    y = LabelBinarizer().fit_transform(y)
    X = StandardScaler().fit_transform(X)
    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567).fit(X, y)
    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(mlp, X, y, cv=cv, method='predict_proba')
    auc = roc_auc_score(y, y_oos, average=None)
    assert np.all(auc >= 0.96)
def gs_numpy(method, X, Y, alphas_log=(-1, 1, 9), n_splits=5, n_jobs=-1, disp=True):
    """
    Grid-search method with numpy arrays X and Y.
    Previously, np.mat was used for compatibility with Matlab notation.
    """
    if disp:
        print(X.shape, Y.shape)
    clf = getattr(linear_model, method)()
    params = {'alpha': np.logspace(*alphas_log)}
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(clf, params, scoring='r2', cv=kf5_c, n_jobs=n_jobs)
    gs.fit(X, Y)
    return gs
def cv_SVR(xM, yV, svr_params, n_splits=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    Cross-validation with SVR: prediction output is generated for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = svm.SVR(**svr_params)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    # kf_n_c (not the undefined kf5_ext_c) provides the fold iterator.
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def gs_classfier(classifier, xM, yVc, params, n_splits=5, n_jobs=-1):
    """
    gs = gs_classfier(classifier, xM, yVc, params, n_splits=5, n_jobs=-1)

    Inputs
    ======
    classifier : e.g. svm.SVC()
    params : e.g. {"C": np.logspace(-2, 2, 5)}
    """
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(classifier, params, cv=kf5_c, n_jobs=n_jobs)
    gs.fit(xM, yVc)
    return gs
def gs_Ridge_BIKE(A_list, yV, XX=None, alphas_log=(1, -1, 9), n_splits=5, n_jobs=-1):
    """
    A_list is a list of similarity matrices A.
    XX is the concatenated linear descriptors; if no linear part is used, XX can be None.
    """
    clf = binary_model.BIKE_Ridge(A_list, XX)
    params = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ln is the number of molecules
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(clf, params, scoring='r2', cv=kf_n_c, n_jobs=n_jobs)
    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def gs_BIKE_Ridge(A_list, yV, alphas_log=(1, -1, 9), X_concat=None, n_splits=5, n_jobs=-1):
    """
    A_list is a list of similarity matrices A.
    X_concat is the concatenated linear descriptors; if unused, it can be None.
    """
    clf = binary_model.BIKE_Ridge(A_list, X_concat)
    params = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ln is the number of molecules
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(clf, params, scoring='r2', cv=kf_n_c, n_jobs=n_jobs)
    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def _cv_r0(method, xM, yV, alpha, n_splits=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    # kf_n_c (not the undefined kf5_ext_c) provides the fold iterator.
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def cvLOO(method, xM, yV, alpha, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Leave-one-out cross-validation: prediction output is generated for all input molecules.
    """
    n_splits = xM.shape[0]  # one fold per sample gives leave-one-out
    # print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    # The old cross_validation.KFold(n, n_folds) API took the sample count
    # positionally; model_selection.KFold takes only n_splits.
    kf_n = model_selection.KFold(n_splits=n_splits)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
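Setting n_splits to the number of samples makes KFold behave as leave-one-out; scikit-learn also provides a dedicated splitter for this. A minimal sketch of the same idea with LeaveOneOut on toy data (the data and the Ridge choice are illustrative):

import numpy as np
from sklearn import linear_model, model_selection

X = np.arange(20, dtype=float).reshape(10, 2)
y = X.sum(axis=1)

clf = linear_model.Ridge(alpha=1.0)
loo = model_selection.LeaveOneOut()  # same folds as KFold(n_splits=len(X))
y_pred = model_selection.cross_val_predict(clf, X, y, cv=loo)
print(np.round(y_pred, 2))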
# gs_numpy is identical to the version shown above.
def cv_SVR(xM, yV, svr_params, n_splits=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    Cross-validation with SVR: prediction output is generated for all input molecules.
    (Same as the version above, but plotting goes through kutil instead of jutil.)
    """
    print(xM.shape, yV.shape)
    clf = svm.SVR(**svr_params)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    # kf_n_c (not the undefined kf5_ext_c) provides the fold iterator.
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def gs_param(model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1, graph=False):
    """
    gs = gs_param(model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1)

    Inputs
    ======
    model : e.g. svm.SVC() or linear_model.LinearRegression()
    param_grid : e.g. {"C": np.logspace(-2, 2, 5)}
    """
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    # return_train_score=True is needed for mean_train_score to appear in
    # cv_results_ (it defaults to False in recent scikit-learn).
    gs = model_selection.GridSearchCV(model, param_grid, cv=kf5_c, n_jobs=n_jobs,
                                      return_train_score=True)
    gs.fit(X, y)
    if graph:
        plt.plot(gs.cv_results_["mean_train_score"], label='E[Train]')
        plt.plot(gs.cv_results_["mean_test_score"], label='E[Test]')
        plt.legend(loc=0)
        plt.grid()
    return gs
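A hypothetical end-to-end call of gs_param, sweeping the SVC regularization strength on the iris data (the dataset and grid are illustrative, not from the original project):

import numpy as np
from sklearn import svm, datasets

X, y = datasets.load_iris(return_X_y=True)
param_grid = {"C": np.logspace(-2, 2, 5)}
gs = gs_param(svm.SVC(), X, y, param_grid, n_splits=5, graph=False)
print('Best C:', gs.best_params_["C"], 'CV score:', round(gs.best_score_, 3))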