def _cv_r0(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=True)
    yV_pred = cross_validation.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
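Note that `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20; most snippets on this page use the old API. A minimal sketch of the equivalent call with the `model_selection` API (the helper name is illustrative):

# Hedged sketch: the same cross-validated prediction with the modern API.
# Assumes scikit-learn >= 0.18; xM, yV, alpha mirror the snippet above.
from sklearn import linear_model
from sklearn.model_selection import KFold, cross_val_predict

def cv_predict_modern(method, xM, yV, alpha, n_splits=5, n_jobs=-1):
    clf = getattr(linear_model, method)(alpha=alpha)
    # The modern KFold takes the number of splits, not the number of samples.
    kf = KFold(n_splits=n_splits, shuffle=True)
    return cross_val_predict(clf, xM, yV, cv=kf, n_jobs=n_jobs)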
def getConfidenceScores(features_train, labels_train, C):
    train_confidence = []
    # Confidence scores for the training data are computed using K-fold cross-validation.
    kfold = KFold(features_train.shape[0], n_folds=10)
    for train_index, test_index in kfold:
        X_train, X_test = features_train[train_index], features_train[test_index]
        y_train, y_test = labels_train[train_index], labels_train[test_index]
        # Train a classifier on the training subset of this fold.
        m = SVM.train(X_train, y_train, c=C, k="linear")
        # Predict confidence scores for the held-out fold and append them to the list.
        conf = m.decision_function(X_test)
        for x in conf:
            train_confidence.append(x)
    return np.array(train_confidence)
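`SVM.train` above is a project-specific wrapper, so the snippet is not self-contained. A hedged sketch of the same K-fold confidence-score pattern with scikit-learn's `SVC` swapped in for the wrapper (an assumption, not the original API):

# Sketch only: sklearn.svm.SVC stands in for the project's SVM wrapper.
import numpy as np
from sklearn.svm import SVC
from sklearn.cross_validation import KFold  # old-style API, as used on this page

def get_confidence_scores(features_train, labels_train, C):
    train_confidence = []
    kfold = KFold(features_train.shape[0], n_folds=10)
    for train_index, test_index in kfold:
        clf = SVC(C=C, kernel="linear")
        clf.fit(features_train[train_index], labels_train[train_index])
        # Signed distances to the separating hyperplane serve as confidences.
        train_confidence.extend(clf.decision_function(features_train[test_index]))
    return np.array(train_confidence)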
def test_cv():
    """Simple CV check."""
    # XXX: don't use scikit-learn for tests.
    X, y = make_regression()
    cv = KFold(X.shape[0], 5)
    glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)
    # Check that cross-validation returns 5 scores.
    scores = cross_val_score(glm_normal, X, y, cv=cv)
    assert_equal(len(scores), 5)

    param_grid = [{'alpha': np.linspace(0.01, 0.99, 2)},
                  {'reg_lambda': np.logspace(np.log(0.5), np.log(0.01),
                                             10, base=np.exp(1))}]
    glmcv = GridSearchCV(glm_normal, param_grid, cv=cv)
    glmcv.fit(X, y)
def _calculate(self, X, y, categorical, metafeatures, helpers):
    import sklearn.lda
    if len(y.shape) == 1 or y.shape[1] == 1:
        kf = cross_validation.StratifiedKFold(y, n_folds=10)
    else:
        kf = cross_validation.KFold(y.shape[0], n_folds=10)
    accuracy = 0.
    try:
        for train, test in kf:
            lda = sklearn.lda.LDA()
            if len(y.shape) == 1 or y.shape[1] == 1:
                lda.fit(X[train], y[train])
            else:
                lda = OneVsRestClassifier(lda)
                lda.fit(X[train], y[train])
            predictions = lda.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10
    except LinAlgError as e:
        self.logger.warning("LDA failed: %s Returned NaN instead!" % e)
        return np.NaN
    except ValueError as e:
        self.logger.warning("LDA failed: %s Returned NaN instead!" % e)
        return np.NaN
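The `_calculate` methods in this block all follow one landmarking pattern: 10-fold CV with `StratifiedKFold` for a single-label target, or plain `KFold` plus `OneVsRestClassifier` for a multilabel one, with fold accuracies summed and divided by the fold count. A hedged generic skeleton of that shared pattern (names are illustrative, not from the original project):

# Illustrative sketch of the shared landmarking skeleton (old sklearn API).
import numpy as np
import sklearn.metrics
from sklearn import cross_validation
from sklearn.multiclass import OneVsRestClassifier

def landmark_accuracy(make_estimator, X, y, n_folds=10):
    single_label = len(y.shape) == 1 or y.shape[1] == 1
    if single_label:
        kf = cross_validation.StratifiedKFold(y, n_folds=n_folds)
    else:
        kf = cross_validation.KFold(y.shape[0], n_folds=n_folds)
    accuracy = 0.
    for train, test in kf:
        estimator = make_estimator()
        if not single_label:
            estimator = OneVsRestClassifier(estimator)
        estimator.fit(X[train], y[train])
        accuracy += sklearn.metrics.accuracy_score(y[test], estimator.predict(X[test]))
    return accuracy / n_folds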
def _calculate(self, X, y, categorical, metafeatures, helpers):
    import sklearn.naive_bayes
    if len(y.shape) == 1 or y.shape[1] == 1:
        kf = cross_validation.StratifiedKFold(y, n_folds=10)
    else:
        kf = cross_validation.KFold(y.shape[0], n_folds=10)
    accuracy = 0.
    for train, test in kf:
        nb = sklearn.naive_bayes.GaussianNB()
        if len(y.shape) == 1 or y.shape[1] == 1:
            nb.fit(X[train], y[train])
        else:
            nb = OneVsRestClassifier(nb)
            nb.fit(X[train], y[train])
        predictions = nb.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / 10
def _calculate(self, X, y, categorical, metafeatures, helpers):
    import sklearn.tree
    if len(y.shape) == 1 or y.shape[1] == 1:
        kf = cross_validation.StratifiedKFold(y, n_folds=10)
    else:
        kf = cross_validation.KFold(y.shape[0], n_folds=10)
    accuracy = 0.
    for train, test in kf:
        random_state = check_random_state(42)
        tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)
        if len(y.shape) == 1 or y.shape[1] == 1:
            tree.fit(X[train], y[train])
        else:
            tree = OneVsRestClassifier(tree)
            tree.fit(X[train], y[train])
        predictions = tree.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / 10
def _calculate(self, X, y, categorical, metafeatures, helpers):
    import sklearn.tree
    if len(y.shape) == 1 or y.shape[1] == 1:
        kf = cross_validation.StratifiedKFold(y, n_folds=10)
    else:
        kf = cross_validation.KFold(y.shape[0], n_folds=10)
    accuracy = 0.
    for train, test in kf:
        random_state = check_random_state(42)
        # A decision stump: a depth-1 tree grown over all features.
        node = sklearn.tree.DecisionTreeClassifier(
            criterion="entropy", max_depth=1, random_state=random_state,
            min_samples_split=1, min_samples_leaf=1, max_features=None)
        if len(y.shape) == 1 or y.shape[1] == 1:
            node.fit(X[train], y[train])
        else:
            node = OneVsRestClassifier(node)
            node.fit(X[train], y[train])
        predictions = node.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / 10
def _calculate(self, X, y, categorical, metafeatures, helpers):
    import sklearn.tree
    if len(y.shape) == 1 or y.shape[1] == 1:
        kf = cross_validation.StratifiedKFold(y, n_folds=10)
    else:
        kf = cross_validation.KFold(y.shape[0], n_folds=10)
    accuracy = 0.
    for train, test in kf:
        random_state = check_random_state(42)
        # A "random node": a depth-1 tree restricted to a single feature (max_features=1).
        node = sklearn.tree.DecisionTreeClassifier(
            criterion="entropy", max_depth=1, random_state=random_state,
            min_samples_split=1, min_samples_leaf=1, max_features=1)
        if len(y.shape) == 1 or y.shape[1] == 1:
            node.fit(X[train], y[train])
        else:
            node = OneVsRestClassifier(node)
            node.fit(X[train], y[train])
        predictions = node.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / 10
def rede_neural(X, y):
    print("Starting neural network training")
    X2 = normalize(X)
    clf = MLPClassifier(hidden_layer_sizes=(100, 50), activation='tanh', algorithm='adam', alpha=1e-5,
                        learning_rate='constant', tol=1e-8, learning_rate_init=0.0002,
                        early_stopping=True, validation_fraction=0.2)
    kf = KFold(len(y), n_folds=3)
    i = 0
    for train, test in kf:
        start = time.time()
        i = i + 1
        print("Training run", i)
        # Split the dataset into train and test folds.
        #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=1)
        X_train, X_test, y_train, y_test = X2[train], X2[test], y[train], y[test]
        # Fit the classifier on this fold.
        clf.fit(X_train, y_train)
        print("score:", clf.score(X_test, y_test), "(", (time.time() - start) / 60.0, "minutes )")
    return clf
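Note that the `algorithm` keyword above only existed in pre-release 0.18 builds of `MLPClassifier`; released versions of scikit-learn name it `solver`. A hedged equivalent of the constructor for current scikit-learn:

# Sketch assuming scikit-learn >= 0.18, where the keyword is `solver`.
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(hidden_layer_sizes=(100, 50), activation='tanh', solver='adam',
                    alpha=1e-5, learning_rate='constant', tol=1e-8,
                    learning_rate_init=0.0002, early_stopping=True,
                    validation_fraction=0.2)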
Source: a30_pretrained_nets_pipeline_with_additional_data.py (project: KAGGLE_CERVICAL_CANCER_2017, author: ZFTurbo)
def run_cross_validation_create_models(cnn, nfolds, submission_version):
    from sklearn.cross_validation import KFold
    files = glob.glob(INPUT_PATH + "*/*.jpg")
    additional_files = glob.glob(INPUT_PATH_ADD + "*/*.jpg")
    kf = KFold(len(files), n_folds=nfolds, shuffle=True, random_state=get_random_state(cnn))
    num_fold = 0
    sum_score = 0
    print('Len of additional files: {}'.format(len(additional_files)))
    for train_index, test_index in kf:
        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(train_index))
        print('Split valid: ', len(test_index))
        score = train_single_model(cnn, num_fold, train_index, test_index, files, additional_files, submission_version)
        sum_score += score
    print('Avg loss: {}'.format(sum_score / nfolds))
def cross_validation_accuracy(clf, X, labels, k):
    """
    Compute the average testing accuracy over k folds of cross-validation. You
    can use sklearn's KFold class here (no random seed, and no shuffling
    needed).
    Params:
      clf......A LogisticRegression classifier.
      X........A csr_matrix of features.
      labels...The true labels for each instance in X.
      k........The number of cross-validation folds.
    Returns:
      The average testing accuracy of the classifier
      over each fold of cross-validation.
    """
    # Reference implementation following the docstring (no shuffle, no seed).
    # Assumes sklearn's KFold and accuracy_score plus numpy as np are in scope.
    accuracies = []
    for train_index, test_index in KFold(X.shape[0], n_folds=k):
        clf.fit(X[train_index], labels[train_index])
        accuracies.append(accuracy_score(labels[test_index],
                                         clf.predict(X[test_index])))
    return np.mean(accuracies)
def make_kfold(target, feature):
    preds = []
    kf = KFold(len(target), n_folds=folds, shuffle=True)
    test_numbers = []
    for trains, tests in kf:
        test_numbers.append(tests)
        pred_list = []
        feature_list = word_vec.fit_transform([dict(Counter(feature[train])) for train in trains])
        target_list = [target[train] for train in trains]
        logreg.fit(feature_list, target_list)
        for test in tests:
            feature_dict = defaultdict(int)
            for f in word_vec.get_feature_names():
                feature_dict[f] = 0
            for key, value in dict(Counter(feature[test])).items():
                if key in feature_dict:
                    feature_dict[key] = value
            pred_list.append(feature_dict)
        preds.append(logreg.predict(word_vec.fit_transform(pred_list)))
    return preds, test_numbers
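Calling `word_vec.fit_transform` again on the held-out fold refits the vectorizer; it only reproduces the training columns here because every test dictionary is pre-seeded with the full training vocabulary. A more conventional variant (a suggestion, not the original code) fits on the training fold only and transforms the test fold:

# Sketch assuming word_vec is a DictVectorizer, as its dict inputs suggest.
# transform() drops keys unseen during fit, so the pre-seeding loop above
# becomes unnecessary.
from collections import Counter
from sklearn.feature_extraction import DictVectorizer

word_vec = DictVectorizer()
X_train = word_vec.fit_transform([dict(Counter(feature[i])) for i in trains])
X_test = word_vec.transform([dict(Counter(feature[j])) for j in tests])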
def eval_cv5(model, x, y):
    kf = KFold(len(y), n_folds=5)
    acc = np.array([])
    pre = np.array([])
    rec = np.array([])
    f1 = np.array([])
    for train_index, test_index in kf:
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(x_train, y_train)
        prediction = model.predict(x_test)
        evaluation = get_eval(prediction, y_test)
        acc = np.append(acc, np.array(evaluation[0]))
        pre = np.append(pre, np.array(evaluation[1]))
        rec = np.append(rec, np.array(evaluation[2]))
        f1 = np.append(f1, np.array(evaluation[3]))
    return acc.mean(), pre.mean(), rec.mean(), f1.mean()
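With a modern scikit-learn, the same four averaged metrics can be collected in one call; a hedged sketch assuming a classifier with binary labels:

# Sketch assuming scikit-learn >= 0.19 (cross_validate) and binary labels.
from sklearn.model_selection import cross_validate

def eval_cv5_modern(model, x, y):
    scores = cross_validate(model, x, y, cv=5,
                            scoring=('accuracy', 'precision', 'recall', 'f1'))
    return (scores['test_accuracy'].mean(), scores['test_precision'].mean(),
            scores['test_recall'].mean(), scores['test_f1'].mean())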
def __init__(self, estimator_cls, parameter_grid, score_fns,
             nfolds=10, shuffle=False, seed=None, njobs=1,
             checkpoint_path=None):
    self.estimator_cls = estimator_cls
    self.parameter_grid = parameter_grid
    self.nfolds = nfolds
    self.seed = seed
    assert njobs == 1, "# jobs > 1 not supported."
    self.njobs = njobs
    assert _is_arraylike(score_fns)
    self.score_fns = score_fns
    self.checkpoint_path = checkpoint_path
    self.grid_scores = None
    # No sample count is passed, so KFold here must be a splitter that takes
    # the fold count directly, unlike the old cross_validation.KFold, which
    # requires the number of samples as its first argument.
    self.kf = KFold(n_folds=self.nfolds,
                    shuffle=shuffle,
                    random_state=seed)
def cached_run(steps, X, y):
    step_identifier = ''
    # Split the data.
    n = len(y)
    kf = KFold(n, _n_fold, random_state=_random_state)
    folded_data = [(X[train_index], y[train_index], X[test_index], y[test_index])
                   for train_index, test_index in kf]
    # The last step is the estimator; handle it separately.
    for step in steps[:-1]:
        step_identifier += "/%s" % _step_identifier(step)
        logger.info("Processing %s", step_identifier)
        folded_data = run_step_on_demand(step_identifier, step, folded_data)
    scores = []
    estimator = steps[-1]
    step_identifier += "/%s" % _step_identifier(estimator)
    for (X_train, y_train, X_test, y_test) in folded_data:
        estimator.fit(X_train, y_train)
        scores.append(estimator.score(X_test, y_test))
    score = np.mean(scores)
    logger.info("score of %s is %r", step_identifier, score)
    return score
def k_fold_sample_data_set(x, y, folds):
    """
    This function uses a k-fold approach as a re-sampling strategy.
    :param x: numpy array
        - The train data
    :param y: numpy array
        - The actual value of each data sample
    :param folds: integer
        - The number of folds used to split the data set
    :return: list of lists
        - The training and test samples extracted from the training set
    """
    x_train_list, y_train_list, x_test_list, y_test_list = list(), list(), list(), list()
    try:
        kf = KFold(x.shape[0], n_folds=folds, shuffle=True)
        for train_index, test_index in kf:
            x_train_list.append(x[train_index])
            y_train_list.append(y[train_index])
            x_test_list.append(x[test_index])
            y_test_list.append(y[test_index])
        return x_train_list, y_train_list, x_test_list, y_test_list
    except AttributeError as e:
        print(e.args, "- Please use numpy arrays as inputs")
        exit()
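A hedged usage sketch for the helper above, on synthetic arrays:

# Usage sketch with synthetic data (shapes are illustrative).
import numpy as np

x = np.random.rand(100, 4)
y = np.random.rand(100)
x_trains, y_trains, x_tests, y_tests = k_fold_sample_data_set(x, y, folds=5)
print(len(x_trains), x_trains[0].shape, x_tests[0].shape)  # 5 (80, 4) (20, 4)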
def run_example():
    data, target = _get_data()
    n_folds = 5
    accuracy = 0.0
    for (train_idx, test_idx) in KFold(n=len(data), n_folds=n_folds, shuffle=True):
        train_X = data[train_idx]
        train_y = target[train_idx]
        test_X = data[test_idx]
        test_y = target[test_idx]
        model = SGDClassifier()
        model.fit(train_X, train_y)
        predictions = model.predict(test_X)
        accuracy += accuracy_score(predictions, test_y)
    return accuracy / n_folds
def kfold_train_and_predict(X, Y, classifier, k=5, indices=None, features=None):
    if indices is None:
        indices = np.array(list(range(X.shape[0])))
    if features is None:
        features = np.array(list(range(X.shape[1])))
    kf = cross_validation.KFold(len(indices), n_folds=k)
    accurs = []
    for train, test in kf:
        train_ind = indices[train].astype("int")
        test_ind = indices[test].astype("int")
        classifier.fit(X[train_ind, :][:, features], Y[train_ind])
        accurs += [classifier.score(X[test_ind, :][:, features], Y[test_ind])]
    accurs = np.array(accurs)
    return np.mean(accurs), np.std(accurs)
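A hedged usage sketch, restricting the classifier to a feature subset (the estimator and data are assumptions, not from the original project):

# Usage sketch with an assumed estimator and synthetic data.
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.random.rand(200, 10)
Y = np.random.randint(0, 2, size=200)
mean_acc, std_acc = kfold_train_and_predict(X, Y, LogisticRegression(), k=5,
                                            features=np.array([0, 2, 5]))
print("accuracy: %.3f +/- %.3f" % (mean_acc, std_acc))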
def run_model(model, dtrain, predictor_var, target, scoring_method='mean_squared_error'):
    cv_method = KFold(len(dtrain), 5)
    cv_scores = cross_val_score(model, dtrain[predictor_var], dtrain[target], cv=cv_method, scoring=scoring_method)
    #print cv_scores, np.mean(cv_scores), np.sqrt((-1)*np.mean(cv_scores))
    dtrain_for_val = dtrain[dtrain['Year'] < 2000]
    dtest_for_val = dtrain[dtrain['Year'] > 1999]
    #cv_method = KFold(len(dtrain_for_val),5)
    #cv_scores_2 = cross_val_score(model,dtrain_for_val[predictor_var],dtrain_for_val[target],cv=cv_method,scoring=scoring_method)
    #print cv_scores_2, np.mean(cv_scores_2)
    dtrain_for_val_ini = dtrain_for_val[predictor_var]
    dtest_for_val_ini = dtest_for_val[predictor_var]
    model.fit(dtrain_for_val_ini, dtrain_for_val[target])
    pred_for_val = model.predict(dtest_for_val_ini)
    #print math.sqrt(mean_squared_error(dtest_for_val['Footfall'],pred_for_val))
Source: methods.py (project: South-African-Heart-Disease-data-analysis-using-python, author: khushi4tiwari)
def getTestAndTrainingSet(X, y, K=10):
    N = len(X)
    CV = cross_validation.KFold(N, K, shuffle=True)
    k = 0
    for train_index, test_index in CV:
        # Extract the training and test set for the current CV fold.
        X_train = X[train_index, :]
        y_train = y[train_index, :]
        X_test = X[test_index, :]
        y_test = y[test_index, :]
        k += 1
        if k == K:
            # Only the split from the final fold is returned.
            return (X_train, y_train), (X_test, y_test)
def cv(feature_dict, feature, polarity, folds):
    kfold = KFold(len(polarity), n_folds=folds)
    count, f1, recall, precision, accuracy = 0, 0, 0, 0, 0
    for train, test in kfold:
        LR = LogisticRegression()
        count += 1
        x = [feature[i] for i in train]
        y = [polarity[i] for i in train]
        LR.fit(scipy.sparse.vstack(x), y)
        test_label = []
        answer_label = [polarity[j] for j in test]
        for j in test:
            query = feature[j]
            if query.shape[1] != len(feature_dict):
                # No matching feature space for this query; record the fallback label.
                test_label.append(-1)
            else:
                test_label.append(int(predict(LR, query)[0]))
        accuracy += accuracy_score(answer_label, test_label)
        precision += precision_score(answer_label, test_label)
        recall += recall_score(answer_label, test_label)
        f1 += f1_score(answer_label, test_label)
        print('{}_fold finished.'.format(count))
    return accuracy, precision, recall, f1
def cv(feature_dict, feature, polarity, folds):
    kfold = KFold(len(polarity), n_folds=folds)
    count, f1, recall, precision, accuracy = 0, 0, 0, 0, 0
    for train, test in kfold:
        LR = LogisticRegression()
        count += 1
        x = [feature[i] for i in train]
        y = [polarity[i] for i in train]
        LR.fit(scipy.sparse.vstack(x), y)
        test_label = []
        answer_label = [polarity[j] for j in test]
        for j in test:
            query = feature[j]
            if query.shape[1] != len(feature_dict):
                # No matching feature space for this query; record the fallback score.
                test_label.append(-1)
            else:
                # Presumably the positive-class probability, used as the PR-curve score.
                test_label.append(predict(LR, query)[1][1])
        # Returns the precision-recall curve computed on the first fold only.
        pre, rec, thr = precision_recall_curve(answer_label, test_label)
        return pre, rec, thr
Source: convolutional_sparseFiltering.py (project: hco-experiments, author: zooniverse)
def cross_validate_Softmax(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    from sklearn.cross_validation import KFold
    m = len(np.squeeze(Y))
    CGrid = [0.1, 0.03, 0.01, 0.003, 0.001, 3e-4, 1e-4, 3e-5, 1e-5]
    kf = KFold(m, n_folds=n_folds)
    mean_FoMs = []
    for C in CGrid:
        fold = 1
        FoMs = []
        for train, test in kf:
            print("[+] training Softmax: LAMBDA : %e, fold : %d" % (C, fold))
            prefix = "cv/cv_fold%d" % fold
            FoM, threshold = train_Softmax(C, dataFile, X[train], Y[train], X[test], Y[test],
                                           pooledFile, imageDim, sgd, prefix=prefix)
            FoMs.append(FoM)
            fold += 1
        mean_FoMs.append(np.mean(FoMs))
    best_FoM_index = np.argmin(mean_FoMs)
    print("[+] Best performing classifier: C : %lf" % CGrid[best_FoM_index])
    return CGrid[best_FoM_index]
Source: convolutional_sparseFiltering.py (project: hco-experiments, author: zooniverse)
def cross_validate_SoftMaxOnline(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    from sklearn.cross_validation import KFold
    m = len(np.squeeze(Y))
    CGrid = [10, 3, 1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001]
    kf = KFold(m, n_folds=n_folds, indices=False)
    mean_FoMs = []
    for C in CGrid:
        fold = 1
        FoMs = []
        for train, test in kf:
            print("[+] training SoftMaxOnline: LAMBDA : %e, fold : %d" % (C, fold))
            prefix = "cv/cv_fold%d" % fold
            FoM, threshold = train_SoftMaxOnline(C, dataFile, X[train], Y[train], X[test], Y[test],
                                                 pooledFile, imageDim, sgd, prefix=prefix)
            FoMs.append(FoM)
            fold += 1
        mean_FoMs.append(np.mean(FoMs))
    best_FoM_index = np.argmin(mean_FoMs)
    print("[+] Best performing classifier: C : %lf" % CGrid[best_FoM_index])
    return CGrid[best_FoM_index]
Source: convolutional_sparseFiltering.py (project: hco-experiments, author: zooniverse)
def cross_validate_linearSVM(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    from sklearn.cross_validation import KFold
    m = len(np.squeeze(Y))
    CGrid = [10, 3, 1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001]
    kf = KFold(m, n_folds=n_folds, indices=False)
    mean_FoMs = []
    for C in CGrid:
        fold = 1
        FoMs = []
        for train, test in kf:
            print("[+] training linear SVM: C : %e, fold : %d" % (C, fold))
            prefix = "cv/cv_fold%d" % fold
            FoM, threshold = train_linearSVM(C, dataFile, X[train], Y[train], X[test], Y[test],
                                             pooledFile, imageDim, sgd, prefix=prefix)
            FoMs.append(FoM)
            fold += 1
        mean_FoMs.append(np.mean(FoMs))
    best_FoM_index = np.argmin(mean_FoMs)
    print("[+] Best performing classifier: C : %lf" % CGrid[best_FoM_index])
    return CGrid[best_FoM_index]
Source: convolutional_sparseFiltering.py (project: hco-experiments, author: zooniverse)
def cross_validate_Softmax(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    from sklearn.cross_validation import KFold
    m = len(np.squeeze(Y))
    CGrid = [0.1, 0.03, 0.01, 0.003, 0.001, 3e-4, 1e-4, 3e-5, 1e-5]
    kf = KFold(m, n_folds=n_folds, indices=False)
    mean_FoMs = []
    for C in CGrid:
        fold = 1
        FoMs = []
        for train, test in kf:
            print("[+] training Softmax: LAMBDA : %e, fold : %d" % (C, fold))
            prefix = "cv/cv_fold%d" % fold
            FoM, threshold = train_Softmax(C, dataFile, X[train], Y[train], X[test], Y[test],
                                           pooledFile, imageDim, sgd, prefix=prefix)
            FoMs.append(FoM)
            fold += 1
        mean_FoMs.append(np.mean(FoMs))
    best_FoM_index = np.argmin(mean_FoMs)
    print("[+] Best performing classifier: C : %lf" % CGrid[best_FoM_index])
    return CGrid[best_FoM_index]
Source: convolutional_sparseFiltering.py (project: hco-experiments, author: zooniverse)
def cross_validate_linearSVM(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    from sklearn.cross_validation import KFold
    m = len(np.squeeze(Y))
    CGrid = [10, 3, 1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001]
    kf = KFold(m, n_folds=n_folds, indices=False)
    mean_FoMs = []
    for C in CGrid:
        fold = 1
        FoMs = []
        for train, test in kf:
            print("[+] training linear SVM: C : %e, fold : %d" % (C, fold))
            prefix = "cv/cv_fold%d" % fold
            FoM, threshold = train_linearSVM(C, dataFile, X[train], Y[train], X[test], Y[test],
                                             pooledFile, imageDim, sgd, prefix=prefix)
            FoMs.append(FoM)
            fold += 1
        mean_FoMs.append(np.mean(FoMs))
    best_FoM_index = np.argmin(mean_FoMs)
    print("[+] Best performing classifier: C : %lf" % CGrid[best_FoM_index])
    return CGrid[best_FoM_index]
def knn_cv(post_features, post_class, n_folds, n_neighbors, length_dataset=-1):
    if length_dataset == -1:
        length_dataset = len(post_class)
    cv = KFold(n=length_dataset, n_folds=n_folds, shuffle=True)
    train_accuracy = []
    test_accuracy = []
    for train, test in cv:
        knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
        knn.fit(post_features[train], post_class[train])
        train_accuracy.append(knn.score(post_features[train], post_class[train]))
        test_accuracy.append(knn.score(post_features[test], post_class[test]))
    # return (sum(train_accuracy)/n_folds), (sum(test_accuracy)/n_folds)
    return np.mean(train_accuracy), np.mean(test_accuracy)
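A hedged usage sketch that sweeps the neighbour count with the helper above (the data is synthetic and illustrative; it assumes the same numpy/neighbors imports the snippet uses):

# Usage sketch: compare train/test accuracy across neighbour counts.
import numpy as np

post_features = np.random.rand(300, 8)
post_class = np.random.randint(0, 2, size=300)
for k in (1, 3, 5, 11):
    train_acc, test_acc = knn_cv(post_features, post_class, n_folds=5, n_neighbors=k)
    print("k=%2d train=%.3f test=%.3f" % (k, train_acc, test_acc))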