import numpy as np
from sklearn import model_selection
from sklearn.base import clone

def train_and_calibrate_cv(model, X_tr, y_tr, cv=5):
    # prob_calibration_function (used below) is an external helper from the original project
    y_pred_xval = np.zeros(len(y_tr))
    skf = model_selection.StratifiedKFold(n_splits=cv, shuffle=True)
    for i, (train, test) in enumerate(skf.split(X_tr, y_tr), start=1):
        print("training fold {} of {}".format(i, cv))
        X_train_xval = np.array(X_tr)[train, :]
        X_test_xval = np.array(X_tr)[test, :]
        y_train_xval = np.array(y_tr)[train]
        # Fit a fresh clone on each fold so the original estimator stays untouched
        model_copy = clone(model)
        model_copy.fit(X_train_xval, y_train_xval)
        y_pred_xval[test] = model_copy.predict_proba(X_test_xval)[:, 1]
    print("training full model")
    model_copy = clone(model)
    model_copy.fit(X_tr, y_tr)
    print("calibrating function")
    calib_func = prob_calibration_function(y_tr, y_pred_xval)
    return model_copy, calib_func
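A hypothetical usage sketch (assumes `prob_calibration_function` is available from the original project and returns a callable mapping raw scores to calibrated probabilities):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    # Toy data; in practice X_tr/y_tr come from your training set.
    X, y = make_classification(n_samples=500, random_state=0)
    model, calib = train_and_calibrate_cv(RandomForestClassifier(random_state=0), X, y, cv=5)
    calibrated = calib(model.predict_proba(X)[:, 1])  # calibrated positive-class probabilities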
Example source code for Python's StratifiedKFold() class
def rf1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 300
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    for n, (itrain, ival) in enumerate(skf.split(train2, y)):
        print('step %d of %d' % (n + 1, skf.n_splits), now())
        clf = ensemble.RandomForestRegressor(n_estimators=1000,
                                             max_depth=3,
                                             random_state=13)
        clf.fit(train2[itrain], y[itrain])
        p = clf.predict(train2[ival])
        v.loc[ival, cname] += p
        score = metrics.log_loss(y[ival], p)
        z[cname] += np.log1p(clf.predict(test2))
        print(cname, 'step %d: score' % (n + 1), score, now())
        scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits
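The `v`/`z` bookkeeping above is the usual out-of-fold (OOF) stacking pattern: each fold writes its validation predictions into `v` and accumulates its test predictions into `z`, which is averaged at the end. A self-contained sketch of the same pattern with plain NumPy arrays (all names here are illustrative):

    import numpy as np
    from sklearn import ensemble, metrics, model_selection

    rng = np.random.RandomState(0)
    X = rng.rand(200, 5)
    y = (X[:, 0] + 0.1 * rng.randn(200) > 0.5).astype(int)
    X_test = rng.rand(50, 5)

    oof = np.zeros(len(y))             # per-sample validation predictions ("v")
    test_pred = np.zeros(len(X_test))  # accumulated test predictions ("z")
    skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for itrain, ival in skf.split(X, y):
        clf = ensemble.RandomForestClassifier(n_estimators=100, random_state=13)
        clf.fit(X[itrain], y[itrain])
        oof[ival] = clf.predict_proba(X[ival])[:, 1]
        test_pred += clf.predict_proba(X_test)[:, 1]
    test_pred /= skf.get_n_splits()    # average the test predictions over folds
    print('validation loss:', metrics.log_loss(y, oof))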
def get_split(self):
    if self.split is not None:
        return
    name = "{}/split.p".format(self.flags.data_path)
    split = load_pickle(None, name, [])
    if len(split) == 0:
        #data = self.data["training_variants"].append(self.data["test_variants_filter"])
        data = self.data["training_variants"]
        y = data['Class'] - 1
        X = np.arange(y.shape[0])
        from sklearn.model_selection import StratifiedKFold
        skf = StratifiedKFold(n_splits=self.flags.folds, shuffle=True, random_state=99)
        split = [(train_index, test_index) for train_index, test_index in skf.split(X, y)]
        save_pickle(split, name)
        print("new shuffle")
    self.split = split
    #print("split va", split[0][1][:10])
inbreast.py source, from project deep-mil-for-whole-mammogram-classification (author: wentaozhu)
def cvsplit(fold, totalfold, mydict):
    '''Get the train/test split for one fold.
    fold is the index of the returned fold, from 0 to totalfold-1
    totalfold is the number of cross-validation folds
    mydict is the dict returned by readlabel'''
    skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is False, which is fine here
    #readdicom(mydict)
    y = list(mydict.values())
    x = list(mydict.keys())
    count = 0
    for train, test in skf.split(x, y):
        print(len(train), len(test))
        if count == fold:
            #print(test)
            return train, test
        count += 1
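Hypothetical usage, with a toy label dict standing in for the output of `readlabel`:

    labels = {'scan%02d' % i: i % 2 for i in range(10)}  # filename -> class
    train_idx, test_idx = cvsplit(fold=0, totalfold=5, mydict=labels)
    filenames = list(labels.keys())
    train_files = [filenames[i] for i in train_idx]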
def __init__(self, name, X, y, task, test_size=None, cv=None, random_state=42):
    self.name = name
    self.X = X
    self.y = y
    self.task = task
    self.random_state = random_state
    if test_size is not None:
        self.test_size = test_size
        self.validation_method = "train_test_split"
        self.X_train, self.X_test, self.y_train, self.y_test = \
            model_selection.train_test_split(self.X, self.y, test_size=test_size, random_state=random_state)
    elif cv is not None:
        self.validation_method = "cv"
        if task == "regression":
            # shuffle must be enabled for random_state to take effect
            self.kfold = model_selection.KFold(n_splits=cv, shuffle=True, random_state=random_state)
        elif task == "classification":
            self.kfold = model_selection.StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
def _sfn(l, mask, myrad, bcast_var):
    """Score classifier on searchlight data using cross-validation.
    The classifier is in `bcast_var[2]`. The labels are in `bcast_var[0]`. The
    number of cross-validation folds is in `bcast_var[1]`.
    """
    clf = bcast_var[2]
    data = l[0][mask, :].T
    # print(l[0].shape, mask.shape, data.shape)
    skf = model_selection.StratifiedKFold(n_splits=bcast_var[1],
                                          shuffle=False)
    accuracy = np.mean(model_selection.cross_val_score(clf, data,
                                                       y=bcast_var[0],
                                                       cv=skf,
                                                       n_jobs=1))
    return accuracy
def example_of_cross_validation_using_model_selection(raw_data, labels, num_subjects, num_epochs_per_subj):
    # NOTE: this method does not work for sklearn.svm.SVC with a precomputed kernel
    # when the kernel matrix is computed in portions; also, this method only works
    # for self-correlation, i.e. correlation between the same data matrix.
    # no shrinking, set C=1
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    #logit_clf = LogisticRegression()
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    # leave-one-subject-out cross validation, no shuffling
    skf = model_selection.StratifiedKFold(n_splits=num_subjects,
                                          shuffle=False)
    scores = model_selection.cross_val_score(clf, list(zip(raw_data, raw_data)),
                                             y=labels,
                                             cv=skf)
    print(scores)
    logger.info(
        'the overall cross validation accuracy is %.2f' %
        np.mean(scores)
    )
def setBestParameters(self):
    cv = StratifiedKFold(n_splits=self.conf.num_folds)
    param_grid = self.conf.getParamGrid()
    if param_grid is None:
        # No parameter value to select
        return
    if self.conf.families_supervision:
        scoring = 'f1_macro'
    else:
        scoring = 'roc_auc'
    grid_search = GridSearchCV(self.pipeline, param_grid=param_grid,
                               scoring=scoring,
                               cv=cv,
                               n_jobs=-1,
                               fit_params={'model__sample_weight': self.datasets.sample_weight})
    grid_search.fit(self.datasets.train_instances.getFeatures(),
                    self.getSupervision(self.datasets.train_instances))
    self.conf.setBestValues(grid_search)
    self.pipeline.set_params(**self.conf.getBestValues())
    return cv
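The same pattern outside the class, as a self-contained sketch; the step name 'model' mirrors the `model__sample_weight` routing above (in recent scikit-learn releases fit parameters are passed to `fit()` rather than to the `GridSearchCV` constructor):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV, StratifiedKFold
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    X, y = make_classification(n_samples=300, random_state=0)
    pipe = Pipeline([('scale', StandardScaler()),
                     ('model', LogisticRegression(max_iter=1000))])
    grid = GridSearchCV(pipe,
                        param_grid={'model__C': [0.1, 1.0, 10.0]},
                        scoring='roc_auc',
                        cv=StratifiedKFold(n_splits=5),
                        n_jobs=-1)
    grid.fit(X, y)  # sample weights, if any, would be passed here as model__sample_weight=...
    pipe.set_params(**grid.best_params_)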
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    features = np.array([[2, 0],
                         [0, 0],
                         [0, 1],
                         [0, 0],
                         [0, 0],
                         [0, 0],
                         [0, 1],
                         [0, 0],
                         [0, 0],
                         [0, 1],
                         [0, 0],
                         [0, 0],
                         [0, 0],
                         [1, 1],
                         [1, 1]])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True))
    assert np.mean(cv_scores) > 0.
def test_mdr_sklearn_pipeline_parallel():
    """Ensure that MDR can be used as a transformer in a parallelized scikit-learn pipeline"""
    features = np.array([[2, 0],
                         [0, 0],
                         [0, 1],
                         [0, 0],
                         [0, 0],
                         [0, 0],
                         [0, 1],
                         [0, 0],
                         [0, 0],
                         [0, 1],
                         [0, 0],
                         [0, 0],
                         [0, 0],
                         [1, 1],
                         [1, 1]])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True), n_jobs=-1)
    assert np.mean(cv_scores) > 0.
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
    """K-Folds cross validation iterator.
    Parameters
    ----------
    k : int, default 5
    stratify : bool, default False
    shuffle : bool, default True
    seed : int, default 33
    Yields
    ------
    X_train, y_train, X_test, y_test, train_index, test_index
    """
    if stratify:
        kf = StratifiedKFold(n_splits=k, random_state=seed, shuffle=shuffle)
    else:
        kf = KFold(n_splits=k, random_state=seed, shuffle=shuffle)
    for train_index, test_index in kf.split(self.X_train, self.y_train):
        X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
        X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
        yield X_train, y_train, X_test, y_test, train_index, test_index
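Hypothetical usage of the generator (assumes the owning object exposes `X_train`/`y_train` and that the `idx` helper indexes both arrays and DataFrames):

    for X_tr, y_tr, X_te, y_te, tr_idx, te_idx in dataset.kfold(k=5, stratify=True):
        model.fit(X_tr, y_tr)
        print(model.score(X_te, y_te))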
def predict_training(self, folds=5):
    """Do cross-validation and return probabilities for each data point.
    Args:
        folds (int): Number of folds used for prediction on training data.
    """
    partial_clf = linear_model.LogisticRegression(class_weight='balanced')
    prediction = np.zeros((len(self.features), self.num_classes))
    skf = StratifiedKFold(n_splits=folds)
    for train_index, test_index in skf.split(self.features, self.labels):
        # prepare the training and test data
        training_features = self.features[train_index]
        test_features = self.features[test_index]
        training_labels = self.labels[train_index]
        # fitting the model and predicting
        partial_clf.fit(training_features, training_labels)
        curr_pred = partial_clf.predict_proba(test_features)
        prediction[test_index] = \
            self.predict_proba_ordered(curr_pred, partial_clf.classes_)
    return prediction
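`predict_proba_ordered` is not shown in this snippet; a plausible sketch of what such a helper does, namely placing the fold-level `predict_proba` columns at the positions of the globally known classes (an assumption about the real implementation):

    import numpy as np

    def predict_proba_ordered(probs, classes_, all_classes):
        """Map fold-level probability columns onto the global (sorted) class set,
        leaving zeros for classes absent from this fold's training data."""
        out = np.zeros((probs.shape[0], len(all_classes)))
        out[:, np.searchsorted(all_classes, classes_)] = probs
        return out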
def predict_training(self, folds=5):
    """Do cross-validation and return probabilities for each data point.
    Args:
        folds (int): Number of folds used for prediction on training data.
    """
    prediction = np.zeros((len(self.strings), self.num_classes))
    skf = StratifiedKFold(n_splits=folds)
    for train_index, test_index in skf.split(self.strings, self.labels):
        # prepare the training and test data
        training_strings = self.strings[train_index]
        test_strings = self.strings[test_index]
        training_labels = self.labels[train_index]
        # predicting the results
        part_prediction = self.find_knn(training_strings, training_labels,
                                        test_strings)
        prediction[test_index] = part_prediction
    return prediction
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # .iloc replaces the long-deprecated .ix; the fold indices are positional
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True, random_state=s + base_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
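Note the difference between the two variants: the first leaves the splitter's `random_state` unset, so each seed pass draws a fresh, non-reproducible shuffle, while the second re-creates the splitter with `random_state=s + base_seed`, so every seed averages over a different but reproducible fold partition.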
def test_stratified_kfold_ratios():
    # Check that stratified kfold preserves class ratios in individual splits
    # Repeat with shuffling turned off and on
    n_samples = 1000
    X = np.ones(n_samples)
    y = np.array([4] * int(0.10 * n_samples) +
                 [0] * int(0.89 * n_samples) +
                 [1] * int(0.01 * n_samples))
    for shuffle in (False, True):
        for train, test in StratifiedKFold(5, shuffle=shuffle).split(X, y):
            assert_almost_equal(np.sum(y[train] == 4) / len(train), 0.10, 2)
            assert_almost_equal(np.sum(y[train] == 0) / len(train), 0.89, 2)
            assert_almost_equal(np.sum(y[train] == 1) / len(train), 0.01, 2)
            assert_almost_equal(np.sum(y[test] == 4) / len(test), 0.10, 2)
            assert_almost_equal(np.sum(y[test] == 0) / len(test), 0.89, 2)
            assert_almost_equal(np.sum(y[test] == 1) / len(test), 0.01, 2)
def test_stratifiedkfold_balance():
    # Check that StratifiedKFold returns folds with balanced sizes (only when
    # stratification is possible)
    # Repeat with shuffling turned off and on
    X = np.ones(17)
    y = [0] * 3 + [1] * 14
    for shuffle in (True, False):
        cv = StratifiedKFold(3, shuffle=shuffle)
        for i in range(11, 17):
            skf = cv.split(X[:i], y[:i])
            sizes = []
            for _, test in skf:
                sizes.append(len(test))
            assert_true((np.max(sizes) - np.min(sizes)) <= 1)
            assert_equal(np.sum(sizes), i)
def transform(self, M, **kwargs):
    """
    Takes a dataframe that has an :code:`item_id` index and other
    'features' columns for prediction, and applies a Keras sequential
    model to it.
    :param M:
        a dataframe that has an :code:`item_id` index, and
        "features" columns.
    :type M: pandas.DataFrame
    :rtype: a tuple with the trained Keras model and its keyword
        arguments
    """
    rows, columns = M.shape
    factors = M.merge(self.validation_matrix, left_index=True,
                      right_index=True)
    factors = factors.values
    if self.classification:
        kfold = StratifiedKFold(n_splits=self.kfold_n_splits,
                                random_state=self.kfold_seed,
                                shuffle=self.kfold_shuffle)
    else:
        kfold = KFold(n_splits=self.kfold_n_splits,
                      random_state=self.kfold_seed,
                      shuffle=self.kfold_shuffle)
    X = factors[:, :columns]
    Y = factors[:, columns:]
    for train_index, test_index in kfold.split(X, Y):
        # validate on the held-out fold, not on the training targets
        self.keras_model.fit(
            X[train_index], Y[train_index],
            validation_data=(X[test_index], Y[test_index]),
            **self.keras_kwargs)
    return self.keras_model, kwargs
inbreast.py source, from project deep-mil-for-whole-mammogram-classification (author: wentaozhu)
def cvsplitenhance(fold, totalfold, mydict, valfold=-1):
    '''Get the train/validation/test split for one fold.
    fold is the index of the returned fold, from 0 to totalfold-1
    totalfold is the number of cross-validation folds
    mydict is the dict returned by readlabel
    separates the data into train, validation, and test sets'''
    skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is False, which is fine here
    #readdicom(mydict)
    y = list(mydict.values())
    x = list(mydict.keys())
    count = 0
    if valfold == -1:
        valfold = (fold + 1) % totalfold
    print('valfold' + str(valfold))
    trainls, valls, testls = [], [], []
    for train, test in skf.split(x, y):
        print(len(train), len(test))
        if count == fold:
            #print(test[:])
            testls = test[:]
        elif count == valfold:
            valls = test[:]
        else:
            for i in test:
                trainls.append(i)
        count += 1
    return trainls, valls, testls
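Hypothetical usage: with `fold=0` and the default `valfold=-1`, fold 0 becomes the test set, fold 1 the validation set, and the remaining folds the training set:

    labels = {'scan%02d' % i: i % 3 for i in range(30)}  # filename -> class
    trainls, valls, testls = cvsplitenhance(fold=0, totalfold=5, mydict=labels)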
def k_fold_validation(model, monitored_data, unmonitored_data, k, random_state=123):
    """
    Performs k-fold validation on a model. During each fold, records all of the scores from the `scoring_methods` module.
    @param model is a machine learning model that has the functions `fit(X, y)` and `predict(X)`
    @param monitored_data an array-like matrix that has the following structure `[(features, value)]`
    @param unmonitored_data is also an array-like object: [features]
    @param k is the number of folds
    @return a list of length `k` with one `{scoring_method: score}` dict per fold
    """
    X, y = get_X_y(monitored_data, unmonitored_data)
    skf = StratifiedKFold(n_splits=k, random_state=random_state, shuffle=True)
    evaluations = []
    for i, (train, test) in enumerate(skf.split(X, y), start=1):
        print("Starting split {}".format(i))
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        print("Fitting data")
        model.fit(X_train, y_train)
        print("Predicting")
        prediction = model.predict(X_test)
        evaluations.append(scoring_methods.evaluate_model(prediction, y_test))
        print(evaluations[-1])
    return evaluations
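`get_X_y` is a project helper; a minimal sketch of the contract assumed above, where monitored samples keep their labels and unmonitored traffic collapses into one background class (an assumption, the real helper is not shown):

    import numpy as np

    def get_X_y(monitored_data, unmonitored_data, background_label=-1):
        X = [f for f, _ in monitored_data] + list(unmonitored_data)
        y = [v for _, v in monitored_data] + [background_label] * len(unmonitored_data)
        return np.asarray(X), np.asarray(y)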
def _cross_validation_for_one_voxel(clf, vid, num_folds, subject_data, labels):
    """Score classifier on data using cross validation."""
    # no shuffling in cv
    skf = model_selection.StratifiedKFold(n_splits=num_folds,
                                          shuffle=False)
    scores = model_selection.cross_val_score(clf, subject_data,
                                             y=labels,
                                             cv=skf, n_jobs=1)
    logger.debug(
        'cross validation for voxel %d is done' %
        vid
    )
    return (vid, scores.mean())
def split_kfold_c(y):
    skf = StratifiedKFold(5)
    ilst = []
    for tri, tei in skf.split(np.zeros(len(y)), y):
        ilst.append((tri, tei))
    return ilst
def get_cv_method(method, **kwargs):
    if method == 'kfold':
        return KFold(**kwargs)
    elif method == 'skfold':
        return StratifiedKFold(**kwargs)
    elif method == 'loo':
        return LeaveOneOut()
    elif method == 'shuffle_split':
        return ShuffleSplit(**kwargs)
    elif method == 'split':
        return TrainTestSplit(**kwargs)
    elif method == 's_shuffle_split':
        return StratifiedShuffleSplit(**kwargs)
    elif method == 'time_series':
        return TimeSeriesSplit(**kwargs)
    else:
        raise AttributeError('Invalid CV method - %s!' % method)
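Hypothetical usage, building a stratified 5-fold splitter by name:

    import numpy as np

    cv = get_cv_method('skfold', n_splits=5, shuffle=True, random_state=0)
    X, y = np.zeros((20, 3)), np.array([0, 1] * 10)
    for train_idx, test_idx in cv.split(X, y):
        print(len(train_idx), len(test_idx))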
def computeAccuracyForSingleModel(self, algorithm="SVM", isLocalSmall=0, execType="normal"):
    totalFeatures = self.instancesFeatures.shape[1]
    n = min(5, totalFeatures // 2)  # as explained in the article, the number of local agents will be 5
    numberOfFeaturesInEachModel = int(math.ceil(totalFeatures / n))
    if isLocalSmall:
        # select numberOfFeaturesInEachModel random columns
        instFeatures = dataPreparation.selectNRandomColumns(self.instancesFeatures, numberOfFeaturesInEachModel)
    else:
        instFeatures = np.array(self.instancesFeatures)
    skf = StratifiedKFold(n_splits=self.kFolds)
    avgScore = 0
    avgF1Macro = 0
    avgF1Micro = 0
    avgF1Weighted = 0
    for train_index, test_index in skf.split(instFeatures, self.instancesClasses):
        resultClasses = classifier.MakeClassification(self.algorithmsIndex[algorithm], instFeatures[train_index], self.instancesClasses[train_index], instFeatures[test_index], "value")
        valF1Macro = f1_score(self.instancesClasses[test_index], resultClasses, average='macro')
        valF1Micro = f1_score(self.instancesClasses[test_index], resultClasses, average='micro')
        valF1Weighted = f1_score(self.instancesClasses[test_index], resultClasses, average='weighted')
        valScore = accuracy_score(self.instancesClasses[test_index], resultClasses)
        avgF1Macro += valF1Macro
        avgF1Micro += valF1Micro
        avgF1Weighted += valF1Weighted
        avgScore += valScore
        with open(self.fileToWrite, "a") as myfile:
            myfile.write(str(valF1Weighted) + "\t" + str(valF1Micro) + "\t" + str(valF1Macro) + "\t" + str(valScore) + "\n")
    avgScore /= self.kFolds
    avgF1Macro /= self.kFolds
    avgF1Weighted /= self.kFolds
    avgF1Micro /= self.kFolds
    return avgScore, avgF1Macro, avgF1Micro, avgF1Weighted
# this function will call all the underlying methods to perform data preparation, classification in each simulated agent, and aggregation