def gs_Ridge_BIKE(A_list, yV, XX=None, alphas_log=(1, -1, 9), n_splits=5, n_jobs=-1):
    """
    A_list is a list of A matrices, where each A is a similarity matrix.
    XX is the concatenated linear descriptors.
    If no linear descriptors are used, XX can be None.
    """
    clf = binary_model.BIKE_Ridge(A_list, XX)
    params = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ln is the number of molecules.
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(clf, params, scoring='r2', cv=kf_n_c, n_jobs=n_jobs)
    # BIKE_Ridge looks samples up in the kernel matrices, so fit() receives a
    # column vector of row indices rather than the features themselves.
    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def gs_BIKE_Ridge(A_list, yV, alphas_log=(1, -1, 9), X_concat=None, n_splits=5, n_jobs=-1):
    """
    A_list is a list of A matrices, where each A is a similarity matrix.
    X_concat is the concatenated linear descriptors.
    If no linear descriptors are used, X_concat can be None.
    """
    clf = binary_model.BIKE_Ridge(A_list, X_concat)
    params = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ln is the number of molecules.
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(clf, params, scoring='r2', cv=kf_n_c, n_jobs=n_jobs)
    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
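# A self-contained sketch of the same GridSearchCV-over-KFold pattern, using a
# stock sklearn Ridge since binary_model.BIKE_Ridge is project-specific and
# assumed unavailable here; the random data is purely illustrative.
import numpy as np
from sklearn import linear_model, model_selection

def _sketch_gs_ridge(X, y, alphas_log=(1, -1, 9), n_splits=5):
    clf = linear_model.Ridge()
    params = {'alpha': np.logspace(*alphas_log)}  # 9 alphas from 10^1 to 10^-1
    cv = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(clf, params, scoring='r2', cv=cv)
    gs.fit(X, y)
    return gs

gs_demo = _sketch_gs_ridge(np.random.rand(40, 5), np.random.rand(40))
print(gs_demo.best_params_)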
def _cv_r0(method, xM, yV, alpha, n_splits=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    # pass the splitter itself as cv (the original referenced an undefined
    # kf5_ext_c generator here)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n_c, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
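# A minimal, runnable illustration of the cross_val_predict pattern used above
# (kutil.cv_show is project-specific, so the plotting step is omitted):
import numpy as np
from sklearn import linear_model, model_selection

X_demo = np.random.rand(30, 4)
y_demo = X_demo @ np.ones(4) + 0.1 * np.random.rand(30)
cv_demo = model_selection.KFold(n_splits=5, shuffle=True)
y_pred_demo = model_selection.cross_val_predict(
    linear_model.Ridge(alpha=0.5), X_demo, y_demo, cv=cv_demo)
print(y_pred_demo.shape)  # one out-of-fold prediction per sample: (30,)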
def cvLOO(method, xM, yV, alpha, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Leave-one-out cross-validation: with one fold per sample, prediction
    output is generated for every input molecule.
    """
    n_splits = xM.shape[0]
    # print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    # KFold takes n_splits only; passing the sample count positionally as
    # well would supply n_splits twice
    kf_n = model_selection.KFold(n_splits=n_splits)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
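# Setting n_splits equal to the sample count reproduces leave-one-out CV;
# sklearn also ships a dedicated splitter, sketched here on random data:
import numpy as np
from sklearn import linear_model, model_selection

X_loo = np.random.rand(10, 3)
y_loo = np.random.rand(10)
y_hat = model_selection.cross_val_predict(
    linear_model.Ridge(alpha=1.0), X_loo, y_loo,
    cv=model_selection.LeaveOneOut())
print(y_hat.shape)  # (10,): every sample predicted by the other nine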
def gs_Lasso(xM, yV, alphas_log=(-1, 1, 9), n_folds=5, n_jobs=-1):
    print(xM.shape, yV.shape)
    clf = linear_model.Lasso()
    params = {'alpha': np.logspace(*alphas_log)}
    # model_selection.KFold takes n_splits (n_folds was the keyword of the
    # deprecated cross_validation.KFold); pass the splitter itself as cv so
    # it is not consumed as a one-shot generator
    kf5_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    gs = model_selection.GridSearchCV(
        clf, params, scoring='r2', cv=kf5_c, n_jobs=n_jobs)
    gs.fit(xM, yV)
    return gs
def create_training_test_sets(self):
""" Split data set into training and test folds. """
# load input data
input_data = np.asarray(np.loadtxt('input/data.txt'), dtype=np.float32)
self.input_dim = input_data.shape[1] - 1
self.output_dim = 1
# align to batch size
batches = input_data.shape[0] // (self.batch_size * self.n_splits)
input_data = input_data[:batches * (self.batch_size * self.n_splits)]
self.data_size = input_data.shape[0]
print(f'Loaded input data, shape = {input_data.shape}')
# create splits
kfold = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
print(f'Splits: {self.n_splits}')
# assume y is in the last column by default
for idx_train, idx_test in kfold.split(input_data):
self.train_x.append(input_data[idx_train, :-1])
self.train_y.append(input_data[idx_train, -1:])
self.test_x.append(input_data[idx_test, :-1])
self.test_y.append(input_data[idx_test, -1:])
# layers described as [number of neurons, dropout probability]
if self.layers_description is None:
self.layers_description = [[self.input_dim, 0.0], [100, 0.0], [100, 0.0], [self.output_dim, 0.0]]
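# The batch-alignment step above trims the data so each of the n_splits folds
# divides evenly into batches; a quick numeric check with illustrative sizes:
batch_size, n_splits, n_rows = 32, 5, 1000
batches = n_rows // (batch_size * n_splits)   # 1000 // 160 = 6 full groups
kept = batches * (batch_size * n_splits)      # 6 * 160 = 960 rows retained
print(batches, kept, n_rows - kept)           # 6 960 40 (40 rows dropped)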
def transform(self, M, **kwargs):
    """
    Takes a dataframe that has an :code:`item_id` index and other
    'features' columns for prediction, and applies a Keras sequential
    model to it.
    :param M:
        a dataframe that has an :code:`item_id` index, and
        "features" columns.
    :type M: pandas.DataFrame
    :rtype: a tuple with the trained Keras model and its keyword
        arguments
    """
    rows, columns = M.shape
    factors = M.merge(self.validation_matrix, left_index=True,
                      right_index=True)
    factors = factors.values
    if self.classification:
        kfold = StratifiedKFold(n_splits=self.kfold_n_splits,
                                random_state=self.kfold_seed,
                                shuffle=self.kfold_shuffle)
    else:
        kfold = KFold(n_splits=self.kfold_n_splits,
                      random_state=self.kfold_seed,
                      shuffle=self.kfold_shuffle)
    X = factors[:, :columns]
    Y = factors[:, columns:]
    for train_index, test_index in kfold.split(X, Y):
        self.keras_model.fit(
            X[train_index], Y[train_index],
            # validate on the held-out fold's targets, not the training ones
            validation_data=(X[test_index], Y[test_index]),
            **self.keras_kwargs)
    return self.keras_model, kwargs
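# Choosing StratifiedKFold for classification and plain KFold for regression,
# as the method above does, in a self-contained form (random data, no Keras):
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

def pick_splitter(classification, n_splits=3, seed=0):
    cls = StratifiedKFold if classification else KFold
    return cls(n_splits=n_splits, shuffle=True, random_state=seed)

X_s = np.random.rand(12, 4)
y_s = np.array([0, 1] * 6)  # class labels, only used by StratifiedKFold
for tr, te in pick_splitter(True).split(X_s, y_s):
    print(len(tr), len(te), y_s[te].mean())  # each test fold keeps the 50/50 ratio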
def setup_data(self, path):
    """Read and iteratively yield data to agent"""
    print('loading: ' + path)
    questions = []
    y = []
    # open data file with labels
    # (path will be provided to setup_data from opt['datafile'] defined above)
    with open(path) as labels_file:
        context = csv.reader(labels_file)
        next(context)
        for item in context:
            label, text = item
            questions.append(text)
            y.append([self.answer_candidates[int(label)]])
    episode_done = True
    indexes = range(len(questions))
    if self.datatype_strict != 'test':
        random_state = random.getstate()
        random.setstate(self.random_state)
        kf_seed = random.randrange(500000)
        kf = KFold(self.opt.get('bagging_folds_number'), shuffle=True,
                   random_state=kf_seed)
        i = 0
        for train_index, test_index in kf.split(questions):
            indexes = train_index if self.datatype_strict == 'train' else test_index
            if i >= self.opt.get('bagging_fold_index', 0):
                break
            i += 1  # advance the fold counter, otherwise only fold 0 is reachable
        self.random_state = random.getstate()
        random.setstate(random_state)
    # define iterator over all queries
    for i in indexes:
        # get current label, both as a digit and as a text
        # yield tuple with information and episode_done? flag
        yield (questions[i], y[i]), episode_done
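# The fold selection above amounts to "re-create the same shuffled KFold from a
# saved seed and take the i-th split"; a compact, runnable version of that idea:
from sklearn.model_selection import KFold

def nth_split(items, n_folds, fold_index, seed):
    kf = KFold(n_folds, shuffle=True, random_state=seed)
    for i, (train_idx, test_idx) in enumerate(kf.split(items)):
        if i == fold_index:
            return train_idx, test_idx

train_idx, test_idx = nth_split(list(range(10)), n_folds=5, fold_index=2, seed=42)
print(sorted(test_idx))  # the same pair comes back for the same seed and index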
def kf_worker(X_tr, Y_tr, mu_range, tr_idx, vld_idx, i, results):
"""Worker for parallel KFold implementation."""
betas = RLS_path(X_tr, Y_tr, mu_range)
results[i] = {'betas': betas, 'tr_idx': tr_idx, 'vld_idx': vld_idx}
def test_large_grid():
"""In this test, we purposely overfit a RandomForest to completely random data
in order to assert that the test error will far supercede the train error.
"""
if not SK18:
custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
else:
custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)
# define the pipe
pipe = Pipeline([
('scaler', SelectiveScaler()),
('pca', SelectivePCA(weight=True)),
('rf', RandomForestClassifier(random_state=42))
])
# define hyper parameters
hp = {
'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
'pca__whiten': [True, False],
'pca__weight': [True, False],
'pca__n_components': uniform(0.75, 0.15),
'rf__n_estimators': randint(5, 10),
'rf__max_depth': randint(5, 15)
}
# define the grid
grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42)
# this will fail because we haven't fit yet
assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)
# fit the grid
grid.fit(X_train, y_train)
# score for coverage -- this might warn...
with warnings.catch_warnings():
warnings.simplefilter("ignore")
grid.score(X_train, y_train)
# coverage:
assert grid._estimator_type == 'classifier'
# get predictions
tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)
# evaluate score (SHOULD be better than random...)
accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)
# grid score reports:
# assert fails for bad percentile
assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})
# assert fails for bad y_axis
assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})
# assert passes otherwise
report_grid_score_detail(grid, charts=True, percentile=0.95) # just ensure percentile works
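# The test above depends on skutil's selective transformers; the core pattern
# it exercises -- RandomizedSearchCV driven by an explicit KFold -- looks like
# this with stock sklearn pieces and random data:
import numpy as np
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, RandomizedSearchCV

X_r = np.random.rand(60, 5)
y_r = np.random.randint(0, 2, 60)
search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    {'n_estimators': randint(5, 10), 'max_depth': randint(2, 6)},
    n_iter=2, scoring='accuracy',
    cv=KFold(n_splits=3, shuffle=True, random_state=42),
    random_state=42)
search.fit(X_r, y_r)
print(search.best_params_)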
def adaBoost(self, settings, data=None, dropna=True):
    df = self.__loadData(data, dropna)
    features = df.columns[:-1]
    X = df[features]
    y = df.iloc[:, -1].values
    seed = 7
    num_trees = 500
    # random_state only takes effect when shuffle=True
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    print(kfold)
    model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
    results = model_selection.cross_val_score(model, X, y, cv=kfold)
    model.fit(X, y)
    print(results.mean())
    print(model.score(X, y))
    return True
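# A self-contained version of the cross_val_score call above on synthetic data
# (__loadData and the settings argument are class-specific and assumed here):
import numpy as np
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier

X_a = np.random.rand(100, 4)
y_a = (X_a[:, 0] > 0.5).astype(int)  # toy target derived from one feature
cv_a = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
scores = model_selection.cross_val_score(
    AdaBoostClassifier(n_estimators=50, random_state=7), X_a, y_a, cv=cv_a)
print(scores.mean())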
def saga_decision_function(dataset, k, link_alpha, prop_alpha, l1_ratio):
fn = cache_fname("linear_val_df", (dataset, k, link_alpha, prop_alpha,
l1_ratio))
if os.path.exists(fn):
logging.info("Loading {}".format(fn))
with open(fn, "rb") as f:
return dill.load(f)
ds = 'erule' if dataset == 'cdcp' else 'ukp-essays' # sorry
path = os.path.join("data", "process", ds, "folds", "{}", "{}")
# sorry again: get val docs
n_folds = 5 if dataset == 'ukp' else 3
load, ids = get_dataset_loader(dataset, "train")
for k_, (_, val) in enumerate(KFold(n_folds).split(ids)):
if k_ == k:
break
val_docs = list(load(ids[val]))
X_tr_link, y_tr_link = load_csr(path.format(k, 'train.npz'),
return_y=True)
X_te_link, y_te_link = load_csr(path.format(k, 'val.npz'),
return_y=True)
X_tr_prop, y_tr_prop = load_csr(path.format(k, 'prop-train.npz'),
return_y=True)
X_te_prop, y_te_prop = load_csr(path.format(k, 'prop-val.npz'),
return_y=True)
baseline = BaselineStruct(link_alpha, prop_alpha, l1_ratio)
baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)
Y_marg = baseline.decision_function(X_te_link, X_te_prop, val_docs)
with open(fn, "wb") as f:
logging.info("Saving {}".format(fn))
dill.dump((Y_marg, baseline), f)
return Y_marg, baseline
def linear_cv_score(dataset, alpha, l1_ratio, constraints):
fn = cache_fname("linear_cv_score", (dataset, alpha, l1_ratio,
constraints))
if os.path.exists(fn):
logging.info("Loading {}".format(fn))
with open(fn, "rb") as f:
return dill.load(f)
load, ids = get_dataset_loader(dataset, split="train")
n_folds = 5 if dataset == 'ukp' else 3
scores = []
for k, (tr, val) in enumerate(KFold(n_folds).split(ids)):
Y_marg, bl = saga_decision_function(dataset, k, alpha, alpha, l1_ratio)
val_docs = list(load(ids[val]))
Y_true = [doc.label for doc in val_docs]
Y_pred = bl.fast_decode(Y_marg, val_docs, constraints)
scores.append(bl._score(Y_true, Y_pred))
with open(fn, "wb") as f:
logging.info("Saving {}".format(fn))
dill.dump(scores, f)
return scores
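# saga_decision_function and linear_cv_score both rebuild the folds with an
# unshuffled KFold(n_folds), which is what keeps fold k identical across the
# two functions; a quick demonstration of that determinism:
from sklearn.model_selection import KFold

ids_demo = list(range(9))
folds_a = [tuple(val) for _, val in KFold(3).split(ids_demo)]
folds_b = [tuple(val) for _, val in KFold(3).split(ids_demo)]
print(folds_a == folds_b)  # True: without shuffling, folds are reproducible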
def svmstruct_cv_score(dataset, C, class_weight, constraints,
compat_features, second_order_features):
fn = cache_fname("svmstruct_cv_score", (dataset, C, class_weight,
constraints, compat_features,
second_order_features))
if os.path.exists(fn):
logging.info("Cached file already exists.")
with open(fn, "rb") as f:
return dill.load(f)
load, ids = get_dataset_loader(dataset, split="train")
n_folds = 5 if dataset == 'ukp' else 3
    # boolean flags: which second-order factors apply depends on the dataset
    grandparents = second_order_features and dataset == 'ukp'
    coparents = second_order_features
    siblings = second_order_features and dataset == 'cdcp'
scores = []
all_Y_pred = []
for k, (tr, val) in enumerate(KFold(n_folds).split(ids)):
train_docs = list(load(ids[tr]))
val_docs = list(load(ids[val]))
clf, Y_val, Y_pred = fit_predict(train_docs, val_docs, dataset, C,
class_weight,
constraints, compat_features,
second_order_features, grandparents,
coparents, siblings)
all_Y_pred.extend(Y_pred)
scores.append(clf.model._score(Y_val, Y_pred))
with open(fn, "wb") as f:
dill.dump((scores, all_Y_pred), f)
return scores, all_Y_pred
def split_kfold_r(y):
    kf = KFold(5)  # plain (unstratified) 5-fold split
    return list(kf.split(y))
def split_fold(in_pattern, rettrain=True, fold=0, cvs=5, include_validation=True, split_seed=0):
    """
    Splits the elements of the in_pattern into training and test sets
    :param in_pattern: string of tfrecord patterns
    :param rettrain: return training set (True) or leave-out set (False)
    :param fold: which fold to process
    :param cvs: how many folds you want
    :param include_validation: include validation set
    :param split_seed: if > 0, shuffle the files with this random seed
    :return: subset of tfrecords
    """
    assert fold < cvs
    files = gfile.Glob(in_pattern)
    if split_seed > 0:
        kf = KFold(n_splits=cvs, shuffle=True, random_state=split_seed)
    else:
        kf = KFold(n_splits=cvs)
    for i, (train, test) in enumerate(kf.split(files)):
        if i == fold:
            break
    if rettrain:
        retfiles = list(np.array(files)[train])
    else:
        retfiles = list(np.array(files)[test])
    if include_validation:
        addition = [fname.replace('train', 'validate') for fname in retfiles]
        retfiles += addition
    return retfiles
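# The same "take the i-th fold of a file list" idea without TensorFlow's gfile,
# on a made-up list of shard names:
import numpy as np
from sklearn.model_selection import KFold

shards = ['train-%02d.tfrecord' % i for i in range(10)]
kf_demo = KFold(n_splits=5, shuffle=True, random_state=1)
for i, (train, test) in enumerate(kf_demo.split(shards)):
    if i == 2:  # stop at fold 2, mirroring the break above
        break
print(list(np.array(shards)[test]))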
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10):
assert(embeddings1.shape[0] == embeddings2.shape[0])
assert(embeddings1.shape[1] == embeddings2.shape[1])
nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
nrof_thresholds = len(thresholds)
k_fold = KFold(n_splits=nrof_folds, shuffle=False)
tprs = np.zeros((nrof_folds,nrof_thresholds))
fprs = np.zeros((nrof_folds,nrof_thresholds))
accuracy = np.zeros((nrof_folds))
diff = np.subtract(embeddings1, embeddings2)
dist = np.sum(np.square(diff),1)
indices = np.arange(nrof_pairs)
for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
# Find the best threshold for the fold
acc_train = np.zeros((nrof_thresholds))
for threshold_idx, threshold in enumerate(thresholds):
_, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
best_threshold_index = np.argmax(acc_train)
for threshold_idx, threshold in enumerate(thresholds):
tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
_, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])
tpr = np.mean(tprs,0)
fpr = np.mean(fprs,0)
return tpr, fpr, accuracy
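# The per-fold logic above picks the best threshold on the training pairs and
# evaluates it on the held-out pairs; the same idea distilled onto random
# distances (calculate_accuracy is replaced by a tiny inline scorer):
import numpy as np
from sklearn.model_selection import KFold

rng = np.random.RandomState(0)
dist_demo = rng.rand(100)          # stand-in for pairwise embedding distances
same_demo = dist_demo < 0.5        # pretend ground-truth "same person" labels
thr_grid = np.arange(0.0, 1.0, 0.1)

def thr_acc(th, d, y):
    return np.mean((d < th) == y)  # accuracy of "distance below threshold"

for train, test in KFold(n_splits=10, shuffle=False).split(np.arange(100)):
    best = thr_grid[np.argmax([thr_acc(t, dist_demo[train], same_demo[train])
                               for t in thr_grid])]
    print(best, thr_acc(best, dist_demo[test], same_demo[test]))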
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
assert(embeddings1.shape[0] == embeddings2.shape[0])
assert(embeddings1.shape[1] == embeddings2.shape[1])
nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
nrof_thresholds = len(thresholds)
k_fold = KFold(n_splits=nrof_folds, shuffle=False)
val = np.zeros(nrof_folds)
far = np.zeros(nrof_folds)
diff = np.subtract(embeddings1, embeddings2)
dist = np.sum(np.square(diff),1)
indices = np.arange(nrof_pairs)
for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
# Find the threshold that gives FAR = far_target
far_train = np.zeros(nrof_thresholds)
for threshold_idx, threshold in enumerate(thresholds):
_, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
if np.max(far_train)>=far_target:
f = interpolate.interp1d(far_train, thresholds, kind='slinear')
threshold = f(far_target)
else:
threshold = 0.0
val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])
val_mean = np.mean(val)
far_mean = np.mean(far)
val_std = np.std(val)
return val_mean, val_std, far_mean
def test_using_kfold(X, y, clf, splits=5):
kf = KFold(n_splits=splits, shuffle=True)
scores = []
for k, (train, test) in enumerate(kf.split(X, y)):
logger.info("Fitting and transforming the model on one fold")
clf.fit(X[train], y[train])
score = clf.score(X[test], y[test])
logger.info("[Fold {0}] score: {1:.5f}".format(k+1, score))
scores.append(score)
utils.persistence.dump(CLF_KFOLD_DUMP_NAME, clf)
scores_mean = np.mean(scores)
logger.info("Score: {}".format(scores_mean))
return clf
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, num_folds=10):
"""Calculate TPR and FPR under different threshold, accuracy under the best threshold"""
assert (embeddings1.shape[0] == embeddings2.shape[0])
assert (embeddings1.shape[1] == embeddings2.shape[1])
num_pairs = min(len(actual_issame), embeddings1.shape[0])
num_threshold = len(thresholds)
k_fold = KFold(n_splits=num_folds, shuffle=False)
tprs = np.zeros((num_folds, num_threshold))
fprs = np.zeros((num_folds, num_threshold))
acc = np.zeros((num_folds))
diff = np.subtract(embeddings1, embeddings2)
dist = np.sum(np.square(diff), 1)
indices = np.arange(num_pairs)
for fold_id, (train_set, test_set) in enumerate(k_fold.split(indices)):
# Find the best threshold
acc_train = np.zeros((num_threshold))
for thres_id, thres in enumerate(thresholds):
_, _, acc_train[thres_id] = calculate_acc(thres, dist[train_set], actual_issame[train_set])
best_id = np.argmax(acc_train)
# Calculate tprs and fprs on test set
for thres_id, thres in enumerate(thresholds):
tprs[fold_id, thres_id], fprs[fold_id, thres_id], _ = calculate_acc(thres, dist[test_set],
actual_issame[test_set])
# Use the best threshold to calculate accuracy
_, _, acc[fold_id] = calculate_acc(thresholds[best_id], dist[test_set], actual_issame[test_set])
tpr = np.mean(tprs, 0) # true positive rate under different threshold
fpr = np.mean(fprs, 0) # false positive rate under different threshold
return tpr, fpr, acc