# Imports assumed by the snippets below; project-specific helpers
# (read_labels, collect_crf_data, models, generator, tools, Checkpoint_balanced, ...)
# come from the surrounding code bases and are not shown here.
import pickle
import time

import numpy as np
from sklearn.model_selection import GroupKFold, KFold

def generate_folds(labels_fname, folds_fname, max_n_folds=10):
"""
    Generate folds for cross-validation experiments with n = 2, ..., max_n_folds.
    Save as a pickled dict with n as key.
"""
filenames = read_labels(labels_fname)['__filenames__']
folds = {}
for n in range(2, max_n_folds + 1):
# Create folds from complete texts only
# (i.e. instances/sentences of the same text are never in different folds).
# There is no random seed, because the partitioning algorithm is deterministic.
group_k_fold = GroupKFold(n_splits=n)
# Don't bother to pass real X and Y, because they are not really used.
folds[n] = list(group_k_fold.split(filenames, filenames, filenames))
print('writing folds to ' + folds_fname)
pickle.dump(folds, open(folds_fname, 'wb'))
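# A minimal usage sketch (the file name 'folds.pkl' and the function below are
# illustrative, not part of the original project): load the pickled folds dict
# written by generate_folds() and iterate over one of its splits. Each entry is
# a (train_indices, test_indices) pair exactly as yielded by GroupKFold.split().
def example_load_folds(folds_fname='folds.pkl', n=5):
    with open(folds_fname, 'rb') as f:
        folds = pickle.load(f)
    for train_idx, test_idx in folds[n]:
        print(len(train_idx), 'train /', len(test_idx), 'test instances')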
def get_train_test_fold_filenames(true_iob_dir, use_pickle=True):
pickle_fname = '_train_test_fold_fnames.pkl'
if use_pickle:
try:
return pickle.load(open(pickle_fname, 'rb'))
except IOError:
pass
# Misuse the data collection function to get X, y and filenames.
# Since we are not interested in the actual features, we pretend true_iob_dir is a feature dir.
data = collect_crf_data(true_iob_dir, true_iob_dir)
# Now create the group-wise folds.
group_k_fold = GroupKFold(n_splits=5)
# Create folds from complete texts only (i.e. instances of the same text are never in different folds)
# Use same split for all three entities.
# Note that there is no random seed, because the output of group_k_fold.split is deterministic
# as long as the iob files are globbed in exactly the same order
splits = group_k_fold.split(data['feats'], data['Material'], data['filenames'])
fnames = np.array(data['filenames'])
train_test_fold_fnames = []
for train_idx, test_idx in splits:
train_fnames = np.unique(fnames[train_idx])
test_fnames = np.unique(fnames[test_idx])
train_test_fold_fnames.append((train_fnames, test_fnames))
pickle.dump(train_test_fold_fnames, open(pickle_fname, 'wb'))
return train_test_fold_fnames
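# Sketch of the property relied on above (toy group labels, not project data):
# GroupKFold never places instances of the same group (here: filename) in both
# the train and the test side of a fold, so the unique train and test filename
# sets are always disjoint.
def example_groups_stay_together():
    fnames = np.array(['a.iob', 'a.iob', 'b.iob', 'b.iob', 'c.iob', 'd.iob', 'e.iob'])
    X = np.zeros((len(fnames), 1))  # dummy features; GroupKFold ignores their values
    for train_idx, test_idx in GroupKFold(n_splits=5).split(X, groups=fnames):
        assert not set(fnames[train_idx]) & set(fnames[test_idx])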
def build_folds(all_xs, all_ys, advice):
domains = [get_domain(doc['url']) for doc in all_xs]
n_domains = len(set(domains))
n_relevant_domains = len(
{domain for domain, is_relevant in zip(domains, all_ys) if is_relevant})
n_folds = 4
if n_relevant_domains == 1:
advice.append(AdviceItem(
WARNING,
'Only 1 relevant domain in data means that it\'s impossible to do '
'cross-validation across domains, '
'and will likely result in model over-fitting.'
))
folds = KFold(n_splits=n_folds).split(all_xs)
else:
folds = (GroupKFold(n_splits=min(n_domains, n_folds))
.split(all_xs, groups=domains))
if 1 < n_relevant_domains < WARN_N_RELEVANT_DOMAINS:
advice.append(AdviceItem(
WARNING,
'Low number of relevant domains (just {}) '
'might result in model over-fitting.'.format(n_relevant_domains)
))
folds = two_class_folds(folds, all_ys)
if not folds:
folds = two_class_folds(KFold(n_splits=n_folds).split(all_xs), all_ys)
if not folds:
advice.append(AdviceItem(
WARNING,
            'Cannot do cross-validation, as there are no folds where '
'training data has both relevant and non-relevant examples. '
'There are too few domains or the dataset is too unbalanced.'
))
return folds
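# Illustrative sketch of the domain-grouped split used above (hypothetical URLs;
# get_domain() is stood in for by a simple string split): pages from one domain
# always land in the same fold, so every test fold contains only unseen domains.
def example_domain_folds():
    urls = ['http://a.com/1', 'http://a.com/2', 'http://b.com/1',
            'http://c.com/1', 'http://d.com/2']
    domains = [url.split('/')[2] for url in urls]  # crude stand-in for get_domain()
    X = np.zeros((len(urls), 1))  # dummy features
    for train_idx, test_idx in GroupKFold(n_splits=4).split(X, groups=domains):
        assert not {domains[i] for i in train_idx} & {domains[i] for i in test_idx}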
def train_models(data, targets, groups, model=None, cropsize=2800, batch_size=512, epochs=250, epochs_to_stop=15, rnn_epochs_to_stop=15):
    """
    Trains a cnn3adam_filter_l2 model with an LSTM on top of it on the given data,
    using a 20% validation set, and returns the two models.
    """
input_shape = list((np.array(data[0])).shape) #train_data.shape
input_shape[0] = cropsize
n_classes = targets.shape[1]
    train_idx, val_idx = next(GroupKFold(n_splits=5).split(groups, groups, groups))
train_data = [data[i] for i in train_idx]
train_target = targets[train_idx]
train_groups = groups[train_idx]
val_data = [data[i] for i in val_idx]
val_target = targets[val_idx]
val_groups = groups[val_idx]
model = models.cnn3adam_filter_l2(input_shape, n_classes) if model is None else model(input_shape, n_classes)
    g_train = generator(train_data, train_target, batch_size, val=False, cropsize=cropsize)
g_val = generator(val_data, val_target, batch_size, val=True, cropsize=cropsize)
    cb = Checkpoint_balanced(g_val, verbose=1, groups=val_groups,
                             epochs_to_stop=epochs_to_stop, plot=True, name='{}, {}'.format(model.name, 'testing'))
model.fit_generator(g_train, g_train.n_batches, epochs=epochs, callbacks=[cb], max_queue_size=1, verbose=0)
val_acc = cb.best_acc
val_f1 = cb.best_f1
print('CNN Val acc: {:.1f}, Val F1: {:.1f}'.format(val_acc*100, val_f1*100))
# LSTM training
rnn_modelfun = models.pure_rnn_do
lname = 'fc1'
seq = 6
rnn_epochs = epochs
stopafter_rnn = rnn_epochs_to_stop
features = get_activations(model, train_data + val_data, lname, batch_size*2, cropsize=cropsize)
train_data_extracted = features[0:len(train_data)]
val_data_extracted = features[len(train_data):]
assert (len(train_data)==len(train_data_extracted)) and (len(val_data)==len(val_data_extracted))
    train_data_seq, train_target_seq, train_groups_seq = tools.to_sequences(train_data_extracted, train_target, groups=train_groups, seqlen=seq)
    val_data_seq, val_target_seq, val_groups_seq = tools.to_sequences(val_data_extracted, val_target, groups=val_groups, seqlen=seq)
rnn_shape = list((np.array(train_data_seq[0])).shape)
neurons = int(np.sqrt(rnn_shape[-1])*4)
rnn_model = rnn_modelfun(rnn_shape, n_classes, layers=2, neurons=neurons, dropout=0.3)
    print('Starting RNN model {} with input from layer {} (shape {}) at {}'.format(rnn_model.name, lname, rnn_shape, time.ctime()))
    g_train = generator(train_data_seq, train_target_seq, batch_size, val=False)
g_val = generator(val_data_seq, val_target_seq, batch_size, val=True)
    cb = Checkpoint_balanced(g_val, verbose=1, groups=val_groups_seq,
                             epochs_to_stop=stopafter_rnn, plot=True, name='{}, {}'.format(rnn_model.name, 'fc1'))
    rnn_model.fit_generator(g_train, g_train.n_batches, epochs=rnn_epochs, verbose=0, callbacks=[cb], max_queue_size=1)
val_acc = cb.best_acc
val_f1 = cb.best_f1
print('LSTM Val acc: {:.1f}, Val F1: {:.1f}'.format(val_acc*100, val_f1*100))
return model, rnn_model
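# Sketch of the single group-aware 80/20 split used at the top of train_models()
# (dummy group ids): taking only the first fold of a 5-fold GroupKFold reserves
# roughly one fifth of the groups for validation while keeping every group intact.
def example_single_group_split():
    groups = np.repeat(np.arange(10), 3)  # 10 hypothetical recordings, 3 samples each
    train_idx, val_idx = next(GroupKFold(n_splits=5).split(groups, groups, groups))
    assert not set(groups[train_idx]) & set(groups[val_idx])
    print(len(train_idx), 'train samples,', len(val_idx), 'validation samples')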
def train_models_feat(data, targets, groups, batch_size=512, epochs=250, epochs_to_stop=15):
    """
    Trains an ANN and an RNN model on the given feature data,
    using a 20% validation set, and returns the two models.
    """
input_shape = list((np.array(data[0])).shape) #train_data.shape
n_classes = targets.shape[1]
    train_idx, val_idx = next(GroupKFold(n_splits=5).split(groups, groups, groups))
train_data = [data[i] for i in train_idx]
train_target = targets[train_idx]
train_groups = groups[train_idx]
val_data = [data[i] for i in val_idx]
val_target = targets[val_idx]
val_groups = groups[val_idx]
model = models.ann(input_shape, n_classes)
    g_train = generator(train_data, train_target, batch_size, val=False)
g_val = generator(val_data, val_target, batch_size, val=True)
    cb = Checkpoint_balanced(g_val, verbose=1, groups=val_groups,
                             epochs_to_stop=epochs_to_stop, plot=True, name='{}, {}'.format(model.name, 'testing'))
model.fit_generator(g_train, g_train.n_batches, epochs=epochs, callbacks=[cb], max_queue_size=1, verbose=0)
val_acc = cb.best_acc
val_f1 = cb.best_f1
    print('ANN Val acc: {:.1f}, Val F1: {:.1f}'.format(val_acc*100, val_f1*100))
# LSTM training
n_classes = targets.shape[1]
    train_idx, val_idx = next(GroupKFold(n_splits=5).split(groups, groups, groups))
train_data = np.array([data[i] for i in train_idx])
train_target = targets[train_idx]
train_groups = groups[train_idx]
val_data = np.array([data[i] for i in val_idx])
val_target = targets[val_idx]
val_groups = groups[val_idx]
train_data_seq, train_target_seq, train_groups_seq = tools.to_sequences(train_data, train_target, groups=train_groups, seqlen=6)
val_data_seq, val_target_seq, val_groups_seq = tools.to_sequences(val_data, val_target, groups=val_groups, seqlen=6)
input_shape = list((np.array(train_data_seq[0])).shape) #train_data.shape
print(input_shape)
rnn_model = models.pure_rnn_do(input_shape, n_classes)
g_train = generator(train_data_seq, train_target_seq, batch_size, val=False)
g_val = generator(val_data_seq, val_target_seq, batch_size, val=True)
    cb = Checkpoint_balanced(g_val, verbose=1, groups=val_groups_seq,
                             epochs_to_stop=epochs_to_stop, plot=True, name='{}, {}'.format(rnn_model.name, 'testing'))
rnn_model.fit_generator(g_train, g_train.n_batches, epochs=epochs, callbacks=[cb], max_queue_size=1, verbose=0)
val_acc = cb.best_acc
val_f1 = cb.best_f1
    print('LSTM Val acc: {:.1f}, Val F1: {:.1f}'.format(val_acc*100, val_f1*100))
return model, rnn_model
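# Both helpers above expect one-hot encoded targets (n_classes is read from
# targets.shape[1]) and a 1-D group array aligned with the samples. A minimal
# sketch of shaping such inputs (dummy values, hypothetical sizes):
def example_prepare_inputs():
    labels = np.array([0, 1, 2, 1, 0, 2])             # integer class labels
    targets = np.eye(3)[labels]                       # one-hot, shape (6, 3)
    groups = np.array([0, 0, 1, 1, 2, 2])             # one group id per sample
    data = [np.random.rand(3000, 1) for _ in labels]  # dummy per-sample signals
    return data, targets, groups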