def generate_dataset(self, split_dir, mode='training'):
    """Pre-process the images of one split and build its ``.h5`` file.

    The split table is read from ``<split_dir>/<mode>.csv``. An output
    directory is obtained from ``make_sub_dir(split_dir, mode)`` and one
    sub-directory is requested per distinct value of the ``self.label``
    column. If ``find_images`` reports nothing under that directory, every
    entry of the ``full_path`` column is passed to ``do_preprocess`` through
    a pool of ``self.processes`` workers. Finally ``self.generate_h5`` is
    called with the per-class image listing and ``<split_dir>/<mode>.h5``.

    Args:
        split_dir: Directory holding ``<mode>.csv``; the image directory and
            the ``.h5`` path are both derived from it.
        mode: Either ``'training'`` or ``'testing'``. The ``augment`` flag
            handed to ``do_preprocess`` is true only for ``'training'``.

    Raises:
        ValueError: If ``mode`` is neither ``'training'`` nor ``'testing'``.
    """
    if mode not in ('training', 'testing'):
        raise ValueError("Mode must be 'training' or 'testing'")

    # We only want to augment the training data.
    do_augment = mode == 'training'

    # Load the split table. ``read_csv(index_col=0, parse_dates=True)`` is the
    # documented replacement for ``DataFrame.from_csv``, which was deprecated
    # in pandas 0.21 and removed in 1.0.
    csv_path = join(split_dir, '{}.csv'.format(mode))
    split_df = pd.read_csv(csv_path, index_col=0, parse_dates=True)
    data_dir = make_sub_dir(split_dir, mode)  # output directory for images

    # Make directories for each class of images in advance.
    classes = [str(label) for label in split_df[self.label].unique()]
    for class_name in classes:
        make_sub_dir(data_dir, class_name)

    # Pre-process (and, for training, augment) the images. The call form of
    # print with a single argument behaves the same on Python 2 and 3.
    print("Preprocessing {} data...".format(mode))
    if len(find_images(join(data_dir, '*'))) == 0:
        # Skipped when images are already present, so a re-run reuses them.
        worker = partial(
            do_preprocess,
            args={'params': self, 'augment': do_augment, 'out_dir': data_dir})
        img_list = list(split_df['full_path'])

        # Assumes Pool is multiprocessing.Pool (or API-compatible) - confirm.
        pool = Pool(self.processes)
        try:
            pool.map(worker, img_list)
        finally:
            # Always release the worker processes, even if a task raised.
            pool.close()
            pool.join()

    # NOTE(review): this call is kept outside the guard above so the .h5 file
    # is produced even when the images already existed - confirm intent.
    self.generate_h5(
        find_images_by_class(data_dir, classes=classes),
        join(split_dir, '{}.h5'.format(mode)),
        split_df,
        random_sample=True,
        classes=classes)
评论列表
文章目录