def label_by_dir(self, file_path, target_dir, dir_and_label, task_size=10):
    """Write labeled lines for the files found in labeled sub-directories.

    Every immediate sub-directory of ``target_dir`` whose name is a key of
    ``dir_and_label`` is listed (non-recursively). Each regular file in it
    is passed to ``self._make_pair(label, path)`` and the returned values
    are written to ``file_path`` in the order the batches complete.

    Args:
        file_path: Output file. It is truncated by the first batch that is
            written and appended to by every later batch. If no batch is
            produced (no matching directory, or only empty ones) the file
            is not touched at all.
        target_dir: Directory whose sub-directories are inspected.
        dir_and_label: Mapping of sub-directory name to the label handed
            to ``self._make_pair``.
        task_size: Number of files dispatched to ``Parallel`` per batch.
            Must be at least 1.

    Raises:
        ValueError: If ``task_size`` is smaller than 1.
    """
    if task_size < 1:
        # The original arithmetic raised ZeroDivisionError for 0 and silently
        # produced no output for negative sizes; fail early and clearly.
        raise ValueError(
            "task_size must be a positive integer, got {!r}".format(task_size))

    # Cheap membership test first so unrelated entries are never stat()-ed.
    dirs = [d for d in os.listdir(target_dir)
            if d in dir_and_label
            and os.path.isdir(os.path.join(target_dir, d))]

    # True until the first batch has been written: that write truncates the
    # output file, every later one appends to it.
    first_write = True
    for d in dirs:
        label = dir_and_label[d]
        self.logger.info(
            "Extracting {} (labeled by {}).".format(d, label))
        dir_path = os.path.join(target_dir, d)
        candidates = (os.path.join(dir_path, f) for f in os.listdir(dir_path))
        pathes = [p for p in candidates if os.path.isfile(p)]

        # Stepping by task_size yields ceil(len / task_size) batches without
        # relying on float division, so the trailing partial batch is never
        # lost regardless of the interpreter's division semantics.
        for start in xtqdm(range(0, len(pathes), task_size)):
            tasks = pathes[start:start + task_size]
            # NOTE(review): Parallel/delayed look like joblib, where
            # n_jobs=-1 means "use every CPU" -- confirm against the imports.
            lines = Parallel(n_jobs=-1)(
                delayed(self._make_pair)(label, t) for t in tasks)
            mode = "w" if first_write else "a"
            # Results are written verbatim, so _make_pair is expected to
            # return str values that already carry any line terminator.
            with open(file_path, mode=mode, encoding="utf-8") as f:
                f.writelines(lines)
            first_write = False
# NOTE: scraped web-page residue, not part of the program:
# "Comment list" / "Article table of contents".