import multiprocessing
from os import path, makedirs

from toolz import partition_all  # cytoolz provides the same function

# Note: iter_dir, iter_lines, save_parses and parallelize are helper functions
# assumed to be defined elsewhere in this script.


def main(in_dir, out_dir, n_process=int(multiprocessing.cpu_count() * .75), n_thread=4, batch_size=10000):
    # Create the output directory if it doesn't exist
    if not path.exists(out_dir):
        makedirs(out_dir)
    # Get the total number of input files for tracking progress
    total_files = len(list(iter_dir(in_dir)))
    # Process each input file
    for i, file in enumerate(iter_dir(in_dir)):
        # Print progress
        print('Tagging file %s of %s' % (i + 1, total_files))
        # Use multiprocessing when more than one process is requested
        if n_process >= 2:
            # Split the text in the input file into chunks of 100,000 lines
            texts = partition_all(100000, iter_lines(file))
            # Parallelize the job across the worker processes
            parallelize(save_parses, enumerate(texts),
                        n_process, [out_dir, n_thread, batch_size],
                        backend='multiprocessing')
        # Otherwise run in a single process
        else:
            save_parses(0, iter_lines(file), out_dir, n_thread, batch_size)
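

# A minimal usage sketch (an assumption, not part of the original script):
# it assumes the script is invoked from the command line with the input and
# output directories as positional arguments, leaving the remaining
# parameters at their defaults.
if __name__ == '__main__':
    import sys
    in_dir, out_dir = sys.argv[1], sys.argv[2]
    main(in_dir, out_dir)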