import pickle
from concurrent.futures import ProcessPoolExecutor


def convert_dataset(path, filemap, name, num_processes, max_num_support, max_tokens, is_web=True):
    with open(path, 'rb') as f:
        dataset = pickle.load(f)
    if num_processes == 1:
        # Process the whole dataset in the current process.
        instances = process((dataset, filemap, max_num_support, max_tokens, is_web), True)
    else:
        # Split the dataset into fixed-size chunks and convert them in parallel.
        chunk_size = 1000
        num_chunks = (len(dataset) + chunk_size - 1) // chunk_size  # ceiling division avoids an empty trailing chunk
        instances = []
        done = 0
        with ProcessPoolExecutor(num_processes) as executor:
            for processed in executor.map(
                    process,
                    [(dataset[i * chunk_size:(i + 1) * chunk_size], filemap, max_num_support, max_tokens, is_web)
                     for i in range(num_chunks)]):
                instances.extend(processed)
                done = min(len(dataset), done + chunk_size)
                print("%d/%d done" % (done, len(dataset)))
    return {"meta": {"source": name}, "instances": instances}
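
# A minimal usage sketch, assuming a `process` function defined elsewhere in the
# source that accepts the tuple signature used above, and a pickled dataset on
# disk. The file paths, the filemap contents, and the parameter values below are
# illustrative assumptions, not taken from the source.
import json

if __name__ == '__main__':
    filemap = {"doc_0": "/data/evidence/doc_0.txt"}  # assumed mapping of document ids to evidence files
    converted = convert_dataset(
        "/data/dataset.pkl", filemap, name="web-train",
        num_processes=4, max_num_support=8, max_tokens=800)
    with open("/data/dataset.converted.json", "w") as f:
        json.dump(converted, f)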