def run_parallel(num_processes, out_dir, source):
    """Download the Google Books ngram .gz files for *source* in parallel.

    Scrapes the datasets index page for links matching
    ``<source>-<TYPE>-<VERSION>-*.gz``, queues every matching URL, and
    spawns ``num_processes`` worker processes running ``split_main`` to
    drain the queue into ``out_dir/<source>/raw/``.

    Args:
        num_processes: number of worker processes to spawn.
        out_dir: root output directory.
        source: corpus identifier used both in the URL filter and the
            output path.

    Side effects: network I/O, directory creation, and child processes;
    blocks until all workers have joined. Returns None.
    """
    page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    # Raw string for the regex; TYPE and VERSION are module-level constants.
    pattern = re.compile(r"href='(.*%s-%s-%s-.*\.gz)" % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    # Free the (large) HTML payload before forking workers so it is not
    # duplicated into every child process.
    del page
    queue = Queue()
    for url in urls:
        queue.put(url)
    download_dir = out_dir + '/' + source + '/raw/'
    # Create the output directory once (the original created the identical
    # path twice, with and without a trailing slash).
    ioutils.mkdir(download_dir)
    procs = [Process(target=split_main, args=[i, queue, download_dir]) for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()