def iter_json_batches(inputs, base_url, batch_size, keep_instance_path):
    """Group (href, resource) pairs into per-collection batches.

    Args:
        inputs: iterable of ``(href, resource)`` pairs, where ``resource``
            is a dict. Resources whose only key is ``"_links"`` (discovery
            stubs) are skipped.
        base_url: URL whose scheme and netloc override those of each href.
        batch_size: maximum number of resources per yielded batch. A size
            of 1 yields each resource immediately under its full URI.
        keep_instance_path: if true, batches are keyed by the rewritten
            href with its query string stripped; otherwise by the href
            with its last path segment stripped (the collection URI).

    Yields:
        ``(uri, resources)`` pairs, where ``resources`` is a list of at
        most ``batch_size`` dicts sharing the same batch key.
    """
    parsed_base_url = urlparse(base_url)
    current_uri = None
    current_batch = []
    for href, resource in inputs:
        # Skip over links-only (discovery) resources.
        # BUG FIX: in Python 3, dict.keys() returns a view that never
        # compares equal to a list, so the original
        # `resource.keys() == ["_links"]` check silently never matched.
        # Compare the key set instead.
        if set(resource.keys()) == {"_links"}:
            continue
        # Inject the base URL's scheme and netloc; `urljoin` should do
        # exactly this operation, but actually won't if the
        # right-hand-side term defines its own netloc.
        parsed_href = urlparse(href)
        uri = urlunparse(parsed_href._replace(
            scheme=parsed_base_url.scheme,
            netloc=parsed_base_url.netloc,
        ))
        if batch_size == 1:
            yield (uri, [resource])
        else:
            # Derive the batch key: instance URI minus its query string,
            # or the collection URI (instance URI minus the last path
            # segment).
            if keep_instance_path:
                collection_uri = uri.rsplit("?", 1)[0]
            else:
                collection_uri = uri.rsplit("/", 1)[0]
            # Flush the pending batch when the key changes or the batch
            # is full.
            if any((
                current_uri is not None and current_uri != collection_uri,
                len(current_batch) >= batch_size,
            )):
                yield (current_uri, current_batch)
                current_batch = []
            current_uri = collection_uri
            current_batch.append(resource)
    # Flush whatever remains once the input is exhausted.
    if current_batch:
        yield (current_uri, current_batch)
# NOTE(review): removed stray scraped-page footer text ("评论列表" / "文章目录")
# that was not Python and would have broken the module at import time.