def _write_turtle(graph, path):
    """Serialize *graph* as Turtle and write it to *path*, replacing any
    file already there."""
    payload = graph.serialize(format='turtle')
    # NOTE(review): presumably an rdflib graph, whose serialize() returns
    # bytes on older releases and str on newer ones -- confirm. Encoding a
    # str result keeps the binary-mode write below working in both cases.
    if isinstance(payload, str):
        payload = payload.encode("utf-8")
    with open(path, "wb+") as fo:
        fo.write(payload)


def iterate_dc_xml(**kwargs):
    """Stream the records of a DC XML file through an ingester and save the
    accumulated graph as Turtle.

    Every parsed element whose tag ends with "Description" is serialized
    back to XML and passed to ``ingester.transform``; ``ingester.graph`` is
    then added to the current output graph. Progress is echoed with
    ``click.echo`` (a "." on every 10th record, the running count on every
    100th).

    Keyword Args:
        in_file: Source handed to ``xml.etree.ElementTree.iterparse``.
        ingester: Object providing ``transform(xml)`` and a ``graph``
            attribute.
        shard_size: Number of records per output shard. ``None`` or a
            value <= 0 (the default is -1) disables shard output. Shards
            are named ``dc-{start}k-{end}k.ttl``.
        output_dir: Directory that receives the output files. Defaults to
            ``<PROJECT_BASE>/output``.
        debug_limit: When truthy, ingestion stops as soon as this many
            records have been ingested *before* the current one; the
            current graph is dumped to ``dpl-dc-test.ttl`` and no further
            records are read. Defaults to 10, which reproduces the
            hard-coded debug short-circuit this function previously had.
            Pass ``None`` or ``0`` to ingest the whole file.
            NOTE(review): the default keeps existing callers unchanged;
            confirm whether it should be switched off by default.
    """
    from bibcat.ingesters.ingester import new_graph
    import xml.etree.ElementTree as etree
    filepath = kwargs.get("in_file")
    ingester = kwargs.get("ingester")
    shard_size = kwargs.get("shard_size", -1)
    output_dir = kwargs.get("output_dir",
        os.path.abspath(os.path.join(PROJECT_BASE, "output")))
    debug_limit = kwargs.get("debug_limit", 10)
    start = datetime.datetime.utcnow()
    click.echo("Starting DC XML at {} for records at {}".format(
        start,
        filepath))
    count = 0
    shard_template = "dc-{}k-{}k.ttl"
    sharding = shard_size is not None and shard_size > 0
    # Number of records already flushed to earlier shards; the current shard
    # holds records [shard_start, count).
    shard_start = 0
    shard_name = None
    if sharding:
        shard_name = shard_template.format(shard_start, shard_size)
    shard_graph = new_graph()
    for event, elem in etree.iterparse(filepath):
        if not (event.startswith('end') and
                elem.tag.endswith("Description")):
            continue
        ingester.transform(etree.tostring(elem))
        shard_graph += ingester.graph
        # Until the increment below, `count` is the zero-based position of
        # the record that was just ingested.
        if count > 0 and not count % 10:
            click.echo(".", nl=False)
        if debug_limit and count >= debug_limit:
            #! DEBUG code
            _write_turtle(shard_graph,
                          os.path.join(output_dir, "dpl-dc-test.ttl"))
            # Include the record just ingested in the reported total.
            count += 1
            break
        if not count % 100:
            click.echo(count, nl=False)
        count += 1
        # Flush only once the shard actually holds `shard_size` records.
        # Testing before the increment used to flush after the very first
        # record and then overwrite that file with the next shard.
        if sharding and not count % shard_size:
            _write_turtle(shard_graph, os.path.join(output_dir, shard_name))
            shard_graph = new_graph()
            shard_start = count
            shard_name = shard_template.format(count, count + shard_size)
    else:
        # Reached only when the input was read to the end (no debug break):
        # write the final, partially filled shard instead of dropping it.
        if sharding and count > shard_start:
            _write_turtle(shard_graph, os.path.join(output_dir, shard_name))
    end = datetime.datetime.utcnow()
    # total_seconds() covers the whole interval; .seconds alone ignores the
    # days component of the timedelta.
    click.echo("Finished DC ingestion at {} total time of {} mins for {}".format(
        end,
        (end-start).total_seconds() / 60.0,
        count))
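# Web-page residue captured with this file, not code: "评论列表" (comment list)
# Web-page residue captured with this file, not code: "文章目录" (table of contents)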