commands.py 文件源码

python
阅读 31 收藏 0 点赞 0 评论 0

项目:dpla-service-hub 作者: KnowledgeLinks 项目源码 文件源码
def iterate_dc_xml(**kwargs):
    """Stream a DC XML file through an ingester and write Turtle shards.

    Every element whose tag ends with ``Description`` is serialized and
    handed to ``ingester.transform``; the resulting ``ingester.graph`` is
    merged into the current shard graph. Progress is echoed with ``click``.

    Keyword Args:
        in_file: Path of the XML file, passed to ``ElementTree.iterparse``.
        ingester: Object exposing ``transform(xml_bytes)`` and a ``graph``
            attribute.
        shard_size: Number of records per output shard. ``None`` or a
            value <= 0 (default ``-1``) disables shard output, in which
            case nothing is written unless ``debug`` is set.
        output_dir: Directory for output files. Defaults to
            ``<PROJECT_BASE>/output``.
        debug: When true, stop after 11 records and dump the current graph
            to ``dpl-dc-test.ttl`` in ``output_dir``. Defaults to ``False``.
            This used to run unconditionally, which truncated every run.
    """
    from bibcat.ingesters.ingester import new_graph
    import xml.etree.ElementTree as etree

    filepath = kwargs.get("in_file")
    ingester = kwargs.get("ingester")
    shard_size = kwargs.get("shard_size", -1)
    output_dir = kwargs.get(
        "output_dir",
        os.path.abspath(os.path.join(PROJECT_BASE, "output")))
    debug = kwargs.get("debug", False)
    sharding = shard_size is not None and shard_size > 0

    start = datetime.datetime.utcnow()
    click.echo("Starting DC XML at {} for records at {}".format(
        start,
        filepath))
    count = 0
    pending = 0  # records merged into shard_graph but not yet written
    shard_template = "dc-{}k-{}k.ttl"
    shard_name = shard_template.format(count, shard_size) if sharding else None
    shard_graph = new_graph()
    # NOTE(review): elements are not cleared after processing, so the parsed
    # tree grows with the input; confirm whether elem.clear() is safe here.
    for event, elem in etree.iterparse(filepath):
        if not (event.startswith('end') and elem.tag.endswith("Description")):
            continue
        ingester.transform(etree.tostring(elem))
        shard_graph += ingester.graph
        # Progress markers use the zero-based index of the current record.
        if count > 0 and not count % 10:
            click.echo(".", nl=False)
        if not count % 100:
            click.echo(count, nl=False)
        count += 1
        pending += 1
        if debug and count > 10:
            # Opt-in debugging aid: dump what we have and stop early.
            _write_turtle(shard_graph,
                          os.path.join(output_dir, "dpl-dc-test.ttl"))
            break
        # Flush only once a full shard has accumulated. Testing after the
        # increment avoids writing a one-record shard at count == 0 that
        # the next flush would overwrite under the same file name.
        if sharding and not count % shard_size:
            _write_turtle(shard_graph, os.path.join(output_dir, shard_name))
            shard_graph = new_graph()
            pending = 0
            shard_name = shard_template.format(count, count + shard_size)
    # Write the trailing partial shard so the last records are not dropped.
    if sharding and pending:
        _write_turtle(shard_graph, os.path.join(output_dir, shard_name))
    end = datetime.datetime.utcnow()
    # total_seconds() includes whole days; .seconds alone would drop them.
    click.echo("Finished DC ingestion at {} total time of {} mins for {}".format(
        end,
        (end - start).total_seconds() / 60.0,
        count))


def _write_turtle(graph, path):
    """Serialize ``graph`` as Turtle and write it to ``path`` as bytes."""
    data = graph.serialize(format='turtle')
    # NOTE(review): if this is an rdflib Graph, serialize() returns bytes on
    # older releases and text on newer ones; accept both.
    if not isinstance(data, bytes):
        data = data.encode("utf-8")
    with open(path, "wb") as fo:
        fo.write(data)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号