def update_warc_info_from_spider(record, spider):
    """Update a WARC warcinfo record from a scrapy Spider.

    Collects a handful of fields describing this scraper run, serializes
    them through a ``warc.WARCHeader`` object and hands the resulting bytes
    to ``record.update_payload``.

    Args:
        record: The warcinfo record to update. Only its
            ``update_payload(bytes)`` method is used here.
        spider: The spider being run. It must provide ``name``, ``run_id``
            and ``get_parameters()``; the value returned by
            ``get_parameters()`` is passed to ``json.dumps``.

    Returns:
        None. The result is delivered through ``record.update_payload``.
    """
    # Make an empty header object to use for fields; it is used purely as a
    # "name: value" serializer, not as a real record header.
    # XXX WARCHeader messes up capitalization here
    fields = warc.WARCHeader({}, defaults=False)
    fields['software'] = 'osp_scraper'
    fields['hostname'] = socket.getfqdn()
    fields['x-spider-name'] = spider.name
    fields['x-spider-run-id'] = spider.run_id
    # git_revision is a module-level name defined outside this function.
    fields['x-spider-revision'] = git_revision
    fields['x-spider-parameters'] = json.dumps(spider.get_parameters())

    # Serialize only the field lines: the version line and the extra CRLF
    # are both switched off via the keyword flags.
    buf = BytesIO()
    fields.write_to(buf, version_line=False, extra_crlf=False)
    record.update_payload(buf.getvalue())
评论列表
文章目录