def update_warc_response_from_item(record, item):
    """Update a WARC response record from a scrapy Item.

    Copies the crawl metadata from ``item`` into ``record.header`` and then
    rebuilds the raw HTTP response (status line, header lines, blank line,
    body) and passes it to ``record.update_payload()``.

    Args:
        record: object exposing a mapping-like ``header`` attribute and an
            ``update_payload(data)`` method.
        item: mapping providing the keys ``url``, ``retrieved``,
            ``spider_name``, ``spider_run_id``, ``status``, ``headers`` and
            ``content``. ``retrieved`` is handed to ``time.gmtime()``;
            ``headers`` maps each header name (bytes) to an iterable of
            values (bytes); ``content`` is the response body as bytes.

    Returns:
        None. ``record`` is modified in place.
    """
    h = record.header
    h['WARC-Target-URI'] = item['url']
    h['WARC-Date'] = time.strftime('%Y-%m-%dT%H:%M:%SZ',
                                   time.gmtime(item['retrieved']))
    h['X-Spider-Name'] = item['spider_name']
    h['X-Spider-Run-ID'] = item['spider_run_id']
    # XXX Scrapy doesn't provide remote IP for WARC-IP-Address

    # below based on WARCRecord.from_response()

    # XXX scrapy doesn't provide human-readable status string, so look the
    # reason phrase up in the stdlib table. Codes the stdlib does not know
    # get an empty reason phrase instead of aborting the whole record.
    try:
        reason = http.HTTPStatus(item['status']).phrase
    except ValueError:
        reason = ''
    status = "HTTP/1.1 {} {}".format(item['status'], reason).encode()

    # One "Name: value" line per value, so repeated headers are preserved.
    # .items() (not the Python 2 only .iteritems()) works on plain dicts too.
    headers = [b': '.join((k, v))
               for k, values in item['headers'].items()
               for v in values]

    # The empty element yields the blank line separating headers from body.
    record.update_payload(b"\r\n".join(itertools.chain((status, ),
                                                       headers,
                                                       (b'', ),
                                                       (item['content'], ))))
评论列表
文章目录