pipelines.py 文件源码

python
阅读 20 收藏 0 点赞 0 评论 0

项目:osp-scraper 作者: opensyllabus 项目源码 文件源码
def update_warc_response_from_item(record, item):
    """Update a WARC response record in place from a scrapy Item.

    Sets the WARC header fields describing the fetch and replaces the
    record payload with a reconstructed raw HTTP response (status line,
    header lines, blank line, body).

    Args:
        record: WARC record object exposing a mutable ``header`` mapping
            and an ``update_payload(bytes)`` method.
        item: mapping with the keys ``url``, ``retrieved`` (seconds since
            the epoch, rendered as UTC), ``spider_name``, ``spider_run_id``,
            ``status`` (HTTP status code), ``headers`` (mapping of bytes
            name -> iterable of bytes values) and ``content`` (bytes body).

    Returns:
        None. ``record`` is modified in place.
    """
    h = record.header
    h['WARC-Target-URI'] = item['url']
    h['WARC-Date'] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(item['retrieved']))
    h['X-Spider-Name'] = item['spider_name']
    h['X-Spider-Run-ID'] = item['spider_run_id']
    # XXX Scrapy doesn't provide remote IP for WARC-IP-Address

    # below based on WARCRecord.from_response()

    # XXX scrapy doesn't provide human-readable status string, so derive
    # the reason phrase from the status code. Servers do send codes that
    # are not in HTTPStatus (e.g. 999); those get an empty reason phrase
    # instead of aborting the whole record with ValueError.
    status_code = item['status']
    try:
        reason = http.HTTPStatus(status_code).phrase
    except ValueError:
        reason = ''
    status = "HTTP/1.1 {} {}".format(status_code, reason).encode()

    # One "Name: value" line per value, so repeated headers (e.g.
    # Set-Cookie) are preserved. ``.items()`` rather than the Python 2
    # only ``.iteritems()`` so plain dicts work on Python 3.
    headers = [
        b': '.join((name, value))
        for name, values in item['headers'].items()
        for value in values
    ]

    # Status line, headers, the empty line that terminates the header
    # section, then the body.
    record.update_payload(b"\r\n".join([status, *headers, b'', item['content']]))
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号