def process_response(self, request, response, spider):
    """Post-process a response according to the flags stored in ``request.meta``.

    Exactly one of three paths is taken:

    * ``meta['wayback_machine_cdx_request']`` is truthy: the response is
      handed to ``self.build_snapshot_requests``. When that yields nothing,
      a ``Response`` with status 404 and the original request's URL is
      returned. Otherwise every built request is passed to
      ``self.crawler.engine.schedule`` and the current request is aborted
      by raising ``UnhandledIgnoreRequest``.
    * ``meta['wayback_machine_url']`` is truthy: the result of
      ``response.replace(url=...)`` is returned, with the URL taken from
      ``meta['wayback_machine_original_request']``.
    * Neither flag is set: ``response`` is returned untouched.

    Args:
        request: The request that produced ``response``; only its ``meta``
            mapping is read here.
        response: The downloaded response to inspect or rewrite.
        spider: Forwarded unchanged to ``self.crawler.engine.schedule``.

    Returns:
        The original response, a URL-rewritten response, or a 404
        ``Response`` for an empty CDX listing.

    Raises:
        UnhandledIgnoreRequest: After the snapshot requests of a non-empty
            CDX listing have been scheduled.
        KeyError: If a flagged request lacks
            ``meta['wayback_machine_original_request']`` on a path that
            needs the original URL.
    """
    meta = request.meta

    # parse CDX requests and schedule future snapshot requests
    if meta.get('wayback_machine_cdx_request'):
        snapshot_requests = self.build_snapshot_requests(response, meta)

        # treat empty listings as 404s
        if not snapshot_requests:
            original_url = meta['wayback_machine_original_request'].url
            return Response(original_url, status=404)

        # schedule all of the snapshots
        # NOTE(review): confirm that ``engine.schedule(request, spider)`` is
        # available in the Scrapy version this project targets.
        for snapshot_request in snapshot_requests:
            self.crawler.engine.schedule(snapshot_request, spider)

        # abort this request; the scheduled snapshot requests take over
        raise UnhandledIgnoreRequest

    # clean up snapshot responses: put the original request's URL back
    if meta.get('wayback_machine_url'):
        original_url = meta['wayback_machine_original_request'].url
        return response.replace(url=original_url)

    return response
评论列表
文章目录