def process_response(self, request, response, spider):
    """Route a downloaded response according to its HTTP status code.

    Handling by status class:

    * 2xx: counted and returned unchanged.
    * 3xx except 304: counted; the ``Location`` target is followed with a
      new ``Request`` (carrying the original ``meta``) only when its
      network location is listed in ``spider.allowed_domains``.
    * 4xx except 403: counted and dropped.
    * 5xx: counted and the original request is returned to be downloaded
      again.
    * Everything else (304, 403, 1xx, ...): checked for an HTML meta
      refresh; an allowed target is followed, otherwise the response is
      passed through.

    Args:
        request: the request that produced ``response``.
        response: the downloaded response; ``status`` and ``headers`` are
            read from it.
        spider: the running spider; its ``allowed_domains`` is consulted.

    Returns:
        Either ``response``, the original ``request``, or a new
        ``Request`` for a redirect target. Never ``None``.

    Raises:
        IgnoreRequest: for a redirect to a domain outside
            ``spider.allowed_domains`` and for 4xx responses other
            than 403.
    """
    http_code = response.status
    status_class = http_code // 100
    status_key = 'response/%d' % http_code

    # 2xx: success, hand the response on untouched.
    if status_class == 2:
        self.stats.inc_value(status_key, spider=spider)
        return response

    # 3xx (304 excluded): follow the redirect only inside allowed domains.
    if status_class == 3 and http_code != 304:
        self.stats.inc_value(status_key, spider=spider)
        location = response.headers.get('location')
        if not location:
            # A redirect without a Location header cannot be followed;
            # pass it through instead of failing with KeyError.
            return response
        # NOTE(review): a relative Location has an empty netloc and is
        # therefore rejected below unless '' is in allowed_domains -
        # confirm whether relative redirects should be resolved first.
        domain = urlparse.urlparse(location).netloc
        if domain in spider.allowed_domains:
            return Request(url=location, meta=request.meta)
        raise IgnoreRequest(u'not allowed to crawl')

    # 4xx (403 excluded): drop the request, reporting the actual status
    # code rather than a hard-coded "404".
    if status_class == 4 and http_code != 403:
        self.stats.inc_value(status_key, spider=spider)
        raise IgnoreRequest(u'%d' % http_code)

    # 5xx: hand the same request back so it is downloaded again.
    # NOTE(review): no retry limit is visible in this method - confirm one
    # is enforced elsewhere.
    if status_class == 5:
        self.stats.inc_value(status_key, spider=spider)
        return request

    # Remaining statuses: look for an HTML meta refresh.
    # Presumably the helper returns the target URL or a falsy value when
    # there is none - verify against its definition.
    refresh_url = html.get_html_meta_refresh(response)
    if refresh_url:
        self.stats.inc_value('response/metarefresh', spider=spider)
        domain = urlparse.urlparse(refresh_url).netloc
        if domain in spider.allowed_domains:
            return Request(url=refresh_url, meta=request.meta)

    # Nothing above applied: pass the response along rather than falling
    # off the end and returning None.
    return response
downloadermiddlewares.py 文件源码
python
阅读 18
收藏 0
点赞 0
评论 0
评论列表
文章目录