downloadermiddlewares.py 文件源码

python
阅读 18 收藏 0 点赞 0 评论 0

项目:scrapy_redis_spider 作者: lymlhhj123 项目源码 文件源码
def process_response(self, request, response, spider):
        """Route a downloaded response according to its HTTP status class.

        * 2xx: count it and pass the response on unchanged.
        * 3xx other than 304: count it, resolve the ``Location`` header
          against the request URL and follow it only when the target's
          netloc is listed in ``spider.allowed_domains``; otherwise drop
          the request.  A redirect without a ``Location`` header is passed
          on unchanged.
        * 4xx other than 403: count it and drop the request.
        * 5xx: count it and return the original request object.
        * Anything else (including 304 and 403): look for an HTML meta
          refresh and follow it under the same allowed-domain rule;
          otherwise pass the response on unchanged.

        "Count it" means incrementing the ``response/<status>`` stats key.

        Returns:
            ``response`` itself, ``request`` itself (5xx), or a new
            ``Request`` for a redirect / meta-refresh target.

        Raises:
            IgnoreRequest: for a redirect whose netloc is not in
                ``spider.allowed_domains`` and for 4xx statuses other
                than 403.
        """
        http_code = response.status
        status_class = http_code // 100
        counter = 'response/%d' % http_code

        if status_class == 2:
            self.stats.inc_value(counter, spider=spider)
            return response

        # 304 is excluded here and falls through to the meta-refresh check.
        if status_class == 3 and http_code != 304:
            self.stats.inc_value(counter, spider=spider)
            location = response.headers.get('location')
            if not location:
                # A redirect status without a target cannot be followed;
                # hand the response on instead of failing with KeyError.
                return response
            if isinstance(location, bytes):
                # Header values may be bytes; decode so the netloc can be
                # compared with the string entries of allowed_domains.
                location = location.decode('latin-1')
            # Location may be relative; resolve it so the netloc is never
            # empty for a same-site redirect.
            url = urlparse.urljoin(request.url, location)
            domain = urlparse.urlparse(url).netloc
            if domain in spider.allowed_domains:
                return Request(url=url, meta=request.meta)
            raise IgnoreRequest(u'not allowed to crawl')

        # 403 is excluded here and falls through to the meta-refresh check.
        if status_class == 4 and http_code != 403:
            self.stats.inc_value(counter, spider=spider)
            raise IgnoreRequest(u'404')

        if status_class == 5:
            self.stats.inc_value(counter, spider=spider)
            # NOTE(review): the same request object is returned with no
            # retry cap and without dont_filter -- confirm the scheduler's
            # duplicate filter and retry policy treat this as intended.
            return request

        # Remaining statuses: follow an HTML meta refresh when one is found.
        # NOTE(review): assumes the helper returns an absolute URL (or a
        # falsy value when there is no refresh) -- confirm against html.
        url = html.get_html_meta_refresh(response)
        if url:
            self.stats.inc_value('response/metarefresh', spider=spider)
            domain = urlparse.urlparse(url).netloc
            if domain in spider.allowed_domains:
                return Request(url=url, meta=request.meta)

        # This hook must not return None; pass the response on unchanged
        # when nothing above handled it.
        return response
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号