index_spider.py 文件源码

python
阅读 25 收藏 0 点赞 0 评论 0

项目:wechat-crawler 作者: DMGbupt 项目源码 文件源码
def parse_search(self, response):
        """
        @summary: Turn a search-result page into a request for the JSON
                  article index of the first matching result.
        @param response: the search-result response. ``response.meta`` must
                         contain ``cookies``; ``query`` is read only for the
                         failure log message.
        @raise CloseSpider: with reason 'antispider' when the response URL
                            contains "antispider" (after a 12-hour sleep).
        @return: a generator yielding at most one Request whose callback is
                 ``self.parse_index``; nothing is yielded when no result
                 link is found.
        """
        # A URL containing "antispider" means a verification-code page was
        # served instead of search results (see the log message below).
        if "antispider" in response.url:
            spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
            # NOTE(review): time.sleep blocks the calling thread for the full
            # 12 hours before the spider is closed; presumably intentional so
            # that whatever restarts the spider does so only after the
            # cool-down -- confirm.
            time.sleep(43200)  # 43200 s == 12 hours, matching the message above
            raise CloseSpider('antispider')
        # Collect the href attribute of the result <div> elements.  The XPath
        # can return several values (one per parent element), so only the
        # first one in document order is used below.
        hrefs = response.xpath(
            '//div[@class="wx-rb bg-blue wx-rb_v1 _item"][1]/@href').extract()
        if not hrefs:
            # .get() keeps this log line from raising KeyError (and hiding the
            # real failure) when the request carried no 'query' in its meta.
            spider_logger.error("Faild searching {0} !".format(response.meta.get('query')))
            return
        # Rewrite the relative '/gzh?...' link into the absolute JSON endpoint
        # and ask for page 1.  Only the leading occurrence is replaced so the
        # query string itself is never altered.
        first_href = hrefs[0]
        json_url = first_href.replace('/gzh?', 'http://weixin.sogou.com/gzhjs?', 1) + '&cb=sogou.weixin_gzhcb&page=1&gzhArtKeyWord='
        # Forward the cookies both on the request itself and in meta so the
        # next callback can read them from response.meta again.
        cookies = response.meta['cookies']
        yield Request(json_url, callback=self.parse_index, cookies=cookies, meta={'cookies': cookies})
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号