index_spider.py 文件源码

python
阅读 31 收藏 0 点赞 0 评论 0

项目:wechat-crawler 作者: DMGbupt 项目源码 文件源码
def parse_index(self, response):
    """
    @summary: Turn one result-index page into the follow-up Requests.
        Entries returned by self._get_result(response) are walked in order:
          * a ready-made Request is forwarded unchanged and ends the walk;
          * an entry whose 'publish_time' is <= self.from_time ends the walk
            (presumably the entries are sorted newest-first -- confirm
            against _get_result);
          * an entry whose 'publish_time' is > self.end_time is skipped;
          * any other entry becomes a Request for entry['url'] handled by
            self.parse_page, with the entry stored in meta["item"].
        When the walk was not cut short and self._next_result_page(response)
        returns a truthy value, a Request for that page (handled by
        parse_index again) is appended, carrying response.meta['cookies'].
    @param response: response of an index page; response.meta must contain
        'cookies' whenever a next page is followed.
    @return: list of Request objects, possibly empty.
    @raise CloseSpider: when "antispider" occurs in response.url (raised
        only after the 12-hour sleep below).
    """
    if "antispider" in response.url:
        spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
        # NOTE(review): this blocks the calling thread for 43200 s (12 h)
        # before the spider is closed.
        time.sleep(43200)
        raise CloseSpider('antispider')
    requests = []
    page_list = self._get_result(response)
    # Nothing was extracted from this page: there is nothing to schedule.
    if not page_list:
        return requests
    next_page = True  # stays True only if the whole list is walked
    for item in page_list:
        if isinstance(item, Request):
            # The helper already produced a Request; forward it and stop.
            requests.append(item)
            next_page = False
            break
        if item['publish_time'] <= self.from_time:
            # At or before the lower bound of the time window: stop here
            # and do not follow further result pages.
            next_page = False
            break
        elif item['publish_time'] > self.end_time:
            # Past the upper bound of the time window: skip this entry.
            continue
        else:
            req = Request(item['url'], self.parse_page)
            # Hand the entry over to parse_page through the request meta.
            req.meta["item"] = item
            requests.append(req)
    if next_page:
        # Resolve the next-page URL once and reuse it, instead of calling
        # the helper a second time to build the Request.
        next_url = self._next_result_page(response)
        if next_url:
            cookies = response.meta['cookies']
            requests.append(Request(next_url, callback=self.parse_index, cookies=cookies, meta={'cookies': cookies}))
    return requests
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号