jobbole.py 文件源码

python
阅读 22 收藏 0 点赞 0 评论 0

项目:ArticleSpider 作者: mtianyan 项目源码 文件源码
def parse(self, response):
        """Process one article-list page and schedule the follow-up requests.

        For every thumbnail link matched on the page, yield a ``Request`` for
        the linked article, routed to ``self.parse_detail`` and carrying the
        thumbnail's ``src`` under the ``front_image_url`` meta key. If the
        page exposes a "next" pagination link, finally yield a ``Request``
        for it that is handled by this same method.

        A response whose status is 404 is appended to ``self.fail_urls`` and
        counted under the ``failed_url`` stat; the page is still scanned for
        links afterwards.
        """
        # Keep track of listing pages that came back as 404.
        if response.status == 404:
            self.fail_urls.append(response.url)
            self.crawler.stats.inc_value("failed_url")

        # Each matched anchor wraps a thumbnail image and points at an article.
        thumb_links = response.css("#archive .floated-thumb .post-thumb a")
        for link in thumb_links:
            image_url = link.css("img::attr(src)").extract_first("")
            post_url = link.css("::attr(href)").extract_first("")

            # The href may be relative, so resolve it against the page URL.
            article_url = parse.urljoin(response.url, post_url)
            yield Request(
                url=article_url,
                meta={"front_image_url": image_url},
                callback=self.parse_detail,
            )
            # Echo the raw (unresolved) href once the request has been consumed.
            print(post_url)

        # Follow pagination: stop when there is no "next" link on the page.
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if not next_url:
            return
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号