spider_7_quotes_js2xml.py 文件源码-python代码片段

spider_7_quotes_js2xml.py 文件源码

python

阅读 21 收藏 0 点赞 0 评论 0

项目：scrapy-training 作者: scrapinghub 项目源码文件源码

def alternative_parse_method(self, response):
        """Yield quotes embedded in the page's JavaScript, then follow pagination.

        The quote data is read from the inline ``<script>`` whose text
        contains ``var data =``. That script is parsed with ``js2xml`` and the
        resulting XML tree is queried with CSS selectors.

        Args:
            response: The page response; it must provide ``xpath``, ``css``,
                ``urljoin`` and ``url``.

        Yields:
            One dict per quote with the keys ``'text'``, ``'author'`` and
            ``'tags'``, followed by a ``scrapy.Request`` for the next page
            when a ``li.next`` link is present.
        """
        import logging

        # An alternative would be to build a Scrapy selector from the JS string
        # and extract the data using CSS selectors
        script = response.xpath('//script[contains(., "var data =")]/text()').extract_first()
        if script is None:
            # extract_first() returns None when nothing matched. Passing that
            # to js2xml.parse() would raise and abort the callback before the
            # pagination link below is followed, so skip extraction instead.
            logging.getLogger(__name__).warning(
                'No script containing "var data =" found on %s', response.url
            )
        else:
            sel = scrapy.Selector(root=js2xml.parse(script))
            for quote in sel.css('var[name="data"] > array > object'):
                yield {
                    'text': quote.css('property[name="text"] > string::text').extract_first(),
                    'author': quote.css('property[name="author"] property[name="name"] > string::text').extract_first(),
                    'tags': quote.css('property[name="tags"] string::text').extract(),
                }

        # No callback is passed, so the request is handled by whatever callback
        # Scrapy applies by default for this spider.
        link_next = response.css('li.next a::attr("href")').extract_first()
        if link_next:
            yield scrapy.Request(response.urljoin(link_next))