python类linkextractors()的实例源码

sp_douban.py 文件源码 项目:ScrapyProject 作者: chinesehuazhou 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def parse(self, response):
        """Collect the fields of one listing page into a single item.

        Each field is filled with the full list returned by its XPath query,
        so one item carries the values of every entry on the page.

        Args:
            response: Scrapy response of the page being parsed.

        Yields:
            A ``DoubanTopMoviesItem`` with ``title_ch``, ``rating_num``,
            ``rating_count``, ``image_urls`` and ``topid`` populated.
        """
        item = DoubanTopMoviesItem()
        item['title_ch'] = response.xpath('//div[@class="hd"]//span[@class="title"][1]/text()').extract()

        # NOTE(review): the fields below are disabled. Presumably not every entry
        # has a second "title" span / an "other" span, which would leave these
        # lists shorter than the others and misalign them -- confirm before
        # re-enabling.
        # en_list = response.xpath('//div[@class="hd"]//span[@class="title"][2]/text()').extract()
        # item['title_en'] = [en.replace('\xa0/\xa0','').replace('  ','') for en in en_list]
        # ht_list = response.xpath('//div[@class="hd"]//span[@class="other"]/text()').extract()
        # item['title_ht'] = [ht.replace('\xa0/\xa0','').replace('  ','') for ht in ht_list]
        # detail_list = response.xpath('//div[@class="bd"]/p[1]/text()').extract()
        # item['detail'] = [detail.replace('  ', '').replace('\xa0', '').replace('\n', '') for detail in detail_list]
        # NOTE(review): presumably some entries have no "inq" span, so this list
        # could also be shorter than the others -- confirm.
        # item['quote'] = response.xpath('//span[@class="inq"]/text()').extract()

        item['rating_num'] = response.xpath('//div[@class="star"]/span[2]/text()').extract()
        # The fourth span's text embeds the count in surrounding words; keep only
        # the first run of digits from each string.
        count_list = response.xpath('//div[@class="star"]/span[4]/text()').extract()
        rating_counts = []
        for count in count_list:
            digits = re.search(r'\d+', count)
            # A string without digits yields None instead of raising, so the
            # list keeps one entry per extracted string.
            rating_counts.append(digits.group() if digits else None)
        item['rating_count'] = rating_counts
        item['image_urls'] = response.xpath('//div[@class="pic"]/a/img/@src').extract()
        item['topid'] = response.xpath('//div[@class="pic"]/em/text()').extract()

        yield item

        # Disabled: manual pagination by following the page's rel="next" link.
        # new_url = response.xpath('//link[@rel="next"]/@href').extract_first()
        # if new_url:
        #     next_url = self.base_url+new_url
        #     yield scrapy.Request(next_url, callback=self.parse)


######------- Alternative: start_urls + LinkExtractor rules for following pages --------#####
    # from scrapy.spiders import CrawlSpider, Rule
    # from scrapy.linkextractors import LinkExtractor
    # class SpDoubanSpider(CrawlSpider):
        # ...
    # Rule that follows every matching listing-page link and passes it to parse_item
    # rules = [Rule(LinkExtractor(allow=(r'https://movie.douban.com/top250\?start=\d+.*')),
    #                callback='parse_item', follow=True)
    #           ]
    # def parse_item(self, response):
    #     # build the item here (same extraction as in parse)
    #     yield item
######------- End of start_urls + LinkExtractor alternative --------#####


问题


面经


文章

微信
公众号

扫码关注公众号