yfood.py 文件源码

python
阅读 17 收藏 0 点赞 0 评论 0

项目:YelpCrawlSpider 作者: yjp999 项目源码 文件源码
def parseBegin(self, response):
        """Parse one search-result page: yield a FoodItem per listing, then the next page.

        Reads ``response.meta['appends']`` (a mapping with at least the keys
        ``'city'`` and ``'cat'``) and forwards the same mapping to the request
        for the following result page, so every page of one search shares it.

        Args:
            response: the downloaded result page. ``status``, ``body``,
                ``meta``, ``url`` and ``urljoin`` are used.

        Yields:
            ``FoodItem`` objects with the fields ``url``, ``name``, ``address``,
            ``region``, ``city``, ``category`` and ``location`` filled in,
            followed by at most one ``Request`` for the next result page
            (handled by this same callback).

        Raises:
            CloseSpider: when the server answered with HTTP 503.
        """
        if response.status == 503:
            raise CloseSpider("denied by remote server")

        sel = Selector(response)
        appends = response.meta['appends']
        cityName = appends['city']
        category = appends['cat']

        locations = self.getLocations(response.body)

        # Without coordinates nothing on this page is emitted and pagination
        # stops here. ``not locations`` also covers a ``None`` result, which
        # previously slipped past ``== []`` and crashed on indexing below.
        if not locations:
            return

        listings = sel.xpath('//li[@class="regular-search-result"]/div/div[@class="biz-listing-large"]')
        for index, listing in enumerate(listings):
            # Listings and locations are paired by position; stop cleanly
            # instead of raising IndexError when the coordinates run out, so
            # that the next-page request below is still issued.
            if index >= len(locations):
                self.logger.warning(
                    "only %d locations for %d listings: %s",
                    len(locations), len(listings), response.url)
                break

            main = listing.xpath('./div[1]/div/div[2]/h3/span/a[@class="biz-name"]')
            hrefs = main.xpath('./@href').extract()
            names = main.xpath('./span/text()').extract()
            if not hrefs or not names:
                # Markup differs from what the XPath expects; skip this one
                # listing rather than aborting the whole page.
                self.logger.warning(
                    "listing %d has no biz-name link or name: %s",
                    index, response.url)
                continue

            item = FoodItem()
            item['url'] = response.urljoin(hrefs[0])
            item['name'] = names[0]

            second = listing.xpath('./div[2]')
            address = second.xpath('./address').extract()
            region = second.xpath('./span[@class="neighborhood-str-list"]/text()').extract()
            # The <address> element is extracted as markup and passed through
            # self.filtertags; a missing address or region becomes "".
            item['address'] = self.filtertags(address[0]) if address else ""
            item['region'] = region[0].strip() if region else ""
            item['city'] = cityName.strip()
            item['category'] = category
            # NOTE(security): eval() runs text derived from the remote response
            # body. If getLocations returns plain literals, switch to
            # ast.literal_eval or json.loads -- TODO confirm the string format.
            item['location'] = eval(locations[index])
            yield item

        # NOTE(review): time.sleep blocks the calling thread; in Scrapy that
        # stalls every in-flight request. Kept to preserve the crawl rate --
        # consider the DOWNLOAD_DELAY setting instead.
        time.sleep(1.0)
        nextPage = sel.xpath('//a[@class="u-decoration-none next pagination-links_anchor"]/@href').extract()
        if nextPage:
            nextLink = response.urljoin(nextPage[0])
            yield Request(url=nextLink, callback=self.parseBegin, meta={'appends': appends}, dont_filter=True)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号