zhihuspider0.py 文件源码-python代码片段

zhihuspider0.py 文件源码

python

阅读 22 收藏 0 点赞 0 评论 0

项目：ZhihuSpider 作者: AlexTan-b-z 项目源码文件源码

def parse_question(self,response):
        """Parse one page of a user's asked questions and follow pagination.

        Yields a ``QuestionItem`` for every ``List-item`` entry on the page
        that carries a title link and the three status fields, in document
        order.  When the page contains the "next" pagination button, a
        ``Request`` for the following page is yielded last, handled by this
        same callback.

        ``response.meta`` is read for ``ask_user_id`` (copied onto every item
        and forwarded to the next request) and ``page`` (the current page
        number; the next request carries ``page + 1``).  The meta dict of the
        current response is not modified.
        """
        for entry in response.xpath('//div[@class="List-item"]'):
            title_node = entry.xpath('.//div[@class="QuestionItem-title"]')
            title_texts = title_node.xpath('./a/text()').extract()
            hrefs = title_node.xpath('./a/@href').extract()
            status_texts = entry.xpath('.//div[@class="ContentItem-status"]//span/text()').extract()

            # An entry with missing markup would otherwise raise IndexError
            # and abort the rest of the page, including pagination; skip it.
            if not title_texts or not hrefs or len(status_texts) < 3:
                continue

            item = QuestionItem()
            item['ask_user_id'] = response.meta['ask_user_id']
            item['title'] = title_texts[0]
            item['question_id'] = hrefs[0].replace('/question/','')
            # The status spans are consumed positionally.
            item['ask_time'] = status_texts[0]
            item['answer_count'] = status_texts[1]
            item['followees_count'] = status_texts[2]
            yield item

        next_button = response.xpath('//button[@class="Button PaginationButton PaginationButton-next Button--plain"]/text()').extract()
        if not next_button:
            return

        # Keep everything up to and including "page=" and drop the old number
        # (and whatever followed it), exactly like the original findall()[0].
        url_match = re.search(r'(.*page=)\d+', response.url)
        if url_match is None:
            # The URL has no "page=<n>" component, so no next URL can be built.
            return

        next_page_number = response.meta['page'] + 1
        next_url = url_match.group(1) + str(next_page_number)
        yield Request(next_url,callback=self.parse_question,meta={'ask_user_id':response.meta['ask_user_id'],'page':next_page_number})