zhihu.py 文件源码

python
阅读 21 收藏 0 点赞 0 评论 0

项目:ZhihuSpider 作者: ShayChris 项目源码 文件源码
def parse_question(self, response):
        question_pattern = re.compile('(.*zhihu.com/question/(\d+))(/|$).*')
        match_object = re.match(question_pattern, response.url)
        question_id = match_object.group(2)
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_css('title', 'h1.QuestionHeader-title::text')
        item_loader.add_css('topics', '.TopicLink .Popover div::text')
        item_loader.add_value('url', response.url)
        item_loader.add_css('content', '.QuestionHeader-detail div div span::text')
        item_loader.add_css('answer_num', '.List-headerText span::text')
        item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
        item_loader.add_css('watch_user_num', '.NumberBoard-value::text')

        item = item_loader.load_item()
        yield item
        yield scrapy.Request(self.start_answer_url.format(question_id=question_id, offset=0, limit=20),
                             headers=self.headers, callback=self.parse_answer)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号