spider.py 文件源码

python
阅读 36 收藏 0 点赞 0 评论 0

项目:zhihu_scrapy 作者: gxh123 项目源码 文件源码
def top_answers_parse(self, response):
    """Parse a zhihu topic's top-answers listing page.

    Yields, in order:
      * the retried ``Request`` (with ``meta['change_proxy']`` set) when the
        response body is a known ban/rejection marker;
      * otherwise one ``TopicItem`` describing the topic, followed by one
        ``Request`` per answer link found on the page, each dispatched to
        ``self.answer_parse`` with the topic id in ``meta``.

    :param response: Scrapy response for a URL of the form
        ``https://www.zhihu.com/topic/<topic_id>/top-answers?page=N``
    """
    # Bodies the site returns when our proxy is banned or the request was
    # rejected.  ``response.body`` is bytes on Python 3, so every marker is
    # listed in both str and bytes form — the original str-only "banned"
    # entry could never match a bytes body.
    banned_markers = {
        "banned",
        b"banned",
        "{'reason': b'Bad Request', 'status': 400}",
        b"{'reason': b'Bad Request', 'status': 400}",
    }
    if response.body in banned_markers:
        # Re-queue the same request and tell the proxy middleware to switch.
        req = response.request
        req.meta["change_proxy"] = True
        # Without dont_filter the duplicate filter would silently drop this
        # retry, since the identical request was already seen.
        req.dont_filter = True
        yield req
        return

    # URL shape: https://www.zhihu.com/topic/<topic_id>/top-answers?page=N
    # The fixed prefix "https://www.zhihu.com/topic/" is 28 characters long;
    # the id runs from there to the last "/".
    end = response.url.rfind("/")
    topic_id = int(response.url[28:end])

    topic_name = response.selector.xpath(
        '//h1[@class="zm-editable-content"]/text()'
    ).extract_first()
    topic_description = response.selector.xpath(
        '//div[@id="zh-topic-desc"]/div[@class="zm-editable-content"]/text()'
    ).extract_first()

    # Emit the topic metadata item first.
    topicItem = TopicItem()
    topicItem['type'] = 'topic'
    topicItem['topic_id'] = topic_id
    topicItem['topic_name'] = topic_name
    topicItem['topic_description'] = topic_description
    yield topicItem

    # The answer links on the page are relative; prefix the site root.
    answer_url_xpath_rule = '//div[@class="feed-item feed-item-hook folding"]/link/@href'
    answer_urls_temp = response.selector.xpath(answer_url_xpath_rule).extract()
    answer_urls = ["https://www.zhihu.com" + temp for temp in answer_urls_temp]

    for answer_url in answer_urls:
        yield Request(
            url=answer_url,
            headers=self.set_headers(None),
            cookies=cookielib.LWPCookieJar(filename='cookies'),
            callback=self.answer_parse,
            meta={'topic_id': topic_id},
        )
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号