def top_answers_parse(self, response):
    """Parse a Zhihu topic "top answers" page.

    Yields, in order:
      * the original request again (with ``meta['change_proxy']`` set) when
        the response body is a ban/400 marker, so the downloader middleware
        can rotate the proxy;
      * otherwise one ``TopicItem`` describing the topic, followed by one
        ``Request`` per answer link found on the page (each carrying the
        topic id in ``meta`` for ``answer_parse``).
    """
    # Bodies returned when the proxy is banned or the request was rejected.
    # NOTE: under Python 3 Scrapy, response.body is bytes, so the original
    # str-only "banned" marker could never match — check both forms.
    blocked_markers = (
        "banned",
        b"banned",
        b"{'reason': b'Bad Request', 'status': 400}",
        "{'reason': b'Bad Request', 'status': 400}",
    )
    if response.body in blocked_markers:
        # Retry the same request through a different proxy.
        req = response.request
        req.meta["change_proxy"] = True
        yield req
        return

    # Example URL: https://www.zhihu.com/topic/19551137/top-answers?page=2
    # Take the numeric id from the path segment right after "/topic/"
    # (more robust than the previous fixed [28:end] slice, equivalent for
    # canonical topic URLs).
    topic_id = int(response.url.split("/topic/", 1)[1].split("/", 1)[0])

    topic_name = response.selector.xpath(
        '//h1[@class="zm-editable-content"]/text()'
    ).extract_first()
    topic_description = response.selector.xpath(
        '//div[@id="zh-topic-desc"]/div[@class="zm-editable-content"]/text()'
    ).extract_first()

    # Emit one item describing the topic itself.
    topicItem = TopicItem()
    topicItem['type'] = 'topic'
    topicItem['topic_id'] = topic_id
    topicItem['topic_name'] = topic_name
    topicItem['topic_description'] = topic_description
    yield topicItem

    # Follow every answer link on this page; hrefs are site-relative.
    answer_url_xpath_rule = '//div[@class="feed-item feed-item-hook folding"]/link/@href'
    for href in response.selector.xpath(answer_url_xpath_rule).extract():
        yield Request(
            url="https://www.zhihu.com" + href,
            headers=self.set_headers(None),
            cookies=cookielib.LWPCookieJar(filename='cookies'),
            callback=self.answer_parse,
            meta={'topic_id': topic_id},
        )
# Stray page text left behind by a copy/paste — kept as a comment so the
# module stays syntactically valid:
# 评论列表 (comment list)
# 文章目录 (article table of contents)