def topic_parse(self, response):
    """Parse a subtopic listing and schedule one top-answers request per subtopic.

    The response body is decoded as JSON; its ``msg`` entry is joined into a
    single HTML string from which subtopic names (``<strong>`` text) and
    subtopic link paths (``<a target="_blank" href=...>``) are extracted with
    regular expressions.

    Args:
        response: The downloaded response. It must expose ``status``,
            ``request`` (with ``meta`` and ``headers``), ``url`` and
            ``body_as_unicode()``.

    Yields:
        Request: One request per extracted subtopic, targeting page 3 of its
        ``/top-answers`` listing and handled by ``self.top_answers_parse``.
        Nothing is yielded when the status is 400, 403 or 302.

    Raises:
        ValueError: If the body is not valid JSON (from ``json.loads``).
        KeyError: If the decoded JSON has no ``msg`` entry.
    """
    if response.status in (400, 403, 302):
        # Flag the originating request. Presumably a downloader middleware
        # reads this flag to switch proxies -- confirm against the middleware.
        response.request.meta["change_proxy"] = True
        # .get() keeps this error path from crashing with KeyError when the
        # request was sent without a Referer header; fall back to the URL.
        referer = response.request.headers.get("Referer", response.url)
        print("?????????{url}".format(url=referer))
        return

    payload = json.loads(response.body_as_unicode())
    html = ''.join(payload['msg'])

    subtopic_names = re.findall(r'<strong>(.*?)</strong>', html)
    subtopic_urls = re.findall(r'<a target="_blank" href="([^"]*)".*?>', html)
    print("subtopic: %s" % len(subtopic_names))

    # Process at most one link per extracted name. Slicing (rather than
    # indexing by the name count) avoids an IndexError when the link pattern
    # matches fewer times than the name pattern.
    for topic_path in subtopic_urls[:len(subtopic_names)]:
        base_url = "https://www.zhihu.com" + topic_path
        yield Request(
            url=base_url + "/top-answers?page=3",
            headers=self.set_headers(base_url + "/hot"),
            # NOTE(review): the jar is constructed here but load() is never
            # called in this method, so it presumably carries no cookies --
            # confirm whether the saved 'cookies' file was meant to be sent.
            cookies=cookielib.LWPCookieJar(filename='cookies'),
            callback=self.top_answers_parse,
        )
# ????????????????
评论列表
文章目录