def parse(self, response):
    """Follow every link on the page, routing question pages to ``parse_question``.

    Each ``href`` found in ``response`` is resolved against ``response.url``
    and only URLs that start with ``https`` are kept. A URL containing
    ``zhihu.com/question/<digits>`` is cut off right after the question id
    and requested with ``self.parse_question`` as the callback; every other
    URL is requested with this method as the callback, so the crawl keeps
    following links from the pages it reaches.

    Args:
        response: The downloaded page; must provide ``css()`` and ``url``.

    Yields:
        scrapy.Request: One request per retained link, sent with
        ``self.headers``.
    """
    hrefs = response.css("a::attr(href)").extract()
    # Resolve relative links against the page URL, then keep https URLs only.
    absolute_urls = (parse.urljoin(response.url, href) for href in hrefs)
    https_urls = [url for url in absolute_urls if url.startswith("https")]

    # Raw string so ``\d`` is a regex escape rather than an invalid string
    # escape; the dot is escaped so only a literal "zhihu.com" matches.
    # Compiled once because the pattern does not change between iterations.
    question_pattern = re.compile(r"(.*zhihu\.com/question/(\d+))(/|$).*")

    for url in https_urls:
        match_obj = question_pattern.match(url)
        if match_obj:
            # group(1) is the URL up to and including the question id, so
            # anything after the id (e.g. a trailing path) is dropped.
            request_url = match_obj.group(1)
            yield scrapy.Request(
                request_url,
                headers=self.headers,
                callback=self.parse_question,
            )
        else:
            # Not a question page: fetch it and scan it for more links.
            yield scrapy.Request(url, headers=self.headers, callback=self.parse)
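# Comment list  (NOTE(review): appears to be page-navigation residue from the paste, not code)
# Table of contents  (NOTE(review): appears to be page-navigation residue from the paste, not code)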