def parse(self, response):
    """Parse an article listing page and schedule the follow-up requests.

    1. For every article thumbnail link found on the page, yield a
       ``Request`` for the article URL with ``self.parse_detail`` as the
       callback. The thumbnail's image URL is passed along in
       ``meta["front_image_url"]``.
    2. If the page has a "next" pagination link, yield a ``Request`` for it
       that is handled by this method again.

    A response with HTTP status 404 is appended to ``self.fail_urls`` and
    counted under the ``failed_url`` crawler stat before parsing continues.

    Args:
        response: The downloaded listing page.

    Yields:
        ``Request`` objects for the article pages and for the next listing
        page, when one exists.
    """
    if response.status == 404:
        self.fail_urls.append(response.url)
        self.crawler.stats.inc_value("failed_url")

    # Each thumbnail anchor carries both the article link and its image.
    post_nodes = response.css("#archive .floated-thumb .post-thumb a")
    for post_node in post_nodes:
        image_url = post_node.css("img::attr(src)").extract_first("")
        post_url = post_node.css("::attr(href)").extract_first("")
        if not post_url:
            # urljoin(base, "") returns the listing URL itself; requesting it
            # with parse_detail as callback would parse the wrong page.
            continue

        # The href may be relative, so resolve it against the page URL.
        article_url = parse.urljoin(response.url, post_url)
        self.logger.debug("Scheduling article page: %s", article_url)
        yield Request(
            url=article_url,
            meta={"front_image_url": image_url},
            callback=self.parse_detail,
        )

    # Follow the pagination link, if any, and parse that page the same way.
    next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
    if next_url:
        yield Request(
            url=parse.urljoin(response.url, next_url),
            callback=self.parse,
        )
评论列表
文章目录