def parse_index(self, response):
    """Walk one listing page: schedule its article links, then the next page.

    Args:
        response: Listing-page response; the code reads ``response.url`` and
            queries it with ``response.css(...)``.

    Yields:
        Request: One request per ``<li>`` under ``#warp .list15`` that carries
        an ``href`` (callback ``self.parse_detail``), followed by at most one
        request for the URL taken from the third pager anchor (callback
        ``self.parse_index``).
    """
    for post_node in response.css('#warp .list15 li'):
        post_url = post_node.css('::attr(href)').extract_first("")
        if not post_url:
            # urljoin(base, "") returns base unchanged, so an item without a
            # link would re-request this listing page and hand it to
            # parse_detail; dont_filter=True would not deduplicate it.
            continue
        url_get = parse.urljoin(response.url, post_url)
        yield Request(url=url_get, dont_filter=True, callback=self.parse_detail)
        # Progress trace, emitted once the consumer resumes the generator.
        print(url_get)

    # Third anchor of the pager; presumably the "next page" link -- confirm
    # against the site's markup.
    next_urls = response.css('#warp .list15 .list_sort > a:nth-child(3) ::attr(href)').extract_first("")
    if not next_urls:
        return

    next_url = parse.urljoin(response.url, next_urls)
    last_second_url = response.css('#warp .list15 .list_sort > a:nth-child(2) ::attr(href)').extract_first("")
    # Hard-coded stop marker: once the second pager anchor points at
    # 'index248.htm' the crawl does not advance any further.
    # NOTE(review): looks like the final archive page -- confirm, and
    # consider making it configurable.
    if last_second_url != 'index248.htm':
        yield Request(url=next_url, dont_filter=True, callback=self.parse_index)
reference_news_spider.py 文件源码
python
阅读 27
收藏 0
点赞 0
评论 0
评论列表
文章目录