def link_parse(self, response):
    """Schedule article and pagination requests found on a listing page.

    Every ``//article/a/@href`` link is requested with
    ``self.parse_content`` as callback.  When the page contains no such
    link, the result of ``self.parse_content(response)`` is yielded
    instead.  Afterwards, pagination links whose numeric page index is
    greater than the index of the current page and lower than the
    ``SPIDER_DEEP`` setting are requested with this method as callback.

    :param response: downloaded page; must provide ``xpath()`` and ``url``.
    """
    # getint() coerces the setting, so a value supplied as a string is
    # still compared numerically below.  An unset setting becomes 0, which
    # means no pagination link is followed.
    max_page = get_project_settings().getint('SPIDER_DEEP')

    article_links = response.xpath("//article/a/@href").extract()
    if not article_links:
        # NOTE(review): if parse_content is a generator function, this
        # yields the generator object itself rather than its items --
        # confirm that this is intended.
        yield self.parse_content(response)
    else:
        for href in article_links:
            yield Request(DOMAIN + href, callback=self.parse_content)

    # NOTE(review): the original indentation was lost; pagination handling
    # is assumed to run for every page, not only when article links exist.
    page_links = response.xpath(
        "//div[@class='pagination']/ul/li/a/@href").extract()
    print("link_page: %s" % (page_links,))

    # Index of the page being parsed.  A URL that does not split into
    # exactly three "_"-separated parts is treated as page 1.
    current_index = 1
    url_parts = response.url.split("_")
    if len(url_parts) == 3:
        try:
            current_index = int(url_parts[-1].replace('.html', ''))
        except ValueError:
            # Current position is unknown, so pagination links cannot be
            # ordered relative to it; stop instead of crashing.
            return

    for page_href in page_links:
        href_parts = page_href.split("_")
        if len(href_parts) != 3:
            continue
        try:
            target_index = int(href_parts[-1].replace('.html', ''))
        except ValueError:
            # Link without a numeric page index: skip it rather than
            # abort the whole callback.
            continue
        # Only move forward, and only below the configured depth limit.
        if current_index < target_index < max_page:
            print(page_href)
            yield Request(DOMAIN + page_href, callback=self.link_parse)
# (web page residue) Comment list
# (web page residue) Article contents