def get_chapterurl(self, response):
    """Extract novel metadata from a response and schedule its chapter page.

    The response body is parsed with BeautifulSoup. Metadata is read from
    the first ``<table>`` in the document (its first link gives the
    category; cells 1, 2 and 4 give author, serial status and serial
    number), and the chapter-list URL from the ``a.read`` link inside
    ``p.btnlinks``.

    Args:
        response: Response whose ``meta`` carries ``'name'`` and ``'url'``
            entries, as read below.

    Yields:
        The populated ``DingdianItem``, followed by a ``Request`` for the
        chapter-list URL whose callback is ``self.get_chapter`` and whose
        ``meta`` carries the extracted ``novel_id``.
    """
    def strip_nbsp(value):
        # Drop non-breaking spaces (U+00A0) from the extracted text.
        return str(value).replace('\xa0', '')

    resp = BeautifulSoup(response.text, 'lxml')

    # Look the table up once; both the cells and the category link live in it.
    table = resp.find('table')
    tds = table.find_all('td')
    category = table.find('a').get_text()

    base_url = resp.find('p', class_='btnlinks').find('a', class_='read')['href']

    # Use the last non-empty path segment as the id. A fixed-width slice
    # ([-6:-1]) only works for ids of four or five characters: shorter ids
    # picked up digits of the parent directory and longer ids were cut off.
    novel_id = str(base_url).rstrip('/').rsplit('/', 1)[-1]

    item = DingdianItem()
    item['name'] = strip_nbsp(response.meta['name'])
    item['novelurl'] = response.meta['url']
    item['category'] = str(category).replace('/', '')
    item['author'] = strip_nbsp(tds[1].get_text())
    item['novel_id'] = novel_id
    item['serialstatus'] = strip_nbsp(tds[2].get_text())
    item['serialnumber'] = strip_nbsp(tds[4].get_text())
    yield item

    yield Request(
        url=base_url,
        callback=self.get_chapter,
        meta={'novel_id': novel_id},
    )
评论列表
文章目录