def parse_songci(self, response):
    """Parse one poem page into a ``SongCiItem`` and yield it.

    Populates ``url``; ``title`` (plus ``tune_name`` when the page heading
    splits into exactly two parts on '·'); ``dynasty`` / ``author`` from
    the labelled ``div.son2>p`` paragraphs; and ``content``.  The item is
    yielded even when no content could be located -- that case is only
    logged as an error.

    NOTE(review): in the source the three label literals were '??', '??'
    and '???', which looks like non-ASCII text lost to an encoding error.
    Because the two dict keys were identical they collapsed into a single
    entry, so 'dynasty' could never be set.  They are restored here to
    what are presumably the page's Chinese labels ("dynasty", "author",
    "original text" + full-width colon) -- confirm against the live markup.
    """
    item = SongCiItem()
    item['url'] = response.url

    full_title = response.css('div.son1>h1::text').extract_first()
    if full_title:
        try:
            # Unpacking succeeds only for exactly two parts.
            item['tune_name'], item['title'] = full_title.split('·')
        except ValueError:
            # Zero or several separators: keep the heading as the title.
            item['title'] = full_title

    labels = {'朝代': 'dynasty', '作者': 'author'}
    son2_p = response.css('div.son2>p')
    for p in son2_p:
        # Query once per paragraph; texts[0] is the label, texts[1] the value.
        texts = p.css('::text').extract()
        if len(texts) < 2:
            # No label/value pair here; indexing would raise.
            continue
        for name, field in labels.items():
            if name in texts[0]:
                item[field] = texts[1]

    content = ''.join(response.css('div#cont::text').extract()).strip()
    if content:
        item['content'] = content
    else:
        # Fallback when div#cont has no direct text: take every text node
        # that follows the content label inside the div.son2 paragraphs.
        all_p_texts = son2_p.css('::text').extract()
        try:
            start = all_p_texts.index('原文：') + 1
        except ValueError:
            self.logger.error('Cannot parse item. url=%s', response.url)
        else:
            item['content'] = '\n'.join(all_p_texts[start:]).strip()
    yield item
评论列表
文章目录