def parse_item(self, response):
    """Parse an article page and yield one populated ``JianshuItem``.

    Reads the title, the article body HTML, the image URLs and the JSON
    metadata embedded in the ``<script data-name="note">`` tag, converts the
    body HTML to text with ``html2text`` and copies selected metadata fields
    onto the item.

    Pages that lack the title, the body or the metadata script, or whose
    metadata is not valid JSON, are logged and skipped instead of letting
    ``IndexError``/``ValueError`` escape from the callback.

    Args:
        response: Page response exposing ``xpath()`` and ``url``.

    Yields:
        A ``JianshuItem`` with ``title``, ``content``, ``url``, ``slug``,
        ``views_count``, ``likes_count`` and ``images`` set.

    Raises:
        KeyError: If the metadata JSON lacks one of the copied keys.
    """
    import logging

    log = logging.getLogger(__name__)

    titles = response.xpath('//h1[@class="title"]/text()').extract()
    bodies = response.xpath('//div[@class="show-content"]').extract()
    attr = response.xpath('//script[@data-name="note"]/text()').extract()
    images = response.xpath('//div[@class="image-package"]/img/@src').extract()

    # Any of these can be absent on a page that does not match the expected
    # layout; indexing [0] blindly would raise IndexError.
    if not (titles and bodies and attr):
        log.warning(
            "Skipping %s: title, body or note metadata not found",
            response.url,
        )
        return

    try:
        notes = json.loads(attr[0].strip())
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        log.warning("Skipping %s: note metadata is not valid JSON", response.url)
        return

    # Convert the article HTML to Markdown-style text.
    h = html2text.HTML2Text()
    h.ignore_links = False  # keep hyperlinks in the output
    h.inline_links = False  # reference-style rather than inline links
    content = h.handle(bodies[0])

    item = JianshuItem()
    item["title"] = titles[0]
    # Undo line breaks that html2text's wrapping inserts after '-' and
    # before '?'.
    # NOTE(review): the '?' literal may be an encoding-damaged character
    # from the original source -- confirm the intended character.
    item["content"] = content.replace('-\n', '-').replace('\n?', '?')
    item["url"] = notes['url']
    item["slug"] = notes['slug']
    item["views_count"] = notes['views_count']
    item["likes_count"] = notes['likes_count']
    item["images"] = images
    yield item
评论列表
文章目录