def parse(self, response):
    """Scrape publication entries from a listing page and follow pagination.

    For every ``li`` under ``#content div.entry-content ul.lcp_catlist`` a
    ``JustPublication`` item is yielded, built from the entry's title, its
    leading date text, the text of its ``<p>`` children and its links.
    Afterwards one request is yielded for each link found inside
    ``ul.lcp_paginator``, with this method as the callback.

    Args:
        response: The downloaded listing page; it must support ``.css()``
            selection and be accepted by ``LinkExtractor.extract_links``.

    Yields:
        ``JustPublication`` items, followed by ``scrapy.Request`` objects
        for the pagination links.
    """
    for li_item in response.css('#content div.entry-content ul.lcp_catlist li'):
        title = li_item.css('h3.lcp_post a::text').extract_first()
        if title is None:
            # Without a title link there is nothing to identify the entry
            # by; skip it instead of crashing and losing the rest of the
            # page (including the pagination requests below).
            continue
        title = title.strip()

        # The first text node of the entry is expected to hold the date.
        text_date = (li_item.css('::text').extract_first() or '').strip()

        # Reset both values for every entry so that a failed parse can never
        # leave ``date_obj`` unbound or carrying the previous entry's date.
        date_obj = None
        date = None
        try:
            # NOTE: ``%B`` matches month names of the active locale.
            date_obj = datetime.datetime.strptime(text_date, '%d %B %Y')
        except ValueError:
            pass
        else:
            date = date_obj.date().isoformat()

        paragraphs = li_item.xpath('p').xpath("string()").extract()
        description = '\n'.join(paragraphs)

        # The feedback period can only be measured against a known
        # publication date; otherwise it stays None.
        # NOTE(review): assumes get_feedback_date returns a value that
        # supports subtraction with a datetime -- confirm against the helper.
        feedback_days = None
        feedback_date = self.get_feedback_date(description)
        if feedback_date and date_obj is not None:
            feedback_days = (feedback_date - date_obj).days

        documents = self.get_documents_from_links(li_item.css('a'))

        yield JustPublication(
            title=title,
            type=self.get_type(title),
            identifier=self.slugify(title)[:127],
            date=date,
            institution='justitie',
            description=description,
            documents=documents,
            contact=self.get_contacts(description),
            feedback_days=feedback_days,
        )

    # Queue every page linked from the paginator, re-entering this callback.
    pagination_extractor = LinkExtractor(restrict_css='ul.lcp_paginator')
    for page in pagination_extractor.extract_links(response):
        yield scrapy.Request(page.url, callback=self.parse)
评论列表
文章目录