publications.py 文件源码

python
阅读 26 收藏 0 点赞 0 评论 0

项目:czl-scrape 作者: code4romania 项目源码 文件源码
def parse(self, response):
        """Yield one JustPublication per listed entry, then follow pagination.

        Each ``li`` under ``#content div.entry-content ul.lcp_catlist`` is
        turned into an item: the title comes from the ``h3.lcp_post`` link,
        the publication date from the first text node of the entry, and the
        description from the entry's ``p`` children.

        After all entries are yielded, every link found inside
        ``ul.lcp_paginator`` is requested with this same method as callback.

        :param response: the response for a publications listing page.
        :return: a generator of ``JustPublication`` items and
            ``scrapy.Request`` objects.
        """
        for li_item in response.css('#content div.entry-content ul.lcp_catlist li'):
            title = li_item.css('h3.lcp_post a::text').extract_first().strip()
            text_date = li_item.css('::text').extract_first().strip()

            try:
                date_obj = datetime.datetime.strptime(text_date, '%d %B %Y')
            except ValueError:
                # Reset both values: leaving date_obj unbound (first entry) or
                # stale (later entries) would break or corrupt the
                # feedback_days computation below.
                date_obj = None
                date = None
            else:
                date = date_obj.date().isoformat()

            paragraphs = li_item.xpath('p').xpath("string()").extract()
            description = '\n'.join(paragraphs)

            feedback_days = None
            feedback_date = self.get_feedback_date(description)
            # The difference is only meaningful when the publication date was
            # parsed successfully for *this* entry.
            # NOTE(review): assumes get_feedback_date returns a datetime (or a
            # falsy value) -- confirm against its definition.
            if feedback_date and date_obj is not None:
                feedback_days = (feedback_date - date_obj).days

            links = li_item.css('a')
            documents = self.get_documents_from_links(links)

            yield JustPublication(
                title=title,
                type=self.get_type(title),
                # Identifier is the slugified title capped at 127 characters.
                identifier=self.slugify(title)[0:127],
                date=date,
                institution='justitie',
                description=description,
                documents=documents,
                contact=self.get_contacts(description),
                feedback_days=feedback_days
            )

        # Queue every page linked from the paginator for the same treatment.
        pagination_extractor = LinkExtractor(restrict_css='ul.lcp_paginator')
        for page in pagination_extractor.extract_links(response):
            yield scrapy.Request(page.url, callback=self.parse)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号