# Assumed module-level imports for this method (not shown in the excerpt):
#     from datetime import datetime
#     from scrapy import Request
#     from scrapy.exceptions import CloseSpider
def parse(self, response):
    self.logger.info('parse: %s' % response)
    is_no_update = False
    # Get the list of news articles on the current page
    articles = response.css('div.article-snippet__info')
    if not articles:
        raise CloseSpider('article not found')
    for article in articles:
        # Close the spider if an article is missing its URL
        url_selectors = article.css('a::attr(href)')
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        url = url_selectors.extract()[0]
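        # Note: extract() returns a list of matching strings, so [0] takes
        # the first href; Scrapy's extract_first() (or .get() in newer
        # versions) does the same with a None fallback.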
        info_selectors = article.css('div.article-snippet__date')
        info_selectors = info_selectors.css('.timeago::text')
        if not info_selectors:
            raise CloseSpider('info_selectors not found')
        # Example: '13 Okt 2016 16:10'
        info_time = info_selectors.extract()[0]
        # Example: '13 Oct 2016 16:10'
        info_time = ' '.join([_(w) for w in info_time.split(' ')])
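        # The `_` call above is assumed to be a project translation helper
        # that maps Indonesian month abbreviations to English ('Okt' -> 'Oct')
        # so that strptime's '%b' directive can parse the month below.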
        # Parse the date information
        try:
            published_at_wib = datetime.strptime(info_time, '%d %b %Y %H:%M')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: {}'.format(e))
        published_at = wib_to_utc(published_at_wib)
        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break
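        # Note: breaking here assumes the listing page is ordered
        # newest-first, so every article after the first already-scraped
        # one is also old and can be skipped.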
        # For each remaining URL, create a new Scrapy Request
        yield Request(url, callback=self.parse_news)
    if is_no_update:
        self.logger.info('Media has no update')
        return
    # TODO: Collect news items
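
# For reference: `wib_to_utc` is a project helper defined elsewhere. A minimal
# sketch of one possible implementation, assuming WIB means Western Indonesian
# Time (Asia/Jakarta, UTC+7) and Python 3.9+'s zoneinfo is available:
from zoneinfo import ZoneInfo

def wib_to_utc(dt):
    """Convert a naive WIB datetime to a naive UTC datetime."""
    return (dt.replace(tzinfo=ZoneInfo('Asia/Jakarta'))
              .astimezone(ZoneInfo('UTC'))
              .replace(tzinfo=None))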