republikaonline.py source code

python

Project: rojak  Author: pyk
# Imports this snippet relies on (not shown in the listing):
from datetime import datetime

from scrapy import Request
from scrapy.exceptions import CloseSpider

# sanitize() and wib_to_utc() are rojak project helpers; sketches of
# their likely behavior follow the listing.

def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False

        # Get list of news from the current page
        articles = response.css('div.wp-terhangat > div.item3')

        if not articles:
            raise CloseSpider('article not found')
        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example 'Wednesday, 02 November 2016'
            date_selectors = article.css('span.date::text')
            if not date_selectors:
                raise CloseSpider('date_selectors not found')

            # Parse date information
            try:
                date = date_selectors.extract()[0].split(' ')
                # Translate the Indonesian month name into the English
                # abbreviation that strptime's %b directive expects
                # Example: Wednesday, 02 Nov 2016
                date[2] = sanitize(date[2])
                published_at_wib = datetime.strptime(' '.join(date[1:]),
                    '%d %b %Y')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            # Stop paginating: this article was already scraped, or it
            # was published before 2015
            if (self.media['last_scraped_at'] >= published_at
                    or int(date[-1]) < 2015):
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # Try to follow the next page of the article list;
        # extract()[-1] raises IndexError when no pagination link exists
        try:
            next_page_url = response.css(
                'nav > ul > li > a::attr(href)').extract()[-1]
            if next_page_url:
                yield Request(next_page_url, callback=self.parse)
        except IndexError:
            pass

    # Collect news item (the parse_news callback, not shown in this listing)
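
The parse method leans on two project helpers whose definitions are not part of this listing. Below is a minimal sketch of each, assuming sanitize() maps Indonesian month names to the English abbreviations strptime's %b directive accepts and wib_to_utc() shifts Western Indonesia Time (UTC+7) to UTC; the rojak project's actual implementations may differ.

from datetime import timedelta

# Hypothetical month table: Indonesian names -> English %b abbreviations
INDO_TO_ENG_MONTH = {
    'Januari': 'Jan', 'Februari': 'Feb', 'Maret': 'Mar',
    'April': 'Apr', 'Mei': 'May', 'Juni': 'Jun',
    'Juli': 'Jul', 'Agustus': 'Aug', 'September': 'Sep',
    'Oktober': 'Oct', 'November': 'Nov', 'Desember': 'Dec',
}

def sanitize(month):
    # Fall back to the input for names strptime already accepts
    return INDO_TO_ENG_MONTH.get(month, month)

def wib_to_utc(wib_time):
    # WIB (Waktu Indonesia Barat) is UTC+7, so subtract seven hours
    return wib_time - timedelta(hours=7)

With these sketches, sanitize('Agustus') returns 'Aug', and a naive WIB datetime of 09:00 becomes 02:00 UTC.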