# sindonews.py -- source file (Python code snippet)

def parse(self, response):
        """Parse a news listing page and schedule article and next-page requests.

        For every ``li > div.breaking-title`` entry the article link and its
        publication date text are extracted. The date is parsed, converted
        with ``wib_to_utc`` and compared with ``self.media['last_scraped_at']``.
        An entry newer than that yields a request handled by
        ``self.parse_news``. The first entry that is not newer ends the whole
        callback, so neither later entries nor the next page are requested.
        NOTE(review): this presumes the listing is ordered newest first --
        confirm against the site.

        Args:
            response: the downloaded listing page.

        Yields:
            Request: one per new article (callback ``self.parse_news``) and,
            if every entry on the page was new, one for the next listing page
            (callback ``self.parse``).

        Raises:
            CloseSpider: if an entry has no link, has no date text, or its
                date text cannot be parsed.
        """
        self.logger.info('parse: %s', response)

        for article in response.css('li > div.breaking-title'):
            # Example href:
            # http://metro.sindonews.com/read/1146316/171/penyidik-bareskrim-mulai-dalami-video-dugaan-penistaan-agama-1476179831
            urls = article.css('a::attr(href)').extract()
            if not urls:
                raise CloseSpider('url_selectors not found')
            # Resolve against the page URL so a relative href still produces
            # a valid request; an absolute href is returned unchanged.
            url = response.urljoin(urls[0])

            # Example 'Kamis, 13 Oktober 2016 - 11:18 WIB'
            date_texts = article.css('p::text').extract()
            if not date_texts:
                raise CloseSpider('date_time_str_selectors not found')

            try:
                # Keep the part after the weekday and drop the last four
                # characters (' WIB' in the example above), which leaves
                # e.g. '13 Oktober 2016 - 11:18'.
                date_time_str = date_texts[0].split(',')[1].strip()[:-4]
                # Each word is passed through `_` before parsing.
                # NOTE(review): presumably `_` maps Indonesian month names to
                # the names expected by '%B' -- confirm where it is defined.
                date_time_str = ' '.join(_(w) for w in date_time_str.split(' '))
                published_at_wib = datetime.strptime(
                    date_time_str, '%d %B %Y - %H:%M')
            except (IndexError, ValueError) as e:
                # IndexError: no comma in the text; ValueError: wrong format.
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                # Nothing newer than the last scrape: stop without paginating.
                self.logger.info('Media have no update')
                return

            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        # Follow the first pager entry whose non-active link holds the
        # "angle right" icon.
        for next_button in response.css('.mpaging > ul > li'):
            if next_button.css('a:not(.active) > .fa-angle-right'):
                next_page = next_button.css('a::attr(href)').extract()[0]
                yield Request(response.urljoin(next_page), callback=self.parse)
                break

    # Collect news item