# merdekacom.py -- source file, Python code snippet

def parse(self, response):
        """Yield news items from one listing page, then request the next page.

        The response body is decoded as JSON and its ``'response'`` entry is
        iterated as a list of articles. For every article the
        ``'news_date_publish'`` string is parsed, converted with
        ``wib_to_utc`` and compared against ``self.media['last_scraped_at']``.
        Iteration stops at the first article that is not newer than the last
        scrape (presumably the API returns newest articles first -- confirm
        against the API). Otherwise ``self.parse_news`` is yielded for each
        entry of the article's ``'news_content'``.

        If the page was non-empty and no already-scraped article was found,
        a ``Request`` for the next page is yielded with this method as its
        callback.

        Raises:
            CloseSpider: when ``'news_date_publish'`` cannot be parsed with
                the ``'%Y-%m-%d %H:%M:%S'`` format.
        """
        # Lazy logging args: the message is only formatted if it is emitted.
        self.logger.info('parse: %s', response)

        # Collect list of news from current page
        articles = json.loads(response.body)['response']
        for article in articles:
            # Example: 2016-10-12 15:16:04
            date_time_str = article['news_date_publish']

            # Parse date information. strptime signals malformed input with
            # ValueError and a non-string value with TypeError; anything else
            # is a genuine bug and should not be disguised as a date problem.
            try:
                published_at_wib = datetime.strptime(
                    date_time_str, '%Y-%m-%d %H:%M:%S')
            except (ValueError, TypeError) as e:
                raise CloseSpider('cannot_parse_date: {}'.format(e))
            published_at = wib_to_utc(published_at_wib)

            # Nothing newer than the previous scrape from here on: stop
            # without following pagination.
            if self.media['last_scraped_at'] >= published_at:
                self.logger.info('Media have no update')
                return

            for sub_article in article['news_content']:
                yield self.parse_news(article, sub_article)

        # Collect news on next page
        if articles:
            # Example: 'http://api.merdeka.com/mobile/gettag/pilgub-dki/0/20/L9pTAoWB269T&-E/'
            # With the trailing slash, the fourth segment from the end is the
            # numeric one that gets advanced by 20 (presumably the offset,
            # with 20 being the page size seen in the example URL).
            url_parts = response.url.split('/')
            url_parts[-4] = str(int(url_parts[-4]) + 20)
            yield Request('/'.join(url_parts), callback=self.parse)

    # Collect news item