def parse(self, response):
    """Parse one page of a paginated JSON article listing.

    Yields a parse_news result for every sub-article on the page, then
    follows the next page (the offset path segment advances by 20) as
    long as the current page was non-empty.  Stops early — without
    paginating further — as soon as an article older than the media's
    ``last_scraped_at`` timestamp is seen, since the listing is ordered
    newest-first.

    @param response: API response whose body is JSON of the form
        ``{'response': [article, ...]}``; each article carries
        ``news_date_publish`` (e.g. ``2016-10-12 15:16:04``, WIB) and a
        ``news_content`` list of sub-articles.
    @raise CloseSpider: if a publish date cannot be parsed.
    """
    self.logger.info('parse: {}'.format(response))

    # Collect list of news from current page
    articles = json.loads(response.body)['response']
    for article in articles:
        # Example: 2016-10-12 15:16:04
        date_time_str = article['news_date_publish']
        # Parse date information.  strptime raises ValueError on a
        # malformed string and TypeError on a non-string, so catch
        # exactly those instead of a blanket Exception.
        try:
            published_at_wib = datetime.strptime(date_time_str,
                                                 '%Y-%m-%d %H:%M:%S')
        except (ValueError, TypeError) as e:
            raise CloseSpider('cannot_parse_date: {}'.format(e))

        published_at = wib_to_utc(published_at_wib)
        if self.media['last_scraped_at'] >= published_at:
            # Already scraped up to here; nothing newer remains on this
            # or any later page, so stop without following pagination.
            self.logger.info('Media have no update')
            return
        for sub_article in article['news_content']:
            yield self.parse_news(article, sub_article)

    # Collect news on next page
    # Example: 'http://api.merdeka.com/mobile/gettag/pilgub-dki/0/20/L9pTAoWB269T&-E/'
    if articles:
        next_page_url = response.url.split('/')
        # The 4th-from-last path segment is the offset; page size is 20.
        next_page_url[-4] = str(int(next_page_url[-4]) + 20)
        next_page_url = '/'.join(next_page_url)
        yield Request(next_page_url, callback=self.parse)
# Collect news item
# NOTE(review): the two lines below were stray scraped-page residue pasted
# into the source ("评论列表" = comment list, "文章目录" = article table of
# contents).  Commented out — as bare text they were a SyntaxError.
# 评论列表
# 文章目录