def parse(self, response):
    """Parse one page of the article-list JSON and schedule follow-up requests.

    Yields a ``Request`` (callback ``self.parse_news``) for each entry in
    the ``contents`` list of the decoded response, stopping at the first
    entry whose publish time is not newer than
    ``self.media['last_scraped_at']``.  If no such entry was met, a request
    for the next page of the API is yielded, keyed by the publish time of
    the last entry minus one second, as long as that time is still newer
    than ``self.media['last_scraped_at']``.

    Raises:
        CloseSpider: if the response cannot be decoded or an entry cannot
            be processed (missing field, unparsable date, ...).
    """
    self.logger.info('parse: %s', response)
    is_no_update = False
    # Publish time, as parsed from the feed, of the last entry seen.
    # Stays None when the page contains no entries at all.
    published_at_wib = None
    try:
        # Get list of news from the current page
        articles = json.loads(response.text)
        for article in articles['contents']:
            url = article['friendlyURL']
            date = article['publishTime']
            published_at_wib = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
            # NOTE(review): wib_to_utc presumably converts WIB local time
            # to UTC so it can be compared with last_scraped_at -- confirm.
            published_at = wib_to_utc(published_at_wib)
            if self.media['last_scraped_at'] >= published_at:
                # Reached an article that is not newer than the last
                # scrape; nothing after it needs to be requested.
                is_no_update = True
                break
            yield Request('http://pilkada.arah.com' + url,
                          callback=self.parse_news)
    except Exception:
        # ``Exception`` (not a bare except) so that GeneratorExit and
        # KeyboardInterrupt raised at the ``yield`` are not converted
        # into CloseSpider.
        self.logger.exception('Failed to parse article list: %s', response)
        raise CloseSpider('article not found')

    if is_no_update:
        self.logger.info('Media have no update')
        return

    # Get more
    if published_at_wib is None:
        # Empty page: there is no timestamp to paginate from.
        return

    # The next page is addressed by a timestamp one second before the
    # last entry seen on this page.
    next_date = published_at_wib - timedelta(seconds=1)
    try:
        has_more = self.media['last_scraped_at'] < wib_to_utc(next_date)
    except Exception:
        # Pagination is best-effort: log the problem instead of silently
        # swallowing it, and stop following further pages.
        self.logger.exception('Failed to evaluate next page for %s', response)
        return

    if has_more:
        yield Request(
            'http://pilkada.arah.com/api/article/8/' + str(next_date)[:19],
            callback=self.parse)
# Collect news item
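# NOTE: stray web-page navigation text ("comment list", "table of contents")
# left over from extraction; it is not part of the spider code.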