def parse(self, response):
    """Parse a news-index page.

    Yields a ``Request`` (handled by ``self.parse_news``) for every article
    newer than ``self.media['last_scraped_at']``. Articles are assumed to be
    listed newest-first, so the first already-scraped article stops the scan
    (and pagination, if any).

    Raises:
        CloseSpider: if an article's date string cannot be parsed.
    """
    # Lazy %-args: the message is only formatted if INFO is enabled.
    self.logger.info('parse: %s', response)
    has_no_update = False
    # Get list of news from the current page
    for article in response.css('.col-sm-16 > .row > .col-sm-16 > .row'):
        title = article.css('h4::text').extract_first()
        url = article.css('a::attr(href)').extract_first()
        # NOTE: renamed from `time` to avoid shadowing the stdlib module.
        time_str = article.css('.indexTime::text').extract_first()  # 16:51
        date_str = article.css('.indexDay::text').extract_first()  # Sabtu, 15 Oktober 2016
        # extract_first() returns None when a selector matches nothing;
        # skip malformed rows instead of crashing on None.split()/concat.
        if not (title and url and time_str and date_str):
            self.logger.warning('Skipping malformed article entry: %s', article)
            continue
        date_str = date_str.split(',')[-1].strip()  # 15 Oktober 2016
        date_time = date_str + ' ' + time_str  # 15 Oktober 2016 16:51
        # `_` translates Indonesian month names (Oktober => October) so that
        # strptime's %B directive can parse them.
        date_time = ' '.join(_(token) for token in date_time.split(' '))
        # Parse date information
        try:
            published_at_wib = datetime.strptime(date_time, '%d %B %Y %H:%M')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: %s' % e)
        published_at = wib_to_utc(published_at_wib)
        # Anything at or before the last scrape has been seen already.
        if self.media['last_scraped_at'] >= published_at:
            has_no_update = True
            break
        # For each url we create new scrapy request
        yield Request(url, callback=self.parse_news)
    if has_no_update:
        self.logger.info('Media have no update')
        return
    # Currently has no more pages
# NOTE(review): stray non-Python page-footer text ("评论列表" / "文章目录",
# i.e. "comment list" / "article table of contents") was pasted here,
# breaking the file's syntax; preserved as a comment.