# Note: this method uses CloseSpider (scrapy.exceptions), Request (scrapy),
# datetime (standard library), and the project helpers sanitize() and
# wib_to_utc(); the corresponding imports live at the top of the spider module.
def parse(self, response):
    self.logger.info('parse: %s' % response)
    is_no_update = False

    # Get the list of news from the current page
    articles = response.css('li.media')
    if not articles:
        raise CloseSpider('article not found')

    for article in articles:
        # Close the spider if we don't find the list of URLs
        url_selectors = article.css('a::attr(href)')
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        url = url_selectors.extract()[0]

        # Example: '02 November 2016'
        date_selectors = article.css('time::text')
        if not date_selectors:
            raise CloseSpider('date_selectors not found')

        # Parse the date information
        try:
            date = date_selectors.extract()[0].split(' ')
            # Sanitize the month - Indonesian month to English month
            # Example: 02 Nov 2016
            date[1] = sanitize(date[1])
            published_at_wib = datetime.strptime(' '.join(date), '%d %b %Y')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: %s' % e)

        # Stop once we reach articles that were already scraped
        published_at = wib_to_utc(published_at_wib)
        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break

        # For each URL we create a new Scrapy request; the extracted href has
        # no scheme, so 'http:' is prepended
        yield Request('http:' + url, callback=self.parse_news)

    if is_no_update:
        self.logger.info('Media have no update')
        return

    # Try getting the next page
    try:
        next_page_url = response.xpath(
            '//section[@class="pagination-numeric"]/span/a/@href')[-1].extract()
        if next_page_url and next_page_url != response.url:
            yield Request(next_page_url, callback=self.parse)
    except Exception:
        # No pagination link found; stop here
        pass
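
# ---------------------------------------------------------------------------
# sanitize() and wib_to_utc() are not defined in this snippet; they are small
# project helpers. The sketch below is only an illustration of what they might
# look like (the names match the calls above, everything else is assumed):
# sanitize() maps an Indonesian month name to the English abbreviation that
# strptime's '%b' expects, and wib_to_utc() shifts a naive WIB (UTC+7)
# datetime to UTC. In the real project they would live outside the spider
# class (e.g. in a shared utils module).
# ---------------------------------------------------------------------------
from datetime import timedelta

MONTHS = {
    'Januari': 'Jan', 'Februari': 'Feb', 'Maret': 'Mar', 'April': 'Apr',
    'Mei': 'May', 'Juni': 'Jun', 'Juli': 'Jul', 'Agustus': 'Aug',
    'September': 'Sep', 'Oktober': 'Oct', 'November': 'Nov', 'Desember': 'Dec',
}

def sanitize(month_name):
    # Fall back to the original value if the month is already an English name
    return MONTHS.get(month_name, month_name)

def wib_to_utc(dt):
    # WIB (Waktu Indonesia Barat) is UTC+7, so subtract seven hours
    return dt - timedelta(hours=7)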
# Collect news item