def parse(self, response):
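    """Parse a news index page and schedule article requests.

    Each article newer than self.media['last_scraped_at'] is yielded as a
    Request handled by parse_news; scraping stops at the first article that
    is not newer, otherwise the "next" pagination link is followed back into
    parse. NEWS_GRID, NEWS_HEADLINE and wib_to_utc are expected to be
    defined elsewhere in the spider module.
    """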
    self.logger.info('parse: {}'.format(response))
    is_no_update = False
    # Collect the list of news items on the current page, pairing each
    # selector with its layout type (grid item or headline topic)
    articles_grid = response.css('li:not(.last) > div.grid')
    articles = [(article, NEWS_GRID) for article in articles_grid]
    articles += [(article, NEWS_HEADLINE) for article in response.css('div.topic')]
    if not articles:
        raise CloseSpider('article not found')
    for article, article_type in articles:
        # Pick the link markup that matches this article's layout
        url_selectors = None
        if article_type == NEWS_GRID:
            url_selectors = article.css('h2 > a::attr(href)')
        elif article_type == NEWS_HEADLINE:
            url_selectors = article.css('h1 > a::attr(href)')
        # Close the spider if we don't find the article URL
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        url = url_selectors.extract()[0]
        self.logger.info('Url: {}'.format(url))
        # Example: 'Minggu, 09 Oct 2016 15:14' (Indonesian day name, WIB time)
        info_selectors = article.css('div.reg::text')
        if not info_selectors:
            raise CloseSpider('info_selectors not found')
        info = info_selectors.extract()[1]
        # Keep only the part after the comma, e.g. '09 Oct 2016 15:14'
        info_time = info.split(',')[1].strip()
        # Parse the publication date, which is local WIB (UTC+7) time
        try:
            published_at_wib = datetime.strptime(info_time, '%d %b %Y %H:%M')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: {}'.format(e))
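        # wib_to_utc is assumed to be a helper defined elsewhere in the
        # project that converts the naive WIB timestamp to UTC, roughly
        # dt - timedelta(hours=7).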
        published_at = wib_to_utc(published_at_wib)
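        # The listing is assumed to be newest-first, so reaching an article at
        # or before last_scraped_at means the rest is already known.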
        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break
        # For each new article URL, create a Scrapy request handled by parse_news
        yield Request(url, callback=self.parse_news)
    if is_no_update:
        self.logger.info('Media has no update')
        return
    # Collect news from the next page; extract_first() returns None when the
    # pager has no rel="next" link (e.g. on the last page)
    next_page = response.css('div.bu.fr > a[rel="next"]::attr(href)').extract_first()
    if next_page:
        next_page_url = response.urljoin(next_page)
        yield Request(next_page_url, callback=self.parse)
# Collect news item
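# parse_news (not shown in this section) is the callback that would extract
# the actual article fields from each detail page.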