def parse(self, response):
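    # Scan the article index: queue a parse_news request for every article
    # newer than the last scrape, stop early once older items appear, and
    # follow the 'NEXT' pagination link back into this callback.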
self.logger.info('parse: {}'.format(response))
is_no_update = False
# Collect list of news from current page
article_selectors = response.css('ul.indexlist > li')
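    # Fail fast: if the expected markup is missing, the page layout has
    # probably changed, so stop the whole crawl instead of scraping silently.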
if not article_selectors:
raise CloseSpider('article_selectors not found')
for article in article_selectors:
url_selectors = article.css('a::attr(href)')
if not url_selectors:
raise CloseSpider('url_selectors not found')
url = url_selectors.extract()[0]
# Example: 7 Oktober 2016 19:37
info_selectors = article.css('div.upperdeck::text')
if not info_selectors:
raise CloseSpider('info_selectors not found')
info = info_selectors.extract()[1]
info = info.split(',')[1].replace('\t','').strip()
# Example: 7 October 2016 19:37
info_time = info.split(' ')
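        # `_` is assumed to translate Indonesian month names token by token
        # ('Oktober' -> 'October') so strptime's '%B' below can parse them;
        # see the sketch after this method.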
info_time = ' '.join([_(s) for s in info_time])
# Parse date information
try:
published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
except ValueError as err:
raise CloseSpider('cannot_parse_date: {}'.format(err))
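        # wib_to_utc is assumed to convert the naive WIB (UTC+7) timestamp to UTC.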
published_at = wib_to_utc(published_at_wib)
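        # The index is assumed to list articles newest first, so the first item
        # at or before last_scraped_at means the rest is already scraped.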
if self.media['last_scraped_at'] >= published_at:
is_no_update = True
break
        # For each URL, create a new scrapy Request handled by parse_news
yield Request(url, callback=self.parse_news)
if is_no_update:
        self.logger.info('Media has no update')
return
# Collect news on next page
tag_selectors = response.css('div.pagination > a')
if not tag_selectors:
raise CloseSpider('tag_selectors not found')
for tag in tag_selectors:
more_selectors = tag.css('a::text')
if not more_selectors:
raise CloseSpider('more_selectors not found')
more = more_selectors.extract()[0]
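        # Only the link labelled 'NEXT' advances the listing; ignore the rest.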
if more == 'NEXT':
next_page = tag.css('a::attr(href)').extract()[0]
next_page_url = response.urljoin(next_page)
yield Request(next_page_url, callback=self.parse)
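
# --- Assumed module-level context (not part of the original snippet) ---
# A minimal sketch of the imports and helpers parse() relies on. The names
# Request, CloseSpider, datetime, wib_to_utc and `_` come from the calls above;
# the helper bodies below are illustrative assumptions, not the project's code.
from datetime import datetime, timedelta

from scrapy import Request
from scrapy.exceptions import CloseSpider

# Indonesian -> English month names so strptime's '%B' can match them.
MONTH_NAMES = {
    'Januari': 'January', 'Februari': 'February', 'Maret': 'March',
    'April': 'April', 'Mei': 'May', 'Juni': 'June', 'Juli': 'July',
    'Agustus': 'August', 'September': 'September', 'Oktober': 'October',
    'November': 'November', 'Desember': 'December',
}

def _(token):
    # Translate a single token when it is an Indonesian month name,
    # otherwise return it unchanged (day numbers, years, HH:MM).
    return MONTH_NAMES.get(token, token)

def wib_to_utc(wib_datetime):
    # WIB (Waktu Indonesia Barat) is UTC+7, so subtract seven hours.
    return wib_datetime - timedelta(hours=7)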
# Collect news item