# Imports this method relies on (Scrapy spider context):
from scrapy import Request
from scrapy.exceptions import CloseSpider


def parse(self, response):
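    """Crawl the Kompas Pilkada DKI 2017 topic listing.

    Each article link is rewritten into its JSON API equivalent and handed
    to parse_news. Crawling stops early once an article older than the last
    scrape is seen; on the first run all 25 listing pages are enqueued.
    """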
    is_no_update = False
    news_selector = response.css("ul.clearfix > li > div.tleft")
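    # CloseSpider aborts the whole crawl; an empty selector here almost
    # certainly means the listing page layout has changed.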
    if not news_selector:
        raise CloseSpider('news_selectors not found')
    for news in news_selector:
        url_selectors = news.css("div.tleft > h3 > a::attr(href)")
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        # Example listing URL:
        #   http://megapolitan.kompas.com/read/xml/2016/10/18/17244781/ini.alat.peraga.kampanye.yang.boleh.dibuat.cagub-cawagub.dki
        # Example API URL after rewriting (a different article, same shape):
        #   http://api.kompas.com/external/?type=readdua&kanal=home&command=.xml.2016.10.15.07300081&format=json&APPSF0UNDRYBYPASS=%20HTTP/1.1
        url = url_selectors.extract()[0]
        url = ('http://api.kompas.com/external/?type=readdua&kanal=home'
               '&command=.xml.' + '.'.join(url.split('/')[-5:-1]) +
               '&format=json&APPSF0UNDRYBYPASS=%20HTTP/1.1')
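        # Worked example: for the listing URL above, url.split('/')[-5:-1]
        # yields ['2016', '10', '18', '17244781'], so the command parameter
        # becomes .xml.2016.10.18.17244781 (same shape as the API example).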
        date_selectors = news.css("div.grey.small::text")
        if not date_selectors:
            raise CloseSpider('date_selectors not found')
        raw_date = date_selectors.extract()[0]
        # Parse the publication date; abort the crawl if the format changed.
        try:
            published_at = self.convert_date(raw_date)
        except Exception as e:
            raise CloseSpider('cannot_parse_date: %s' % e)
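        # The break below relies on the listing being ordered newest-first:
        # once one article predates the last scrape, so does everything after.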
        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break
        # For each URL, yield a new Scrapy request; parse_news consumes the
        # JSON returned by the API endpoint.
        yield Request(url=url, callback=self.parse_news)
    if is_no_update:
        self.logger.info('Media has no update')
        return
    # For Kompas we don't rely on the site's pagination: its widget only
    # exposes 17 pages even though 25 actually exist, so enumerate all 25
    # directly on the first run.
    if self.first_time:
        template_url = 'http://lipsus.kompas.com/topikpilihanlist/3754/{}/Pilkada.DKI.2017'
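        # e.g. page 1 -> http://lipsus.kompas.com/topikpilihanlist/3754/1/Pilkada.DKI.2017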
        for page in range(1, 26):  # range() so this also runs on Python 3
            next_url = template_url.format(page)
            yield Request(next_url, callback=self.parse)
        self.first_time = False
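

# --- Hypothetical helper (not shown in the original snippet) ---
# parse() calls self.convert_date(raw_date) above. This is a minimal sketch,
# ASSUMING the listing renders dates like "18 Oktober 2016 | 17:24 WIB"
# (Indonesian month names, WIB time suffix); adjust to the real markup.
from datetime import datetime

MONTHS_ID = {
    'Januari': '01', 'Februari': '02', 'Maret': '03', 'April': '04',
    'Mei': '05', 'Juni': '06', 'Juli': '07', 'Agustus': '08',
    'September': '09', 'Oktober': '10', 'November': '11', 'Desember': '12',
}


def convert_date(self, raw_date):
    # "18 Oktober 2016 | 17:24 WIB" -> datetime(2016, 10, 18, 17, 24)
    date_part, _, time_part = raw_date.partition('|')
    day, month_name, year = date_part.split()
    hour_minute = time_part.replace('WIB', '').strip() or '00:00'
    stamp = '%s-%s-%s %s' % (year, MONTHS_ID[month_name], day.zfill(2), hour_minute)
    return datetime.strptime(stamp, '%Y-%m-%d %H:%M')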