kompas.py 文件源码

python
阅读 13 收藏 0 点赞 0 评论 0

项目:rojak 作者: pyk 项目源码 文件源码
def parse(self, response):
        """Parse a Kompas topic-list index page.

        For every article entry found on the page, rebuild its URL against
        the Kompas JSON API endpoint and yield a ``Request`` routed to
        ``self.parse_news``.  Stops early once an already-scraped article is
        reached (NOTE(review): this assumes the listing is newest-first —
        confirm against the live page).  On the very first crawl, also fans
        out requests over all 25 topic-list pages, because the site's own
        pagination only exposes 17 of them.

        :param response: scrapy ``Response`` for a topic-list page.
        :raises CloseSpider: when an expected selector is missing (site
            layout changed) or an article date cannot be parsed — fail
            loudly instead of silently scraping nothing.
        """
        is_no_update = False

        news_selector = response.css("ul.clearfix > li > div.tleft")
        if not news_selector:
            raise CloseSpider('news_selectors not found')
        for news in news_selector:
            url_selectors = news.css("div.tleft > h3 > a::attr(href)")
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            # Rebuild the article link into the JSON API form.  The last
            # four path components before the slug (year/month/day/id) become
            # the dot-separated ``command`` value, e.g.
            #   http://megapolitan.kompas.com/read/xml/2016/10/18/17244781/ini.alat...
            # becomes
            #   http://api.kompas.com/external/?...&command=.xml.2016.10.18.17244781&...
            url = url_selectors.extract()[0]
            url = ('http://api.kompas.com/external/?type=readdua&kanal=home'
                   '&command=.xml.' + '.'.join(url.split('/')[-5:-1]) +
                   '&format=json&APPSF0UNDRYBYPASS=%20HTTP/1.1')

            date_selectors = news.css("div.grey.small::text")
            if not date_selectors:
                raise CloseSpider('date_selectors not found')
            raw_date = date_selectors.extract()[0]

            # Parse the raw date string; wrap any failure so the spider
            # shuts down with a descriptive reason.
            try:
                published_at = self.convert_date(raw_date)
            except Exception as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            # Already scraped this far — everything after this entry is
            # assumed older, so stop processing the page.
            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each article URL create a new scrapy request.
            yield Request(url=url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # For the Kompas case we don't rely on the site's pagination:
        # it claims a maximum of 17 pages but 25 actually exist, so on
        # the first crawl enumerate all 25 explicitly.
        if self.first_time:
            template_url = 'http://lipsus.kompas.com/topikpilihanlist/3754/{}/Pilkada.DKI.2017'
            for i in xrange(25):
                page = i + 1
                next_url = template_url.format(page)
                yield Request(next_url, callback=self.parse)
            self.first_time = False
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号