arah.py 文件源码

python
阅读 18 收藏 0 点赞 0 评论 0

项目:rojak 作者: pyk 项目源码 文件源码
def parse(self, response):
        """Parse one page of the article-list JSON API and schedule requests.

        Yields a ``Request`` (handled by ``self.parse_news``) for every
        article in ``contents`` until an article is reached whose publish
        time is not newer than ``self.media['last_scraped_at']``. If no such
        article is found, a request for the next API page is yielded, keyed
        by the publish time of the last article seen minus one second.

        Raises:
            CloseSpider: if the response body cannot be decoded or an
                article entry cannot be processed.
        """
        self.logger.info('parse: %s', response)
        is_no_update = False
        # Publish time of the last article seen, as parsed from the API
        # (the original variable name suggests WIB local time); stays None
        # when the page contains no articles.
        published_at_wib = None

        try:
            # Get list of news from the current page
            articles = json.loads(response.text)

            for article in articles['contents']:
                url = article['friendlyURL']
                date = article['publishTime']
                published_at_wib = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                published_at = wib_to_utc(published_at_wib)

                if self.media['last_scraped_at'] >= published_at:
                    # Everything from here on has already been scraped.
                    is_no_update = True
                    break

                yield Request('http://pilkada.arah.com' + url,
                              callback=self.parse_news)
        except Exception:
            # ``Exception`` (not a bare except) so that GeneratorExit and
            # KeyboardInterrupt are not converted into CloseSpider.
            self.logger.exception('Failed to parse article list')
            raise CloseSpider('article not found')

        if is_no_update:
            self.logger.info('Media have no update')
            return

        if published_at_wib is None:
            # Empty page: there is no timestamp to paginate from.
            return

        # Get more
        try:
            next_date = published_at_wib - timedelta(seconds=1)

            if self.media['last_scraped_at'] < wib_to_utc(next_date):
                # str(datetime)[:19] keeps 'YYYY-MM-DD HH:MM:SS' and drops
                # any fractional seconds.
                yield Request(
                    'http://pilkada.arah.com/api/article/8/' + str(next_date)[:19],
                    callback=self.parse)
        except Exception:
            # Pagination is best-effort: record the failure, do not crash.
            self.logger.exception('Failed to schedule next page')

    # Collect news item
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号