viva.py 文件源码

python
阅读 13 收藏 0 点赞 0 评论 0

项目:rojak 作者: pyk 项目源码 文件源码
def parse(self, response):
        """Scrape one index page and schedule the follow-up requests.

        For every ``ul.indexlist > li`` entry the article URL and its
        publication time are extracted. A ``Request`` handled by
        ``self.parse_news`` is yielded for each article published after
        ``self.media['last_scraped_at']``. Iteration stops at the first
        article that is not newer than that timestamp (this presumably relies
        on the index being sorted newest-first -- verify against the site).
        If no such article was hit, the pagination link whose text is
        ``NEXT`` is followed with this same callback.

        Args:
            response: the Scrapy response of the index page.

        Yields:
            ``Request`` objects for article pages and for the next index page.

        Raises:
            CloseSpider: when an expected element is missing from the page or
                the publication date does not have the expected format.
        """
        self.logger.info('parse: {}'.format(response))
        is_no_update = False

        # Collect list of news from current page
        article_selectors = response.css('ul.indexlist > li')
        if not article_selectors:
            raise CloseSpider('article_selectors not found')
        for article in article_selectors:
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # The date is read from the SECOND text node of div.upperdeck, so
            # a single text node is as unusable as none at all. Checking the
            # length keeps the failure a CloseSpider instead of an IndexError.
            info_texts = article.css('div.upperdeck::text').extract()
            if len(info_texts) < 2:
                raise CloseSpider('info_selectors not found')

            # The timestamp is the part after the first comma; without a
            # comma there is nothing to parse.
            info_parts = info_texts[1].split(',')
            if len(info_parts) < 2:
                raise CloseSpider(
                    'cannot_parse_date: no comma in {!r}'.format(info_texts[1]))
            # Example: 7 Oktober 2016 19:37
            info = info_parts[1].replace('\t', '').strip()
            # Example: 7 October 2016 19:37
            # NOTE(review): `_` is defined outside this block; judging by the
            # two examples it maps Indonesian month names to English -- confirm.
            info_time = ' '.join(_(s) for s in info.split(' '))

            # Parse date information
            try:
                published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
            except ValueError as err:
                raise CloseSpider('cannot_parse_date: {}'.format(err))
            # NOTE(review): wib_to_utc is defined elsewhere; presumably it
            # converts Western Indonesian Time to UTC -- confirm.
            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break
            # For each url we create new scrapy Request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # Collect news on next page
        tag_selectors = response.css('div.pagination > a')
        if not tag_selectors:
            raise CloseSpider('tag_selectors not found')
        for tag in tag_selectors:
            more_selectors = tag.css('a::text')
            if not more_selectors:
                raise CloseSpider('more_selectors not found')
            more = more_selectors.extract()[0]
            if more == 'NEXT':
                next_page = tag.css('a::attr(href)').extract()[0]
                # The href may be relative, so resolve it against the page URL.
                next_page_url = response.urljoin(next_page)
                yield Request(next_page_url, callback=self.parse)

    # Collect news item
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号