Example source code for the Python CloseSpider() class
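CloseSpider lives in scrapy.exceptions and is the standard way to stop a crawl from inside a spider callback: raising it with a reason string asks the engine to close the spider gracefully and records the reason in the finish_reason stat. A minimal sketch of the idea, with a placeholder URL and selector that are not taken from any of the projects below:

# Minimal sketch of raising CloseSpider from a callback.
# The start URL and CSS selector are illustrative placeholders.
import scrapy
from scrapy.exceptions import CloseSpider


class MinimalSpider(scrapy.Spider):
    name = 'minimal'
    start_urls = ['http://example.com/news']

    def parse(self, response):
        links = response.css('article a::attr(href)').extract()
        if not links:
            # Stop the whole crawl, not just this callback.
            raise CloseSpider('no articles found')
        for href in links:
            yield scrapy.Request(response.urljoin(href), callback=self.parse_news)

    def parse_news(self, response):
        yield {'url': response.url}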

travelweeklychina_spider.py (project: NewsScrapy, author: yinzishao)
def parse_news(self,response):
        # print response.url,"response"
        PageKey = response.meta.get("topic_id")
        PageNumber = response.meta.get("PageNumber")
        flag_id = str(int(PageKey) - 40037910)
        soup = BeautifulSoup(response.body, "lxml")
        #2016-07-13
        news_date = soup.find("time").text if soup.find("time") else None
        # print self.flag[flag_id],int(PageNumber)
        """
        ?????????self.flag[flag_id]??0??????????????
        ??????????????????????????????
        self.flag[flag_id]=????
        """
        if not self.flag[flag_id] or int(PageNumber)==self.flag[flag_id]:
            #???????


            struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
            # print self.end_now,struct_date,"time"
            delta = self.end_now-struct_date
            # print delta.days,"delta day ~~~~~~~~~~~~~~~~"
            if delta.days > self.end_day:
                self.flag[str(flag_id)]=int(PageNumber)
                # print flag_id,"stop ~~~~~~"
                # raise CloseSpider('today scrapy end')
            else:

                head = soup.find("div",class_="post-head")
                topic,title,abstract=None,None,None
                if head:
                    topic = head.find("span",class_="category").text if head.find("span",class_="category") else None
                    title =head.find("h1",class_="h1").text if head.find("h1",class_="h1") else None
                    abstract = head.find("span",class_="kicker").text if head.find("span",class_="kicker") else None
                content = soup.find("div",class_="post-body clearfix").text if soup.find("div",class_="post-body clearfix") else None
                news_no = response.url.split("/")[-1].split("?")[0]
                # TODO: part of the article body is rendered by JS and is not captured here
                item = NewsItem(title=title,topic=topic,
                                abstract=abstract,news_date=news_date,
                                content=content,news_no=news_no
                                ,crawl_date=NOW,news_url=response.url,catalogue='????')
                yield item
metrotvnews.py (project: rojak, author: pyk)
def parse(self, response):
        self.logger.info('parse: {}'.format(response))
        is_no_update = False

        # Collect list of news from current page
        articles_grid = response.css('li:not(.last) > div.grid')
        articles = zip(articles_grid, [NEWS_GRID] * len(articles_grid))
        articles += zip(response.css('div.topic'), [NEWS_HEADLINE])

        if not articles:
            raise CloseSpider('article not found')

        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = None
            if article[1] == NEWS_GRID:
                url_selectors = article[0].css('h2 > a::attr(href)')
            elif article[1] == NEWS_HEADLINE:
                url_selectors = article[0].css('h1 > a::attr(href)')

            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            self.logger.info('Url: {}'.format(url))

            # Example: Minggu, 09 Oct 2016 15:14
            info_selectors = article[0].css('div.reg::text')
            if not info_selectors:
                raise CloseSpider('info_selectors not found')
            info = info_selectors.extract()[1]
            # Example: 09 Oct 2016 15:14
            info_time = info.split(',')[1].strip()

            # Parse date information
            try:
                published_at_wib = datetime.strptime(info_time, '%d %b %Y %H:%M')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: {}'.format(e))

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break
            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # Collect news on next page
        if response.css('div.bu.fr > a'):
            next_page = response.css('div.bu.fr > a[rel="next"]::attr(href)').extract()[0]
            next_page_url = response.urljoin(next_page)
            yield Request(next_page_url, callback=self.parse)

    # Collect news item
viva.py (project: rojak, author: pyk)
def parse(self, response):
        self.logger.info('parse: {}'.format(response))
        is_no_update = False

        # Collect list of news from current page
        article_selectors = response.css('ul.indexlist > li')
        if not article_selectors:
            raise CloseSpider('article_selectors not found')
        for article in article_selectors:
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example: 7 Oktober 2016 19:37
            info_selectors = article.css('div.upperdeck::text')
            if not info_selectors:
                raise CloseSpider('info_selectors not found')
            info = info_selectors.extract()[1]
            info = info.split(',')[1].replace('\t','').strip()
            # Example: 7 October 2016 19:37
            info_time = info.split(' ')
            info_time = ' '.join([_(s) for s in info_time])

            # Parse date information
            try:
                published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
            except ValueError as err:
                raise CloseSpider('cannot_parse_date: {}'.format(err))
            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break
            # For each url we create new scrapy Request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # Collect news on next page
        tag_selectors = response.css('div.pagination > a')
        if not tag_selectors:
            raise CloseSpider('tag_selectors not found')
        for tag in tag_selectors:
            more_selectors = tag.css('a::text')
            if not more_selectors:
                raise CloseSpider('more_selectors not found')
            more = more_selectors.extract()[0]
            if more == 'NEXT':
                next_page = tag.css('a::attr(href)').extract()[0]
                next_page_url = response.urljoin(next_page)
                yield Request(next_page_url, callback=self.parse)

    # Collect news item
qureta.py (project: rojak, author: pyk)
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False

        # Get list of news from the current page
        articles = response.css('div.view-front > div.view-content > div.views-row')

        if not articles:
            raise CloseSpider('article not found')
        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = article.css('span.field-content a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example '19 Oct 2016'
            info_selectors = article.css('span.field-content::text')
            if not info_selectors:
                raise CloseSpider('info_selectors not found')
            info_time = info_selectors.extract()[1].strip()

            # Parse date information
            try:
                published_at_wib = datetime.strptime(info_time, '%d %b %Y')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request('http://www.qureta.com' + url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        if response.css('li.next'):
            next_page_url = response.css('li.next > a::attr(href)')[0].extract()
            yield Request('http://www.qureta.com' + next_page_url, callback=self.parse)

    # Collect news item
tirtoid.py (project: rojak, author: pyk)
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False

        # Get list of news from the current page
        articles = response.css('li.media')
        if not articles:
            raise CloseSpider('article not found')
        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example '02 November 2016'
            date_selectors = article.css('time::text')
            if not date_selectors:
                raise CloseSpider('date_selectors not found')

            # Parse date information
            try:
                date = date_selectors.extract()[0].split(' ')
                # Sanitize month - Indo month to Eng month
                # Example: 02 Nov 2016
                date[1] = sanitize(date[1])
                published_at_wib = datetime.strptime(' '.join(date),
                    '%d %b %Y')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request('http:' + url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # try getting next page
        try:
            next_page_url = response.xpath(
                    '//section[@class="pagination-numeric"]/span/a/@href')[-1].extract()

            if next_page_url and next_page_url != response.url:
                yield Request(next_page_url, callback=self.parse)
        except:
            pass

    # Collect news item
tempoco.py (project: rojak, author: pyk)
def parse_news_metro(self, response):
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        date_selector = response.css('.artikel > div.block-tanggal::text')
        if not date_selector:
            return self.parse_news_pilkada(loader, response)
        try:
            date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
            date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
            published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
        except Exception:
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        if (self.media['last_scraped_at'] >= published_at):
            is_no_update = True
            self.logger.info('Media have no update')
            raise CloseSpider('finished')
        loader.add_value('published_at', published_at)

        title_selector = response.css('.artikel > h1::text')
        if not title_selector:
            return loader.load_item()
        loader.add_value('title', title_selector.extract()[0])

        # Select all p which don't have iframe inside it
        raw_content_selector = response.xpath('//div[@class="artikel"]//p[not(iframe)]')
        if not raw_content_selector:
            return loader.load_item()
        raw_content = ''
        for rsl in raw_content_selector:
            raw_content = raw_content + rsl.extract().strip()

        # Go to next page while there is next page button
        next_page_selector = response.css('.pagination-nb').xpath('//a[text()="next"]/@href')
        if next_page_selector:
            return Request(next_page_selector.extract()[0], callback=lambda x, loader=loader, raw_content=raw_content: self.parse_next_page_metro(x, loader, raw_content))

        loader.add_value('raw_content', raw_content)

        # The author usually put inside <strong> tag, however, some news is not using <strong> tag.
        # NOTE: this block of code may need revision in the future
        author_name = ''
        for author_name_selector in reversed(raw_content_selector):
            author_name_selector = author_name_selector.css('strong::text')
            for tmp in reversed(author_name_selector.extract()):
                tmp = tmp.strip()
                if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
                    author_name = tmp
                    break
            if author_name:
                break
        author_name = ','.join(author_name.split(' | '))
        loader.add_value('author_name', author_name)
        return loader.load_item()
sindonews.py (project: rojak, author: pyk)
def parse(self, response):
        self.logger.info('parse: {}'.format(response))
        is_no_update = False

        for article in response.css('li > div.breaking-title'):
            # http://metro.sindonews.com/read/1146316/171/penyidik-bareskrim-mulai-dalami-video-dugaan-penistaan-agama-1476179831
            url_selectors = article.css('a::attr(href)')

            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example 'Kamis, 13 Oktober 2016 - 11:18 WIB'
            date_time_str_selectors = article.css('p::text')

            if not date_time_str_selectors:
                raise CloseSpider('date_time_str_selectors not found')

            date_time_str = date_time_str_selectors.extract()[0]

            # Parse date information
            # Example '13 Oktober 2016 - 11:18'
            date_time_str = date_time_str.split(',')[1].strip()[:-4]
            date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
            try:
                published_at_wib = datetime.strptime(date_time_str, '%d %B %Y - %H:%M')
            except Exception as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        for next_button in response.css('.mpaging > ul > li'):
            if len(next_button.css('a:not(.active) > .fa-angle-right')) > 0:
                next_page = next_button.css('a::attr(href)').extract()[0]
                next_page_url = response.urljoin(next_page)
                yield Request(next_page_url, callback=self.parse)
                break

    # Collect news item
hallojakarta.py (project: rojak, author: pyk)
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False

        # Get list of news from the current page
        articles = response.css('article')
        if not articles:
            raise CloseSpider('article not found')
        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example: 'Monday, 24/11/2016 | 13:54'
            date_selectors = article.css('time::text')
            if not date_selectors:
                raise CloseSpider('date_selectors not found')

            # Parse date information
            try:
                date = date_selectors.extract()[0].split(' ')
                published_at_wib = datetime.strptime(' '.join(date[1:]), '%d/%m/%Y | %H:%M')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # try getting next page
        try:
            next_page_url = response.css('div.pagination > a.next::attr(href)').extract()[0]

            if next_page_url:
                yield Request(next_page_url, callback=self.parse)
        except:
            pass

    # Collect news item
republikaonline.py (project: rojak, author: pyk)
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False

        # Get list of news from the current page
        articles = response.css('div.wp-terhangat > div.item3')

        if not articles:
            raise CloseSpider('article not found')
        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example 'Wednesday, 02 November 2016'
            date_selectors = article.css('span.date::text')
            if not date_selectors:
                raise CloseSpider('date_selectors not found')

            # Parse date information
            try:
                date = date_selectors.extract()[0].split(' ')
                # Sanitize month - Indo month to Eng month
                # Example: Wednesday, 02 Nov 2016
                date[2] = sanitize(date[2])
                published_at_wib = datetime.strptime(' '.join(date[1:]),
                    '%d %b %Y')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            # if it's news from before 2015, drop them
            if self.media['last_scraped_at'] >= published_at or int(date[-1]) < 2015:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # try getting next page
        try:
            next_page_url = response.css('nav > ul > li > a::attr(href)').extract()[-1]

            if next_page_url:
                yield Request(next_page_url, callback=self.parse)
        except:
            pass

    # Collect news item
kompas.py (project: rojak, author: pyk)
def parse(self, response):
        is_no_update = False

        news_selector = response.css("ul.clearfix > li > div.tleft")
        if not news_selector:
            raise CloseSpider('news_selectors not found')
        for news in news_selector:
            url_selectors = news.css("div.tleft > h3 > a::attr(href)")
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            # http://megapolitan.kompas.com/read/xml/2016/10/18/17244781/ini.alat.peraga.kampanye.yang.boleh.dibuat.cagub-cawagub.dki
            # http://api.kompas.com/external/?type=readdua&kanal=home&command=.xml.2016.10.15.07300081&format=json&APPSF0UNDRYBYPASS=%20HTTP/1.1
            url = url_selectors.extract()[0]
            url = 'http://api.kompas.com/external/?type=readdua&kanal=home&command=.xml.' + '.'.join(url.split('/')[-5:-1]) + '&format=json&APPSF0UNDRYBYPASS=%20HTTP/1.1'

            date_selectors = news.css("div.grey.small::text")
            if not date_selectors:
                raise CloseSpider('date_selectors not found')
            raw_date = date_selectors.extract()[0]

            # Parse date information
            try:
                published_at = self.convert_date(raw_date)
            except Exception as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request(url=url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # For kompas case, we don't rely on the pagination
        # Their pagination is max 17 pages, the truth is they have 25 pages
        if self.first_time:
            template_url = 'http://lipsus.kompas.com/topikpilihanlist/3754/{}/Pilkada.DKI.2017'
            for i in xrange(25):
                page = i + 1
                next_url = template_url.format(page)
                yield Request(next_url, callback=self.parse)
            self.first_time = False
liputan6.py (project: rojak, author: pyk)
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False

        # Get list of news from the current page
        articles = response.css('div.article-snippet__info')
        if not articles:
            raise CloseSpider('article not found')
        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            info_selectors = article.css('div.article-snippet__date')
            info_selectors = info_selectors.css('.timeago::text')
            if not info_selectors:
                raise CloseSpider('info_selectors not found')
            # Example '13 Okt 2016 16:10'
            info_time = info_selectors.extract()[0]
            # Example '13 Oct 2016 16:10'
            info_time = ' '.join([_(w) for w in info_time.split(' ')])

            # Parse date information
            try:
                published_at_wib = datetime.strptime(info_time,
                    '%d %b %Y %H:%M')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: {}'.format(e))

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy Request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

    # TODO: Collect news item
nusanews.py (project: rojak, author: pyk)
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False

        # Get list of news from the current page
        articles = response.css('article > div > div.post-content')

        if not articles:
            raise CloseSpider('article not found')
        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = article.css('a.timestamp-link::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example 'Sabtu, November 19, 2016'
            date_selectors = article.css('a.timestamp-link > abbr::text')
            if not date_selectors:
                raise CloseSpider('date_selectors not found')

            # Parse date information
            try:
                date = date_selectors.extract()[0].split(' ')
                # Sanitize month - Indo month to Eng month
                # Example: Nov 19 2016
                date[1] = sanitize(date[1])
                published_at_wib = datetime.strptime(' '.join(date[1:]),
                    '%b %d, %Y')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # try getting next page
        if len(articles) > 0:
            try:
                yield Request('http://www.nusanews.co/search/label/Pilkada?updated-max=' +
                        str(published_at_wib).replace(' ','T') + '%2B07:00&max-results=20', callback=self.parse)
            except Exception as e:
                pass

    # Collect news item
yfood.py (project: YelpCrawlSpider, author: yjp999)
def parseBegin(self, response):
        if response.status == 503:
            raise CloseSpider("denied by remote server")
        sel = Selector(response)
        appends = response.meta['appends']
        cityName = appends['city']
        category = appends['cat']

        locations = self.getLocations(response.body)

        if locations == []:
            # self.logger.error("location is []: %s\t%s", response.url, str(cityName))
            return


        div_a = sel.xpath('//li[@class="regular-search-result"]/div/div[@class="biz-listing-large"]')
        for ii, div in enumerate(div_a):
            # pdb.set_trace()
            main = div.xpath('./div[1]/div/div[2]/h3/span/a[@class="biz-name"]')
            item = FoodItem()
            url = main.xpath('./@href').extract()
            item['url'] = response.urljoin(url[0])
            item['name'] = main.xpath('./span/text()').extract()[0]
            # pdb.set_trace()
            second = div.xpath('./div[2]')
            address = second.xpath('./address').extract()
            region = second.xpath('./span[@class="neighborhood-str-list"]/text()').extract()
            if address:
                item['address'] = self.filtertags(address[0])
            else:
                item['address'] = ""
            if region:
                item['region'] = (region[0]).strip()
            else:
                item['region'] = ""
            item['city'] = cityName.strip()
            item['category'] = category
            item['location'] = eval(locations[ii])
            yield item

        time.sleep(1.0)
        nextPage = sel.xpath('//a[@class="u-decoration-none next pagination-links_anchor"]/@href').extract()
        if nextPage:
            nextLink = response.urljoin(nextPage[0])
            yield Request(url=nextLink, callback=self.parseBegin, meta={'appends':appends}, dont_filter=True)
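Taken together, the rojak spiders above share one defensive pattern: raise CloseSpider the moment an expected selector is missing, and stop paginating once an article older than the last scrape shows up. The condensed sketch below restates that pattern; the media record, the date format, and the wib_to_utc helper are stubbed-in assumptions rather than code from any single project.

# Condensed sketch of the shared pattern; `media` and `wib_to_utc`
# are stand-ins for the projects' own helpers, not their real code.
from datetime import datetime, timedelta

import scrapy
from scrapy import Request
from scrapy.exceptions import CloseSpider


def wib_to_utc(dt):
    # WIB (Waktu Indonesia Barat) is UTC+7.
    return dt - timedelta(hours=7)


class PatternSpider(scrapy.Spider):
    name = 'closespider_pattern'
    start_urls = ['http://example.com/index']
    media = {'last_scraped_at': datetime(2016, 10, 1)}  # assumed fixture

    def parse(self, response):
        articles = response.css('article')
        if not articles:
            raise CloseSpider('article not found')

        for article in articles:
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            date_selectors = article.css('time::text')
            if not date_selectors:
                raise CloseSpider('date_selectors not found')
            try:
                published_at_wib = datetime.strptime(
                    date_selectors.extract()[0].strip(), '%d %b %Y %H:%M')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: {}'.format(e))

            if self.media['last_scraped_at'] >= wib_to_utc(published_at_wib):
                # Nothing newer than the previous crawl; stop following pages.
                return
            yield Request(response.urljoin(url), callback=self.parse_news)

    def parse_news(self, response):
        yield {'url': response.url}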

