Example source code for the Python class CloseSpider()
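CloseSpider is the exception Scrapy provides for stopping a crawl from inside spider code: raising it from a spider callback asks the engine to shut the spider down gracefully, and the reason string passed to it ends up in the spider's close logs and stats. Before the project snippets below, a minimal sketch of the typical pattern (the spider name, selector and threshold are hypothetical):

# Minimal sketch; names and the stop threshold are made up for illustration.
import scrapy
from scrapy.exceptions import CloseSpider

class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["http://example.com/news"]
    max_empty_pages = 3   # hypothetical stop condition
    empty_pages = 0

    def parse(self, response):
        links = response.css("a.news::attr(href)").extract()
        if not links:
            self.empty_pages += 1
            if self.empty_pages >= self.max_empty_pages:
                # Gracefully stop the crawl; "no_more_results" becomes the close reason.
                raise CloseSpider("no_more_results")
        for link in links:
            yield scrapy.Request(response.urljoin(link), callback=self.parse)
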

Source file: luxe_spider.py, project: NewsScrapy, author: yinzishao
def parse_news(self,response):
        item = response.meta.get("item",None)
        # # Commented-out date check: close the spider once the news is older than the cut-off day
        # news_date = item.get("news_date",None)
        # if news_date:
        #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
        #     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        #
        #     delta = self.end_now-struct_date
        #     if delta.days == self.end_day:
        #         # pass
        #         raise CloseSpider('today scrapy end')
        soup = BeautifulSoup(response.body)
        news_content_group = soup.find("div",class_="entry-content group")
        # strip the related-posts block before extracting the text
        news_content_group.find("div",class_="related_posts").replace_with("")
        content = news_content_group.text.strip()
        item["content"] = content
        item["catalogue"] = u"????"
        yield item
Source file: qdaily_spider.py, project: NewsScrapy, author: yinzishao
def parse_article(self,response):
        #content,news_no,crawl_date
        item = response.meta.get("item",NewsItem())
        # news_date = item.get("news_date",None)
        # if news_date:
        #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
        #     delta = self.end_now-struct_date
        #     print delta.days
        #     if delta.days == self.end_day:
        #         raise CloseSpider('today scrapy end')
        soup = BeautifulSoup(response.body)
        author = soup.find("span",class_="name").text if soup.find("span",class_="name") else None
        abstract = soup.find("p",class_="excerpt").text if soup.find("p",class_="excerpt") else None
        content = soup.find("div",class_="detail").text if soup.find("div",class_="detail") else None
        news_no = response.url.split("/")[-1][:-5]
        item["author"] = author
        item["abstract"] = abstract
        item["content"] = content
        item["crawl_date"] = NOW
        item["news_no"] = news_no
        yield item
Source file: index_spider.py, project: wechat-crawler, author: DMGbupt
def parse_search(self, response):
        """
        @summary: ?????????????request???????
        @param response:start_requests()?????????????
        """
        # ???????????????????????"antispider"??
        # ????"antispider"???????????????????????????
        if "antispider" in response.url:
            spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
            time.sleep(43200)  # sleep for 12 hours before the spider is restarted
            raise CloseSpider('antispider')
        # ext is the href of the first matched account in the search results; it is used to build the JSON index URL.
        ext = response.xpath(
            '//div[@class="wx-rb bg-blue wx-rb_v1 _item"][1]/@href').extract()  # href of the first matched result
        if not ext:
            spider_logger.error("Faild searching {0} !".format(response.meta['query']))
            return
        # Build the URL of the JSON article index; each JSON page holds ten articles, starting from the first page (page=1).
        json_url = "".join(ext).replace('/gzh?','http://weixin.sogou.com/gzhjs?')+'&cb=sogou.weixin_gzhcb&page=1&gzhArtKeyWord='
        cookies = response.meta['cookies']
        yield Request(json_url, callback= self.parse_index, cookies=cookies, meta ={'cookies':cookies})
Source file: pipelines.py, project: scrapy_rss, author: woxcab
def spider_opened(self, spider):
        try:
            file = open(spider.settings.get('FEED_FILE'), 'wb')
        except TypeError:
            raise NotConfigured('FEED_FILE parameter is not a string or does not exist')
        except (IOError, OSError) as e:
            raise CloseSpider('Cannot open file {}: {}'.format(spider.settings.get('FEED_FILE', None), e))
        self.files[spider] = file
        feed_title = spider.settings.get('FEED_TITLE')
        if not feed_title:
            raise NotConfigured('FEED_TITLE parameter does not exist')
        feed_link = spider.settings.get('FEED_LINK')
        if not feed_link:
            raise NotConfigured('FEED_LINK parameter does not exist')
        feed_description = spider.settings.get('FEED_DESCRIPTION')
        if feed_description is None:
            raise NotConfigured('FEED_DESCRIPTION parameter does not exist')
        feed_exporter = spider.settings.get('FEED_EXPORTER', RssItemExporter)
        if isinstance(feed_exporter, six.string_types):
            feed_exporter = load_object(feed_exporter)
        if not issubclass(feed_exporter, RssItemExporter):
            raise TypeError("FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(feed_exporter))
        self.exporters[spider] = feed_exporter(file, feed_title, feed_link, feed_description)
        self.exporters[spider].start_exporting()
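
A side note on the example above: CloseSpider is documented for spider callbacks, so raising it from pipeline or signal-handler code does not always reach the engine the same way. When a pipeline needs to stop the crawl, a common alternative is to ask the engine directly via crawler.engine.close_spider(). A hedged sketch, assuming the pipeline is wired up through from_crawler (the class name and error threshold are made up):

# Sketch of stopping the crawl from a pipeline; names and threshold are hypothetical.
class StopOnErrorsPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.crawler = crawler
        self.error_count = 0

    def process_item(self, item, spider):
        if item.get('status') not in (None, 200):
            self.error_count += 1
            if self.error_count > 10:  # hypothetical limit
                # Ask the engine for a graceful shutdown; works outside spider callbacks.
                self.crawler.engine.close_spider(spider, 'too_many_errors')
        return item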
Source file: yfood.py, project: YelpCrawlSpider, author: yjp999
def parse(self, response):
        if response.status ==503:
            raise CloseSpider("denied by remote server")
        sel = Selector(response)
        appends = response.meta['appends']
        cityname = appends['city']
        smexp = appends['cat']
        xpath_exp = '//a[text()="Search for more '+smexp+'"]/@href'
        if cityname=='香港':  # Hong Kong
            moreLink = ['http://www.yelp.com/search?cflt='+self.cat+'&find_loc=Hong+Kong', 'http://www.yelp.com/search?cflt='+self.cat+'&find_loc=???%2C+Hong+Kong']
        elif cityname=='Adelaide':
            moreLink = ['http://www.yelp.com/search?cflt='+self.cat+'&find_loc=Adelaide%2C+Adelaide+South+Australia%2C+Australia', 'http://www.yelp.com/search?cflt='+self.cat+'&find_loc=Adelaide+South+Australia+5000']
        elif cityname=='Park La Brea':
            moreLink = ['http://www.yelp.com/search?cflt='+self.cat+'&find_loc=South+La+Brea+Avenue%2C+Los+Angeles%2C+CA+90056', 'http://www.yelp.com/search?cflt='+self.cat+'&find_loc=Mid-Wilshire%2C+Los+Angeles%2C+CA', 'http://www.yelp.com/search?cflt='+self.cat+'&find_loc=North+La+Brea+Avenue%2C+Los+Angeles%2C+CA']
        else:
            searchmore = sel.xpath(xpath_exp).extract()[0]
            moreLink = [response.urljoin(searchmore)]

        for link in moreLink:
            yield Request(url=link, callback=self.parseBegin, meta={'appends': appends}, dont_filter=True)
Source file: middlewares.py, project: scrapy-rotating-proxies, author: TeamHG-Memex
def process_request(self, request, spider):
        if 'proxy' in request.meta and not request.meta.get('_rotating_proxy'):
            return
        proxy = self.proxies.get_random()
        if not proxy:
            if self.stop_if_no_proxies:
                raise CloseSpider("no_proxies")
            else:
                logger.warn("No proxies available; marking all proxies "
                            "as unchecked")
                self.proxies.reset()
                proxy = self.proxies.get_random()
                if proxy is None:
                    logger.error("No proxies available even after a reset.")
                    raise CloseSpider("no_proxies_after_reset")

        request.meta['proxy'] = proxy
        request.meta['download_slot'] = self.get_proxy_slot(proxy)
        request.meta['_rotating_proxy'] = True
Source file: meadin_spider.py, project: NewsScrapy, author: yinzishao
def parse_news(self,response):
        #content,news_date,news_no,crawl_date,referer_web
        item = response.meta.get("item",NewsItem())
        pageindex = response.meta.get("pageindex",1)
        soup = BeautifulSoup(response.body)
        # news_date = item.get("news_date",None)
        # extract the news date from the page
        news_date = soup.find("span",class_="arial").text if soup.find("span",class_="arial") else None
        #http://info.meadin.com/PictureNews/2938_1.shtml Exception
        if news_date:

            # struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
            # delta = self.end_now-struct_date
            # if delta.days == self.end_day:
            #     raise CloseSpider('today scrapy end')
            referer_web = list(soup.find("p",class_="source").strings)[-1] if soup.find("p",class_="source") else None
            # article body
            art, content = None, None
            art = soup.find("div",class_="article js-article")
            if art:
                # remove the intro block before extracting the text
                art.find("div",class_="intro").replace_with("")
                content = art.text.strip()
            news_no = response.url.split("/")[-1].split("_")[0]
            item["news_date"] = news_date
            item["content"] = content
            item["referer_web"] = referer_web
            item["crawl_date"] = NOW
            item["news_no"] = news_no
            item = judge_news_crawl(item)
            if item:
                yield item
            else:
                self.flag = pageindex
        else:
            logger.warning("can't find news_date.the url is %s" % response.url)
Source file: thepaper_spider.py, project: NewsScrapy, author: yinzishao
def parse(self, response):
        # parse the front page
        html = response.body
        soup = BeautifulSoup(html,"lxml")
        # yield a request for each news entry on the page
        for i in self.fetch_newslist(soup):
            # raise CloseSpider(str(i['time'] == u"???"))
            # if i['time'] == "???": raise CloseSpider("today news end")
            request = scrapy.Request(i['news_url'],callback=self.parse_news)
            request.meta['item'] = i
            request.meta['pageindex'] = 1
            yield request

        # find the lasttime attribute used for loading the next batch of news
        lasttime = "nothing"
        for i in soup.select('div[class="news_li"]'):
            if "lasttime" in i.attrs:
                lasttime = i["lasttime"]
                break
        # extract the query string used by the "load more" url
        # e.g. load_chosen.jsp?nodeids=25949&topCids=1495258,1494171,1495064,1495130,1495285,&pageidx=
        load_chosen = re.search(r'data.:."(.*)".+.masonry',html)
        page = 2
        if load_chosen :
            tp_url = "http://www.thepaper.cn/load_chosen.jsp?%s%s&lastTime=%s" % (load_chosen.group(1),page,lasttime)
            yield scrapy.Request(tp_url, callback=self.next_page_parse)
Source file: proc.py, project: EasyGoSpider, author: Karmenzind
def start_requests(self):
        # while len(self.finished) < len(self.all_urls):
        current_hour = time.strftime("%Y%m%d%H", time.localtime())
        if current_hour != START_HOUR:
            self.logger.info("It's already %s. Stopping..." % current_hour)
            return
        for url, item_idx in self.all_urls.iteritems():
            if not self.cookies:
                raise CloseSpider("No enough cookies.")
            if item_idx in self.finished:
                continue
            else:
                yield Request(url, callback=self.parse_item)
                # self.logger.info(u'Crawled %s / %s. Done :)' % (len(self.finished), len(self.all_urls)))
Source file: check_antispider.py, project: wechat-crawler, author: DMGbupt
def process_response(request, response, spider):
        if "antispider" in response.url:
            spider_logger.error("recieve verification code in %s" % response.url) 
            raise CloseSpider('antispider')
        return response
Source file: index_spider.py, project: wechat-crawler, author: DMGbupt
def __init__(self,
                 query=None,
                 start_time=None,
                 end_time=None,
                 index_pages=None):
        """
        @summary: ?????????, ?????????????
        @param query: ???,???????
        @param start_time: ????????start_time???????????????????????
        @param end_time: ????????end_time?????
        @param index_pages: ?????????????
        """
        # A single search word is required; close the spider if it is missing.
        if query:
            self.query = query  # self.query is the keyword used for the search
        else:
            # no keyword supplied: log the error and stop the spider
            spider_logger.error("Spider needs a single search word each time! Check input!")
            raise CloseSpider('invalid search word')
        # earliest publish time to crawl; defaults to 100 days ago
        if start_time:
            self.from_time = start_time
        else:
            self.from_time = datetime.now()-timedelta(days=100)  # default: 100 days ago
        # latest publish time to crawl
        if end_time:
            self.end_time = end_time
        else:
            self.end_time = datetime.now()  # default: now
        # maximum number of index pages to crawl
        if index_pages:
            self.index_pages = int(index_pages)
        else:
            self.index_pages = 10  # default: 10 pages
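
These constructor arguments are normally supplied on the command line with scrapy crawl <spider_name> -a query=... -a index_pages=5, or programmatically when driving the crawl from a script. A hedged sketch of the latter; the spider class name and import path are assumptions:

# Sketch of running the spider from a script and passing the __init__ arguments.
from scrapy.crawler import CrawlerProcess
from wechat_crawler.spiders.index_spider import IndexSpider  # hypothetical import path

process = CrawlerProcess()
# Keyword arguments given to crawl() are forwarded to the spider's __init__.
process.crawl(IndexSpider, query='example keyword', index_pages=5)
process.start()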
Source file: index_spider.py, project: wechat-crawler, author: DMGbupt
def parse_index(self, response):
        """
        @summary: ?????????????????Request??
        @param response: parse_search()?????????????
        @return: list????????????url???????????
        """
        if "antispider" in response.url:
            spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
            time.sleep(43200)
            raise CloseSpider('antispider')
        requests = []
        page_list = self._get_result(response)
        # nothing found on this index page: stop here
        if not page_list:
            return requests
        next_page = True  # whether the next index page should be requested
        # walk through the parsed results
        for item in page_list:
            if isinstance(item, Request):  # the parser already returned a Request
                requests.append(item)
                next_page = False
                break
            if item['publish_time'] <= self.from_time:  # published before self.from_time: stop paging
                next_page = False
                break
            elif item['publish_time'] > self.end_time:  # published after self.end_time: skip it
                continue
            else:
                req = Request(item['url'], self.parse_page)
                # attach the parsed item to the request
                req.meta["item"] = item
                requests.append(req)
        # if more results are needed, request the next index page; otherwise stop
        if next_page and self._next_result_page(response):
            cookies = response.meta['cookies']
            requests.append(Request(self._next_result_page(response),callback=self.parse_index,cookies=cookies, meta ={'cookies':cookies}))
        return requests
Source file: index_spider.py, project: wechat-crawler, author: DMGbupt
def parse_page(self, response):
        """
        @summary: ??????
        @param response: parse_index()?????????????
        @return: ?????_finish_item()??????
        """
        if "antispider" in response.url:
            spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
            time.sleep(43200)
            raise CloseSpider('antispider')
        item = response.meta["item"]
        return self._finish_item(item, response)
Source file: pipelines.py, project: finance_news_analysis, author: pskun
def process_eastmoney_gubalist_item(self, item, spider):
        status = item.get('status')
        if status is not None and status != 200:
            self.error_count += 1
            if self.error_count * 5 > self.success_count:
                raise CloseSpider(
                    'too many error occurred, shutdown gracefully.')
            return item

        if 'ticker_id' not in item or item['ticker_id'] == "":
            raise DropItem('missing ticker_id')
        self.write_to_file(item, spider.name)
        return item
Source file: test_exporter.py, project: scrapy_rss, author: woxcab
def test_empty_feed(self):
        for partial_settings in itertools.chain.from_iterable(
                itertools.combinations(self.feed_settings.items(), r)
                for r in range(1, len(self.feed_settings))):
            partial_settings = dict(partial_settings)
            undefined_settings = [name.upper() for name in set(self.feed_settings) - set(partial_settings)]
            with self.assertRaisesRegexp(NotConfigured,
                                         '({})'.format('|'.join(undefined_settings))
                                            if len(undefined_settings) > 1 else undefined_settings[0],
                                         msg='The feed file, title, link and description must be specified, but the absence of {} is allowed'
                                             .format(undefined_settings)):
                with CrawlerContext(**partial_settings):
                    pass

        with self.assertRaises(CloseSpider):
            feed_settings = dict(self.feed_settings)
            feed_settings['feed_file'] = 'non/existent/filepath'
            with CrawlerContext(**feed_settings):
                pass

        with CrawlerContext(**self.feed_settings):
            pass

        with open(self.feed_settings['feed_file']) as data, \
             open(os.path.join(os.path.dirname(__file__), 'expected_rss', 'empty_feed.rss')) as expected:
            self.assertUnorderedXmlEquivalentOutputs(data.read(), expected.read())
Source file: all.py, project: Spider, author: poluo
def close_spider(self, reason):
        raise CloseSpider(reason=reason)

    # do something before spider close
Source file: arah.py, project: rojak, author: pyk
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False
        published_at_wib = ''

        try:
            # Get list of news from the current page
            articles = json.loads(response.text)

            for article in articles['contents']:
                url = article['friendlyURL']
                date = article['publishTime']
                published_at_wib = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                published_at = wib_to_utc(published_at_wib)

                if self.media['last_scraped_at'] >= published_at:
                    is_no_update = True
                    break

                yield Request('http://pilkada.arah.com' + url, callback=self.parse_news)
        except:
            raise CloseSpider('article not found')

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # Get more
        try:
            next_date = published_at_wib - timedelta(seconds=1)

            if self.media['last_scraped_at'] < wib_to_utc(next_date):
                yield Request('http://pilkada.arah.com/api/article/8/' + str(next_date)[:19],
                        callback=self.parse)
        except:
            pass

    # Collect news item
Source file: jawapos.py, project: rojak, author: pyk
def parse(self, response):
        self.logger.info('parse: %s' % response)
        has_no_update = False

        # Get list of news from the current page
        for article in response.css('.col-sm-16 > .row > .col-sm-16 > .row'):
            title = article.css('h4::text').extract_first()
            url = article.css('a::attr(href)').extract_first()            
            time = article.css('.indexTime::text').extract_first() # 16:51

            date = article.css('.indexDay::text').extract_first() # Sabtu, 15 Oktober 2016
            date = date.split(',')[-1].strip() # 15 Oktober 2016

            date_time = date + ' ' + time # 15 Oktober 2016 16:51
            date_time = date_time.split(' ')
            date_time = ' '.join([_(s) for s in date_time]) # Oktober => October

            # Parse date information
            try:
                published_at_wib = datetime.strptime(date_time, '%d %B %Y %H:%M')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                has_no_update = True
                break

            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        if has_no_update:
            self.logger.info('Media have no update')
            return

        # Currently has no more pages
Source file: tempoco.py, project: rojak, author: pyk
def parse_news_pilkada(self, loader, response):
        date_selector = response.css('.block-judul-artikel > .tanggal::text')
        try:
            date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
            date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
            published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
        except Exception:
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)

        if (self.media['last_scraped_at'] >= published_at):
            is_no_update = True
            self.logger.info('Media have no update')
            raise CloseSpider('finished')
        loader.add_value('published_at', published_at)

        title_selector = response.css('.block-judul-artikel > .judul-artikel')
        loader.add_value('title', title_selector.extract()[0])

        raw_content_selector = response.css('.block-artikel .p-artikel')
        raw_content_selector = raw_content_selector.xpath('//p[not(iframe)]')
        raw_content = ''
        for rsl in raw_content_selector:
            raw_content = raw_content + rsl.extract().strip()
        loader.add_value('raw_content', raw_content)

        author_name = ''
        for author_name_selector in reversed(raw_content_selector):
            author_name_selector = author_name_selector.css('strong::text')
            for tmp in reversed(author_name_selector.extract()):
                tmp = tmp.strip()
                if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
                    author_name = tmp
                    break
            if author_name:
                break
        author_name = ','.join(author_name.split(' | '))
        loader.add_value('author_name', author_name)
        loader.add_value('url', response.url)
Source file: cnnindonesia.py, project: rojak, author: pyk
def parse(self, response):
        self.logger.info('parse: {}'.format(response))
        is_no_update = False

        # Collect list of news from current page
        # Note: no next page button on cnnindonesia, all is loaded here
        article_selectors = response.css('a.list_kontribusi')
        if not article_selectors:
            raise CloseSpider('article_selectors not found')

        for article in article_selectors:
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example: Jumat, 23/09/2016 21:17
            info_selectors = article.css('div.text > div > span.tanggal::text')
            if not info_selectors:
                raise CloseSpider('info_selectors not found')
            info = info_selectors.extract()[0]
            info_time = info.split(',')[1].strip()

            # Parse date information
            try:
                # Example: 23/09/2016 21:17
                published_at_wib = datetime.strptime(info_time, '%d/%m/%Y %H:%M')
            except ValueError as err:
                raise CloseSpider('cannot_parse_date: {}'.format(err))
            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break
            # For each url we create new scrapy Request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return
Source file: merdekacom.py, project: rojak, author: pyk
def parse(self, response):
        self.logger.info('parse: {}'.format(response))
        is_no_update = False

        # Collect list of news from current page
        articles = json.loads(response.body)['response']
        for article in articles:
            # Example: 2016-10-12 15:16:04
            date_time_str = article['news_date_publish']

            # Parse date information
            try:
                published_at_wib = datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
            except Exception as e:
                raise CloseSpider('cannot_parse_date: {}'.format(e))
            published_at = wib_to_utc(published_at_wib)

            if (self.media['last_scraped_at'] >= published_at):
                is_no_update = True
                break

            for sub_article in article['news_content']:
                yield self.parse_news(article, sub_article)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # Collect news on next page
        if len(articles) > 0:
            # Example: 'http://api.merdeka.com/mobile/gettag/pilgub-dki/0/20/L9pTAoWB269T&-E/'
            next_page_url = response.url.split('/')
            next_page_url[-4] = str(int(next_page_url[-4]) + 20)
            next_page_url = '/'.join(next_page_url)
            yield Request(next_page_url, callback=self.parse)

    # Collect news item
Source file: base_spider.py, project: frontoxy, author: fabienvauchelles
def check_error(self):
        # Stop spider if error has been raised in pipeline
        if hasattr(self, 'close_error'):
            raise CloseSpider(self.close_error)
Source file: feedback.py, project: aliexpress, author: yangxue088
def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and FeedbackSpider.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['productId'][0])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')
Source file: store_feedback.py, project: aliexpress, author: yangxue088
def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and self.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['storeId'][0])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')
Source file: product.py, project: aliexpress, author: yangxue088
def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and self.ids.add(url[url.rfind('/') + 1:url.rfind('.')])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')
Source file: store.py, project: aliexpress, author: yangxue088
def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and self.ids.add(url[url.rfind('/') + 1:])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')
Source file: order.py, project: aliexpress, author: yangxue088
def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and OrderSpider.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['productId'][0])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')
Source file: eol.py, project: gaokao, author: EasyData
def parse(self, response):

        data = json.loads(response.body)
        total = int(data['totalRecord']['num'])
        total_page = int(math.ceil(total/float(self.page_size)))

        if total == 0:
            raise CloseSpider('blocked')

        for i in self.parse_item(response):
            yield i

        for page in range(2, total_page+1):
            yield Request(url=self.get_url(page), callback=self.parse_item)
Source file: scraping.py, project: ws-backend-community, author: lavalamp-
def __check_for_close(self):
        """
        Check to see if this spider has been running for longer than the maximum amount
        of allowed time, and stop the spider if it has.
        :return: None
        """
        if self._start_time is None:
            self._start_time = DatetimeHelper.now()
        elapsed_time = (DatetimeHelper.now() - self._start_time).total_seconds()
        if elapsed_time > self.max_run_time:
            raise CloseSpider(
                "Spider run time exceeded maximum time of %s seconds. Closing."
                % (self.max_run_time,)
            )
Source file: database.py, project: livetv_mining, author: taogeT
def open_spider(self, spider):
        site_setting = spider.settings.get('SITE')
        if not site_setting:
            error_msg = 'Can not find the website configuration from settings.'
            spider.logger.error(error_msg)
            raise CloseSpider(error_msg)
        self.session = self.session_maker()
        site = self.session.query(LiveTVSite).filter(LiveTVSite.code == site_setting['code']).one_or_none()
        if not site:
            site = LiveTVSite(code=site_setting['code'], name=site_setting['name'],
                              description=site_setting['description'], url=site_setting['url'],
                              image=site_setting['image'], show_seq=site_setting['show_seq'])
            self.session.add(site)
            self.session.commit()
        self.site[site.code] = {'id': site.id, 'starttime': datetime.utcnow(), 'channels': {}}

