Python spider_closed() — example source code
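Every snippet on this page hooks the same Scrapy signal: spider_closed fires once a spider finishes, and a handler is connected either through crawler.signals.connect() inside from_crawler() or through pydispatch's dispatcher.connect() inside __init__(). As a reference point, here is a minimal, self-contained sketch of the from_crawler pattern; the class name and log message are illustrative and not taken from any of the projects below.

from scrapy import signals


class CloseLogExtension(object):
    """Toy extension that only reports when the spider closes."""

    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler)
        # spider_closed handlers receive the spider and the close reason
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_closed(self, spider, reason):
        spider.logger.info('spider %s closed (%s)', spider.name, reason)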

scrapyjiji.py — project: scrapyjiji, author: sbourdelin
def spider_closed(self, spider):
        """Handle the spider_closed event to save the map"""

        # create the special marker for all the ads without geocode
        print "found %d items without geocode" % (len(self.no_geocode))
        if len(self.no_geocode) > 0:
            html = ""
            for x in self.no_geocode:
                html += "<a href=%s target=_blank>%s</a> : %s<br>" % (x["url"], x["title"], x["price"])
            iframe  = folium.element.IFrame(html=html, width=500, height=100)
            popup   = folium.Popup(iframe, max_width=500)
            folium.Marker(MAP_LATLNG,
                          popup=popup,
                          icon=folium.Icon()).add_to(self.m_map)

        print "found %d new items" % (self.new_items)
        pickle.dump(self.m_list, open(DATABASE, 'wb'))
        self.m_map.save('map.html')
proxy_spider.py — project: ip_proxy_pool, author: leeyis
def __init__(self,rule):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.rule = rule
        self.name = rule.name
        self.allowed_domains = rule.allowed_domains.split(',')
        self.start_urls = rule.start_urls.split(',')
        rule_list = []

        # add a rule for the `next page` link when one is configured
        if len(rule.next_page):
            rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page), follow=True))

        rule_list.append(Rule(LinkExtractor(
            allow=rule.allow_url.split(','),
            unique=True),
            follow=True,
            callback='parse_item'))

        self.rules = tuple(rule_list)
        super(ProxySpiderSpider, self).__init__()
pipelines.py — project: feeds, author: nblock
def from_crawler(cls, crawler):
        try:
            output_path = (
                crawler.settings.get('FEEDS_CONFIG')['feeds']['output_path']
            )
        except (KeyError, TypeError):
            output_path = 'output'
        try:
            output_url = (
                crawler.settings.get('FEEDS_CONFIG')['feeds']['output_url']
            )
        except (KeyError, TypeError):
            output_url = None
        pipeline = cls(output_path=output_path, output_url=output_url)
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
pipelines.py — project: Spider, author: poluo
def process_item(self, item, spider):
        if isinstance(item, AirbnbItem):
            self.room_count += 1
            if self.room_count > 100000:
                self.room_count = 0
                self.room_file_count += 1
                self.spider_closed(spider, mode=1)
                self.spider_opened(spider, mode=1)
            self.exporter_room.export_item(item)
        elif isinstance(item, UserItem):
            self.user_count += 1
            if self.user_count > 100000:
                self.user_count = 0
                self.user_file_count += 1
                self.spider_closed(spider, mode=2)
                self.spider_opened(spider, mode=2)
            self.exporter_user.export_item(item)
        else:
            logger.info('Some error happened!')
jenkins_spider.py — project: Charlie, author: nxintech
def run_spider():
    settings = Settings()
    settings.set('ITEM_PIPELINES', {
        '__main__.JsonWriterPipeline': 100
    })

    # enable remote sever certificate verification
    # see http://doc.scrapy.org/en/latest/topics/settings.html#downloader-clientcontextfactory
    settings.set('DOWNLOADER_CLIENTCONTEXTFACTORY',
                 'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory'
                 )

    # uncomment below line to enable the logging for debug
    # configure_logging()

    crawler = Crawler(JenkinsJobSpider, settings)
    crawler.signals.connect(callback, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()
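The callback connected above is not part of this excerpt. A minimal sketch of a closed-signal handler that stops the Twisted reactor once the spider finishes (the name and body are assumptions, not the project's code):

from twisted.internet import reactor


def callback(spider, reason):
    # stop the reactor so run_spider() returns after the crawl finishes
    print("spider %s closed: %s" % (spider.name, reason))
    reactor.stop()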
run_all_spiders.py — project: alltheplaces, author: alltheplaces
def spider_closed(spider):
        spider_stats[spider.name] = {
            'finish_reason': spider.crawler.stats.get_value('finish_reason'),
            'duration': (
                spider.crawler.stats.get_value('finish_time') -
                spider.crawler.stats.get_value('start_time')).total_seconds(),
            'item_scraped_count':
                spider.crawler.stats.get_value('item_scraped_count'),
        }

        print("Spider %s closed (%s) after %0.1f sec, %d items" % (
            spider.name,
            spider.crawler.stats.get_value('finish_reason'),
            (spider.crawler.stats.get_value('finish_time') -
                spider.crawler.stats.get_value('start_time')).total_seconds(),
            spider.crawler.stats.get_value('item_scraped_count') or 0,
        ))
pipelines.py — project: scrapy_site, author: hl10502
def __init__(self):
        dispatcher.connect(self.spider_opended, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        dispatcher.connect(self.engine_stopped, signals.engine_stopped)
        dispatcher.connect(self.engine_started, signals.engine_started)

        # get the current working directory (expected to be the scrapy_site project root)
        self.curpath = os.getcwd()
        # directory that holds the per-spider message files
        self.spidername_filepath = self.curpath + "/scrapy_site/msg/"

        # load the keyword configuration from keyword.conf
        self.keywordsDict = dict()
        self.getKeywords()

        # load the website name mapping
        self.webnamesDict = dict()
        self.getWebnames()

        # collected messages
        self.msgDict = dict()

        SavePipeline.initCount = SavePipeline.initCount + 1
jobbole.py — project: ArticleSpider, author: mtianyan
def __init__(self):
        self.fail_urls=[]
        dispatcher.connect(self.handle_spider_cosed, signals.spider_closed)
downloadhandlers.py — project: NetEaseMusicCrawler, author: yaochao
def __init__(self, settings):
        self.options = settings.get('PHANTOMJS_OPTIONS', {})  # PhantomJS command-line options
        max_run = settings.get('PHANTOMJS_MAXRUN', 10)  # maximum number of concurrent PhantomJS instances, default 10
        self.sem = defer.DeferredSemaphore(max_run)
        self.queue = Queue.LifoQueue(maxsize=max_run)  # LifoQueue: last in, first out
        SignalManager(dispatcher.Any).connect(receiver=self._close, signal=signals.spider_closed)
initiatives.py — project: tipi-engine, author: CIECODE-Madrid
def __init__(self,*a, **kw):
        super(StackSpider,self).__init__(*a, **kw)
        self.time = datetime.datetime.now()
        self.congress = Congress()
        self.members = self.congress.searchAll("diputados")
        self.groups = self.congress.searchAll("grupos")
        dispatcher.connect(self.whenFinish, signals.spider_closed)
extension.py — project: tipi-engine, author: CIECODE-Madrid
def __init__(self, crawler):
        self.crawler = crawler
        self.initiatives = 0
        self.amendments = 0
        self.finishtext = 0
        self.responses = 0
        self.members = 0
        # connect the extension object to signals
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
extension.py — project: tipi-engine, author: CIECODE-Madrid
def spider_closed(self, spider):
        self.crawler.stats.set_value('item/initiatives', self.initiatives)
        self.crawler.stats.set_value('item/amendments', self.amendments)
        self.crawler.stats.set_value('item/finishtext', self.finishtext)
        self.crawler.stats.set_value('item/responses', self.responses)
        self.crawler.stats.set_value('item/members', self.members)
pipelines.py — project: job_scraper, author: wlabatey
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
pipelines.py — project: job_scraper, author: wlabatey
def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
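The spider_opened counterpart is not shown in this project excerpt. A minimal self-contained sketch of a pipeline that pairs such a spider_opened with the spider_closed above (class name, file naming, and exporter choice are assumptions, not the project's code):

from scrapy.exporters import CsvItemExporter


class PerSpiderCsvPipeline(object):
    def __init__(self):
        self.files = {}

    def spider_opened(self, spider):
        # open one output file per spider and start exporting into it
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item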
scrapyjiji.py — project: scrapyjiji, author: sbourdelin
def __init__(self, *a, **kw):
        """Attach a callback to the spider_closed signal"""
        super(Kijiji, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        if USE_DB is True:
            self.open_database()
            if DRAW_ALL_DB is True and DRAW_NEW_AD_ONLY is False:
                # add already know marker
                for x in self.m_list:
                    self.add_marker(x, False)
tianqi.py — project: weather, author: awolfly9
def __init__(self, *a, **kw):
        super(TianqiSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

        self.sql = SqlHelper()
        self.weather_table_name = config.weather_table
        self.citys = []

        self.init()
tianqi.py — project: weather, author: awolfly9
def spider_closed(self, spider):
        self.log('spider_closed: crawl finished')
pipelines.py — project: ssp-transparencia, author: eltermann
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
pipelines.py — project: ssp-transparencia, author: eltermann
def spider_closed(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for file in self.files:
            file.close()
stats.py — project: BlogSpider, author: hack4code
def from_crawler(cls, crawler):
        ext = cls(crawler.stats)
        crawler.signals.connect(ext.spider_opened,
                                signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed,
                                signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped,
                                signal=signals.item_scraped)

        return ext
stats.py — project: BlogSpider, author: hack4code
def spider_closed(self, spider):
        value = self.stats.get_value('item_scraped_count',
                                     0)
        save_stats(spider.settings['SPIDER_STATS_URL'],
                   spider._id,
                   value)
        if spider.settings['BOT_NAME'] != 'TestSpider':
            logger.info('spider[%s] crawled %d articles',
                        spider.name,
                        value)
            if value == 0:
                update_spider_stats(spider,
                                    {'fail': 1})
proxy_spider.py — project: ip_proxy_pool, author: leeyis
def spider_closed(self, spider):
        print "spider is closed!"
        session = loadSession()
        log = session.query(SpiderCrawlLog).filter(
            SpiderCrawlLog.spiderID == self.rule.id,
            SpiderCrawlLog.endTime.is_(None)  # multiple filter criteria are ANDed; plain `and` / `is None` would not build the intended SQL
        ).first()
        log.endTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        session.commit()

        pass
pipelines.py — project: finance_news_analysis, author: pskun
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
pipelines.py — project: finance_news_analysis, author: pskun
def spider_closed(self, spider):
        file = self.files.pop(spider.name)
        file.close()
        pass
pipelines.py — project: alsam_mi_ki, author: mersanuzun
def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
pipelines.py — project: alsam_mi_ki, author: mersanuzun
def spider_closed(self, spider):
        # drop the last two bytes (presumably the trailing ",\n" after the final item) and terminate the JSON array
        self.file.seek(-2, os.SEEK_END)
        self.file.truncate()
        self.file.write(']')
        self.file.close()
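The write half of this pipeline is not included above. A minimal sketch of the spider_opened/process_item pair that would leave the trailing ",\n" for the seek(-2)/truncate() call to remove (class and file names are illustrative, not the project's code):

import json


class JsonArrayPipeline(object):
    def spider_opened(self, spider):
        # start the JSON array; items are appended one per line
        self.file = open('items.json', 'w')
        self.file.write('[')

    def process_item(self, item, spider):
        # each item ends with ",\n"; spider_closed trims those two
        # characters before appending the closing "]"
        self.file.write(json.dumps(dict(item)) + ",\n")
        return item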
middlewares.py — project: scrapy-training, author: scrapinghub
def from_crawler(cls, crawler):
        m = cls()
        if not crawler.settings.getbool('SELENIUM_ENABLED'):
            raise NotConfigured()
        crawler.signals.connect(m.spider_closed, signal=signals.spider_closed)
        return m
middlewares.py — project: scrapy-training, author: scrapinghub
def spider_closed(self, spider):
        self.driver.close()
pipelines.py — project: feeds, author: nblock
def spider_closed(self, spider):
        # Add feed header(s) at the end so they can be dynamic.
        for feed_header in iterate_spider_output(spider.feed_headers()):
            self._exporters[spider].export_item(feed_header)
        self._exporters[spider].finish_exporting()
        self._exporters.pop(spider)
exporter_json_lines.py — project: scrapy_project, author: zhanghe06
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

