Example source code for the Python settings() class
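The snippets below all revolve around Scrapy's scrapy.settings.Settings object. As a quick reference, here is a minimal sketch, not taken from any of the projects below (the key names and values are illustrative), of the typed getter/setter API the examples rely on:

from scrapy.settings import Settings

settings = Settings()
settings.set("USER_AGENT", "example-agent/1.0")
settings.set("CONCURRENT_REQUESTS", 8, priority="cmdline")   # higher-priority values win
settings.update({"LOG_ENABLED": False, "DEPTH_LIMIT": 3})

settings.get("USER_AGENT")              # 'example-agent/1.0'
settings.getbool("LOG_ENABLED")         # False (also coerces strings like 'False' or '0')
settings.getint("CONCURRENT_REQUESTS")  # 8
settings.getdict("ITEM_PIPELINES")      # {} when the setting is unset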

spiders.py (project: undercrawler, author: TeamHG-Memex)
def handle_form(self, url, form, meta):
        action = canonicalize_url(urljoin(url, form.action))
        if not self.link_extractor.matches(action):
            return
        if (meta['form'] == 'search' and
                self.settings.getbool('CRAZY_SEARCH_ENABLED') and
                action not in self.handled_search_forms and
                len(self.handled_search_forms) <
                self.settings.getint('MAX_DOMAIN_SEARCH_FORMS')):
            self.logger.debug('Found a search form at %s', url)
            self.handled_search_forms.add(action)
            for request_kwargs in search_form_requests(
                    url, form, meta,
                    search_terms=self.search_terms,
                    extra_search_terms=self.extra_search_terms):
                request_kwargs['meta'] = {'is_search': True}
                request_kwargs['cls'] = \
                    SplashFormRequest if self.use_splash else FormRequest
                yield request_kwargs
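For context, the self.settings attribute used throughout the spider snippets on this page is the running crawler's settings object, which Scrapy attaches to the spider automatically. A minimal sketch, assuming a current Scrapy version (the spider name and URL are placeholders; CRAZY_SEARCH_ENABLED is the custom setting read by the snippet above):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        # self.settings proxies self.crawler.settings, populated when the crawler creates the spider
        if self.settings.getbool("CRAZY_SEARCH_ENABLED", False):
            self.logger.debug("Search-form handling is enabled")
        yield scrapy.Request("http://example.com", callback=self.parse)

    def parse(self, response):
        pass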
EuropythonSpyder.py (project: pydata_webscraping, author: jmortega)
def main():
    """Rutina principal para la ejecución del Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print "Item extracted:", item
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED",False)   

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the spider defined above
    print("ENGINE STOPPED")
PydataSpiderDetails.py (project: pydata_webscraping, author: jmortega)
def main():
    from scrapy.xlib.pydispatch import dispatcher

    """Rutina principal para la ejecución del Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print "Item extracted:", item
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED",False)   

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(PydataSpiderDetails())

    print "STARTING ENGINE"
    crawler.start() #start  the crawler
    print "ENGINE STOPPED"
test_media_pipeline.py (project: scrapy-cdr, author: TeamHG-Memex)
def make_crawler(**extra_settings):
    settings = Settings()
    settings['ITEM_PIPELINES'] = {
        'scrapy_cdr.media_pipeline.CDRMediaPipeline': 1,
        'tests.utils.CollectorPipeline': 100,
    }
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(Spider)
spiders.py (project: undercrawler, author: TeamHG-Memex)
def start_requests(self):
        self.use_splash = using_splash(self.settings)
        for url in self.start_urls:
            yield self.make_request(url, callback=self.parse_first)
spiders.py (project: undercrawler, author: TeamHG-Memex)
def make_request(
            self, url, callback=None, meta=None, cls=None, **kwargs):
        callback = callback or self.parse
        cls = cls or (SplashRequest if self.use_splash else Request)
        if self.use_splash:
            settings = self.settings
            splash_args = {
                'lua_source': self.lua_source,
                'js_source': self.js_source,
                'run_hh': settings.getbool('RUN_HH'),
                'return_png': settings.getbool('SCREENSHOT'),
                'images_enabled': settings.getbool('IMAGES_ENABLED'),
            }
            for s in ['VIEWPORT_WIDTH', 'VIEWPORT_HEIGHT',
                      'SCREENSHOT_WIDTH', 'SCREENSHOT_HEIGHT']:
                if self.settings.get(s):
                    splash_args[s.lower()] = self.settings.getint(s)
            if self.settings.getbool('ADBLOCK'):
                splash_args['filters'] = 'fanboy-annoyance,easylist'
            if self.settings.getbool('FORCE_TOR'):
                splash_args['proxy'] = 'tor'
            kwargs.update(dict(
                args=splash_args,
                endpoint='execute',
                cache_args=['lua_source', 'js_source'],
            ))
        meta = meta or {}
        meta['avoid_dup_content'] = True
        return cls(url, callback=callback, meta=meta, **kwargs)
spiders.py (project: undercrawler, author: TeamHG-Memex)
def parse_first(self, response):
        allowed = allowed_re(
            response.url, self.settings.getbool('HARD_URL_CONSTRAINT'))
        if allowed not in self.allowed:
            self.allowed.add(allowed)
            # Reset link extractors so they pick up the latest self.allowed regexps
            self._reset_link_extractors()
            self.logger.info('Updated allowed regexps: %s', self.allowed)
        yield from self.parse(response)
spiders.py (project: undercrawler, author: TeamHG-Memex)
def text_cdr_item(self, response, *, follow_urls, metadata):
        if self.settings.get('FILES_STORE'):
            media_urls = self.media_urls(response, follow_urls)
        else:
            media_urls = []
        return text_cdr_item(
            response,
            crawler_name=self.settings.get('CDR_CRAWLER'),
            team_name=self.settings.get('CDR_TEAM'),
            # will be downloaded by UndercrawlerMediaPipeline
            objects=media_urls,
            metadata=metadata,
        )
spiders.py (project: undercrawler, author: TeamHG-Memex)
def _looks_like_logout(self, link, response):
        if not self.settings.getbool('AUTOLOGIN_ENABLED') or not \
                response.meta.get('autologin_active'):
            return False
        return link_looks_like_logout(link)
spiders.py (project: undercrawler, author: TeamHG-Memex)
def _take_screenshot(self, response) -> Optional[str]:
        screenshot = response.data.get('png') if self.use_splash else None
        if not screenshot:
            return None
        if self._screenshot_dest is None:
            self._screenshot_dest = Path(
                self.settings.get('SCREENSHOT_DEST', 'screenshots'))
            self._screenshot_dest.mkdir(parents=True, exist_ok=True)
        path = self._screenshot_dest.joinpath(
            '{prefix}{uuid}.png'.format(
                prefix=self.settings.get('SCREENSHOT_PREFIX', ''),
                uuid=uuid.uuid4()))
        path.write_bytes(b64decode(screenshot))
        self.logger.debug('Saved %s screenshot to %s', response, path)
        return str(path)
crawler.py (project: ws-backend-community, author: lavalamp-)
def get_scrapy_settings(self, item_pipeline=None, hostname=None):
        """
        Get a scrapy settings dictionary to use for crawling web applications.
        :param item_pipeline: The item pipeline configuration to configure in the settings.
        :param hostname: The hostname to request by default in all Scrapy requests.
        :return: A scrapy settings dictionary to use for crawling web applications.
        """
        item_pipeline = item_pipeline if item_pipeline is not None else self.__get_default_item_pipeline()
        return scrapy.settings.Settings(values={
            "CONCURRENT_ITEMS": self.concurrent_items,
            "CONCURRENT_REQUESTS": self.concurrent_requests,
            "DEFAULT_REQUEST_HEADERS": {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en",
                "Host": hostname,
            },
            "DEPTH_LIMIT": self.depth_limit,
            "DEPTH_PRIORITY": self.depth_priority,
            "DOWNLOADER_CLIENTCONTEXTFACTORY": "lib.inspection.web.crawling.WebSightClientContextFactory",
            "EXTENSIONS": {
                "scrapy.extensions.telnet.TelnetConsole": None,
            },
            "DOWNLOADER_MIDDLEWARES": {
                "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": None,
                "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": None,
            },
            "SPIDER_MIDDLEWARES": {
                "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": None,
            },
            "DOWNLOAD_MAXSIZE": self.max_size,
            "HTTPERROR_ALLOW_ALL": self.allow_all_errors,
            "ITEM_PIPELINES": item_pipeline,
            "LOG_LEVEL": config.log_crawling_level,
            "TELNETCONSOLE_ENABLED": self.enable_telnet,
            "USER_AGENT": self.user_agent,
        })
crawler.py (project: ws-backend-community, author: lavalamp-)
def __crawl(self, spider_kwargs=None, settings=None):
        """
        Perform a crawl based on the contents of self._crawling_config.
        :param spider_kwargs: Keyword arguments to use to create a spider class.
        :param settings: Scrapy settings to use to crawl the remote endpoint.
        :return: None
        """
        print("SPIDER KWARGS ARE %s." % (spider_kwargs,))
        config.globals["%s-hostname" % (os.getpid(),)] = spider_kwargs["input_hostname"]
        spider = self.get_spider_class_for_domain(**spider_kwargs)
        process = CrawlerProcess(settings)
        process.crawl(spider)
        process.start()
crawler.py (project: ws-backend-community, author: lavalamp-)
def crawling_config(self):
        """
        Get a dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
        :return: A dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
        """
        return self._crawling_config

    # Representation and Comparison
crawler.py (project: ws-backend-community, author: lavalamp-)
def crawl_endpoint_to_file(
            self,
            ip_address=None,
            port=None,
            hostname=None,
            use_ssl=False,
            use_sni=False,
            start_urls=[],
            in_separate_process=True,
    ):
        """
        Start crawling the given endpoint using the given list of URLs and write the results to
        a local file.
        :param ip_address: The IP address to crawl.
        :param port: The port where the application resides.
        :param hostname: The hostname to submit alongside all requests to the remote endpoint.
        :param use_ssl: Whether or not to use SSL to connect to the remote web service.
        :param use_sni: Whether or not to use SNI to connect to the remote web service.
        :param start_urls: A list of URLs to start crawling from.
        :param in_separate_process: Whether or not to spawn off a separate process for the crawl. This
        enables us to call this method multiple times in the same process, as a Twisted reactor can only
        be started and stopped once per process.
        :return: A tuple containing (1) the string containing the local file path where crawling
        results are stored and (2) a ScrapyResultWrapper configured to process the contents of the file.
        """
        temp_file_path = FilesystemHelper.get_temporary_file_path()
        local_file_path = "%s-%s-%s:%s" % (temp_file_path, self.bot_name, ip_address, port)
        spider_kwargs = {
            "input_ip_address": ip_address,
            "input_start_urls": start_urls,
            "input_file_path": local_file_path,
            "input_hostname": hostname,
            "input_use_ssl": use_ssl,
            "input_use_sni": use_sni,
            "input_port": port,
        }
        pipeline_settings = self.__get_local_storage_item_pipeline()
        requested_hostname = hostname if hostname is not None else ip_address
        settings = self.get_scrapy_settings(item_pipeline=pipeline_settings, hostname=requested_hostname)
        crawling_config = {
            "spider_kwargs": spider_kwargs,
            "settings": settings,
        }
        if in_separate_process:
            process = Process(target=self.__crawl, kwargs=crawling_config)
            process.start()
            process.join()
            process.terminate()
        else:
            self.__crawl(**crawling_config)
        return local_file_path, ScrapyResultWrapper.from_file(local_file_path)

