crawler.py 文件源码-python代码片段

def get_scrapy_settings(self, item_pipeline=None, hostname=None):
        """
        Get a scrapy settings dictionary to use for crawling web applications.
        :param item_pipeline: The item pipeline configuration to configure in the settings.
        :param hostname: The hostname to request by default in all Scrapy requests.
        :return: A scrapy settings dictionary to use for crawling web applications.
        """
        item_pipeline = item_pipeline if item_pipeline is not None else self.__get_default_item_pipeline()
        return scrapy.settings.Settings(values={
            "CONCURRENT_ITEMS": self.concurrent_items,
            "CONCURRENT_REQUESTS": self.concurrent_requests,
            "DEFAULT_REQUEST_HEADERS": {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en",
                "Host": hostname,
            },
            "DEPTH_LIMIT": self.depth_limit,
            "DEPTH_PRIORITY": self.depth_priority,
            "DOWNLOADER_CLIENTCONTEXTFACTORY": "lib.inspection.web.crawling.WebSightClientContextFactory",
            "EXTENSIONS": {
                "scrapy.extensions.telnet.TelnetConsole": None,
            },
            "DOWNLOADER_MIDDLEWARES": {
                "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": None,
                "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": None,
            },
            "SPIDER_MIDDLEWARES": {
                "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": None,
            },
            "DOWNLOAD_MAXSIZE": self.max_size,
            "HTTPERROR_ALLOW_ALL": self.allow_all_errors,
            "ITEM_PIPELINES": item_pipeline,
            "LOG_LEVEL": config.log_crawling_level,
            "TELNETCONSOLE_ENABLED": self.enable_telnet,
            "USER_AGENT": self.user_agent,
        })