def get_scrapy_settings(self, item_pipeline=None, hostname=None):
    """
    Build the Scrapy settings object used for crawling web applications.

    :param item_pipeline: The value to place under ITEM_PIPELINES. When None, the
        result of self.__get_default_item_pipeline() is used instead.
    :param hostname: The value placed in the Host entry of the default request headers.
    :return: A scrapy.settings.Settings instance populated from this object's
        crawling attributes and from config.log_crawling_level.
    """
    if item_pipeline is None:
        item_pipeline = self.__get_default_item_pipeline()

    default_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en",
        "Host": hostname,
    }

    # Each component below is mapped to None rather than to an order value;
    # per the Scrapy settings docs this disables a component (confirm for the
    # Scrapy version in use).
    extension_overrides = {
        "scrapy.extensions.telnet.TelnetConsole": None,
    }
    downloader_middleware_overrides = {
        "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": None,
        "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": None,
    }
    spider_middleware_overrides = {
        "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": None,
    }

    crawl_values = {
        "CONCURRENT_ITEMS": self.concurrent_items,
        "CONCURRENT_REQUESTS": self.concurrent_requests,
        "DEFAULT_REQUEST_HEADERS": default_headers,
        "DEPTH_LIMIT": self.depth_limit,
        "DEPTH_PRIORITY": self.depth_priority,
        "DOWNLOADER_CLIENTCONTEXTFACTORY": "lib.inspection.web.crawling.WebSightClientContextFactory",
        "EXTENSIONS": extension_overrides,
        "DOWNLOADER_MIDDLEWARES": downloader_middleware_overrides,
        "SPIDER_MIDDLEWARES": spider_middleware_overrides,
        "DOWNLOAD_MAXSIZE": self.max_size,
        "HTTPERROR_ALLOW_ALL": self.allow_all_errors,
        "ITEM_PIPELINES": item_pipeline,
        "LOG_LEVEL": config.log_crawling_level,
        "TELNETCONSOLE_ENABLED": self.enable_telnet,
        "USER_AGENT": self.user_agent,
    }
    return scrapy.settings.Settings(values=crawl_values)
# (web page residue) Comment list
# (web page residue) Table of contents