def handle_form(self, url, form, meta):
    action = canonicalize_url(urljoin(url, form.action))
    if not self.link_extractor.matches(action):
        return
    if (meta['form'] == 'search' and
            self.settings.getbool('CRAZY_SEARCH_ENABLED') and
            action not in self.handled_search_forms and
            len(self.handled_search_forms) <
            self.settings.getint('MAX_DOMAIN_SEARCH_FORMS')):
        self.logger.debug('Found a search form at %s', url)
        self.handled_search_forms.add(action)
        for request_kwargs in search_form_requests(
                url, form, meta,
                search_terms=self.search_terms,
                extra_search_terms=self.extra_search_terms):
            request_kwargs['meta'] = {'is_search': True}
            request_kwargs['cls'] = \
                SplashFormRequest if self.use_splash else FormRequest
            yield request_kwargs
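# A hedged sketch of how a caller might consume the request_kwargs dicts yielded by
# handle_form: pop the 'cls' key set above and build the request from the remaining
# keyword arguments. The helper name is a placeholder, not part of the original code.
def build_search_requests(self, url, form, meta):
    for request_kwargs in self.handle_form(url, form, meta):
        request_cls = request_kwargs.pop('cls')
        yield request_cls(**request_kwargs)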
def main():
    """Main routine for running the Spider."""
    # Set up a signal handler to catch scraped items.
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)

    dispatcher.connect(catch_item, signal=signals.item_passed)
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)
    # Set up the crawler.
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    # Register the spider with the crawler.
    crawler.crawl(EuropythonSpyder)
    # Start Scrapy.
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the registered spider
    print("ENGINE STOPPED")
def main():
    """Main routine for running the Spider."""
    from scrapy.xlib.pydispatch import dispatcher

    # Set up a signal handler to catch scraped items.
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)

    dispatcher.connect(catch_item, signal=signals.item_passed)
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)
    # Set up the crawler.
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    # Register the spider with the crawler.
    crawler.crawl(PydataSpiderDetails)
    print("STARTING ENGINE")
    crawler.start()  # start the crawler
    print("ENGINE STOPPED")
def make_crawler(**extra_settings):
    settings = Settings()
    settings['ITEM_PIPELINES'] = {
        'scrapy_cdr.media_pipeline.CDRMediaPipeline': 1,
        'tests.utils.CollectorPipeline': 100,
    }
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(Spider)
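# A hedged usage sketch for make_crawler, assuming a pytest-twisted/Trial test harness
# with a running reactor, and that Spider is the test spider of this suite and accepts
# a `url` argument; the setting value and URL below are placeholders:
from twisted.internet import defer

@defer.inlineCallbacks
def test_cdr_media_pipeline_sketch():
    crawler = make_crawler(FILES_STORE='/tmp/cdr-media-test')  # example extra setting
    yield crawler.crawl(url='http://localhost:8831/')          # kwargs go to the spider
    # Items gathered by tests.utils.CollectorPipeline could be inspected on the
    # crawler's spider here.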
def start_requests(self):
    self.use_splash = using_splash(self.settings)
    for url in self.start_urls:
        yield self.make_request(url, callback=self.parse_first)
def make_request(
        self, url, callback=None, meta=None, cls=None, **kwargs):
    callback = callback or self.parse
    cls = cls or (SplashRequest if self.use_splash else Request)
    if self.use_splash:
        settings = self.settings
        splash_args = {
            'lua_source': self.lua_source,
            'js_source': self.js_source,
            'run_hh': settings.getbool('RUN_HH'),
            'return_png': settings.getbool('SCREENSHOT'),
            'images_enabled': settings.getbool('IMAGES_ENABLED'),
        }
        for s in ['VIEWPORT_WIDTH', 'VIEWPORT_HEIGHT',
                  'SCREENSHOT_WIDTH', 'SCREENSHOT_HEIGHT']:
            if self.settings.get(s):
                splash_args[s.lower()] = self.settings.getint(s)
        if self.settings.getbool('ADBLOCK'):
            splash_args['filters'] = 'fanboy-annoyance,easylist'
        if self.settings.getbool('FORCE_TOR'):
            splash_args['proxy'] = 'tor'
        kwargs.update(dict(
            args=splash_args,
            endpoint='execute',
            cache_args=['lua_source', 'js_source'],
        ))
    meta = meta or {}
    meta['avoid_dup_content'] = True
    return cls(url, callback=callback, meta=meta, **kwargs)
def parse_first(self, response):
    allowed = allowed_re(
        response.url, self.settings.getbool('HARD_URL_CONSTRAINT'))
    if allowed not in self.allowed:
        self.allowed.add(allowed)
        # Reset link extractors so they pick up the latest self.allowed regexps.
        self._reset_link_extractors()
        self.logger.info('Updated allowed regexps: %s', self.allowed)
    yield from self.parse(response)
def text_cdr_item(self, response, *, follow_urls, metadata):
    if self.settings.get('FILES_STORE'):
        media_urls = self.media_urls(response, follow_urls)
    else:
        media_urls = []
    return text_cdr_item(
        response,
        crawler_name=self.settings.get('CDR_CRAWLER'),
        team_name=self.settings.get('CDR_TEAM'),
        # will be downloaded by UndercrawlerMediaPipeline
        objects=media_urls,
        metadata=metadata,
    )
def _looks_like_logout(self, link, response):
    if not self.settings.getbool('AUTOLOGIN_ENABLED') or not \
            response.meta.get('autologin_active'):
        return False
    return link_looks_like_logout(link)
def _take_screenshot(self, response) -> Optional[str]:
    screenshot = response.data.get('png') if self.use_splash else None
    if not screenshot:
        return None
    if self._screenshot_dest is None:
        self._screenshot_dest = Path(
            self.settings.get('SCREENSHOT_DEST', 'screenshots'))
        self._screenshot_dest.mkdir(parents=True, exist_ok=True)
    path = self._screenshot_dest.joinpath(
        '{prefix}{uuid}.png'.format(
            prefix=self.settings.get('SCREENSHOT_PREFIX', ''),
            uuid=uuid.uuid4()))
    path.write_bytes(b64decode(screenshot))
    self.logger.debug('Saved %s screenshot to %s', response, path)
    return str(path)
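# A hedged sketch of where _take_screenshot might be called: from a parse callback,
# attaching the saved path to the item metadata before building the CDR item. The
# callback name, the 'screenshot' metadata key, and the empty follow_urls set are
# placeholders, not part of the original code.
def parse_with_screenshot_sketch(self, response):
    screenshot_path = self._take_screenshot(response)
    metadata = {'screenshot': screenshot_path} if screenshot_path else {}
    yield self.text_cdr_item(
        response, follow_urls=set(), metadata=metadata)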
def get_scrapy_settings(self, item_pipeline=None, hostname=None):
    """
    Get a scrapy settings dictionary to use for crawling web applications.
    :param item_pipeline: The item pipeline configuration to configure in the settings.
    :param hostname: The hostname to request by default in all Scrapy requests.
    :return: A scrapy settings dictionary to use for crawling web applications.
    """
    item_pipeline = item_pipeline if item_pipeline is not None else self.__get_default_item_pipeline()
    return scrapy.settings.Settings(values={
        "CONCURRENT_ITEMS": self.concurrent_items,
        "CONCURRENT_REQUESTS": self.concurrent_requests,
        "DEFAULT_REQUEST_HEADERS": {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en",
            "Host": hostname,
        },
        "DEPTH_LIMIT": self.depth_limit,
        "DEPTH_PRIORITY": self.depth_priority,
        "DOWNLOADER_CLIENTCONTEXTFACTORY": "lib.inspection.web.crawling.WebSightClientContextFactory",
        "EXTENSIONS": {
            "scrapy.extensions.telnet.TelnetConsole": None,
        },
        "DOWNLOADER_MIDDLEWARES": {
            "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": None,
            "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": None,
        },
        "SPIDER_MIDDLEWARES": {
            "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": None,
        },
        "DOWNLOAD_MAXSIZE": self.max_size,
        "HTTPERROR_ALLOW_ALL": self.allow_all_errors,
        "ITEM_PIPELINES": item_pipeline,
        "LOG_LEVEL": config.log_crawling_level,
        "TELNETCONSOLE_ENABLED": self.enable_telnet,
        "USER_AGENT": self.user_agent,
    })
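# A hedged usage sketch for get_scrapy_settings: build the Settings object and hand
# it to a CrawlerProcess, mirroring what __crawl does below. Passing item_pipeline=None
# falls back to the class's default pipeline; the function name, hostname, and spider
# class argument are placeholders.
def run_single_crawl_sketch(inspector, spider_cls):
    from scrapy.crawler import CrawlerProcess

    settings = inspector.get_scrapy_settings(item_pipeline=None, hostname="www.example.com")
    process = CrawlerProcess(settings)
    process.crawl(spider_cls)
    process.start()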
def __crawl(self, spider_kwargs=None, settings=None):
    """
    Perform a crawl based on the contents of self._crawling_config.
    :param spider_kwargs: Keyword arguments to use to create a spider class.
    :param settings: Scrapy settings to use to crawl the remote endpoint.
    :return: None
    """
    print("SPIDER KWARGS ARE %s." % (spider_kwargs,))
    config.globals["%s-hostname" % (os.getpid(),)] = spider_kwargs["input_hostname"]
    spider = self.get_spider_class_for_domain(**spider_kwargs)
    process = CrawlerProcess(settings)
    process.crawl(spider)
    process.start()
def crawling_config(self):
    """
    Get a dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
    :return: A dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
    """
    return self._crawling_config

# Representation and Comparison
def crawl_endpoint_to_file(
        self,
        ip_address=None,
        port=None,
        hostname=None,
        use_ssl=False,
        use_sni=False,
        start_urls=[],
        in_separate_process=True,
):
    """
    Start crawling the given endpoint using the given list of URLs and write the results to
    a local file.
    :param ip_address: The IP address to crawl.
    :param port: The port where the application resides.
    :param hostname: The hostname to submit alongside all requests to the remote endpoint.
    :param use_ssl: Whether or not to use SSL to connect to the remote web service.
    :param use_sni: Whether or not to use SNI to connect to the remote web service.
    :param start_urls: A list of URLs to start crawling from.
    :param in_separate_process: Whether or not to spawn off a separate process for the crawl. This
    enables us to call this method multiple times in the same process, as a Twisted reactor can only
    be started and stopped once per process.
    :return: A tuple containing (1) the string containing the local file path where crawling
    results are stored and (2) a ScrapyResultWrapper configured to process the contents of the file.
    """
    temp_file_path = FilesystemHelper.get_temporary_file_path()
    local_file_path = "%s-%s-%s:%s" % (temp_file_path, self.bot_name, ip_address, port)
    spider_kwargs = {
        "input_ip_address": ip_address,
        "input_start_urls": start_urls,
        "input_file_path": local_file_path,
        "input_hostname": hostname,
        "input_use_ssl": use_ssl,
        "input_use_sni": use_sni,
        "input_port": port,
    }
    pipeline_settings = self.__get_local_storage_item_pipeline()
    requested_hostname = hostname if hostname is not None else ip_address
    settings = self.get_scrapy_settings(item_pipeline=pipeline_settings, hostname=requested_hostname)
    crawling_config = {
        "spider_kwargs": spider_kwargs,
        "settings": settings,
    }
    if in_separate_process:
        process = Process(target=self.__crawl, kwargs=crawling_config)
        process.start()
        process.join()
        process.terminate()
    else:
        self.__crawl(**crawling_config)
    return local_file_path, ScrapyResultWrapper.from_file(local_file_path)
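# A hedged usage sketch for crawl_endpoint_to_file; the owning class name
# (WebCrawlerInspector), the addresses, and the start URL are placeholders:
def crawl_one_endpoint_sketch():
    inspector = WebCrawlerInspector()  # placeholder constructor
    file_path, results = inspector.crawl_endpoint_to_file(
        ip_address="203.0.113.10",
        port=443,
        hostname="www.example.com",
        use_ssl=True,
        use_sni=True,
        start_urls=["https://www.example.com/"],
        in_separate_process=True,
    )
    print("Crawl results written to %s." % (file_path,))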