def crawl_endpoint_to_file(
    self,
    ip_address=None,
    port=None,
    hostname=None,
    use_ssl=False,
    use_sni=False,
    start_urls=None,
    in_separate_process=True,
):
    """
    Start crawling the given endpoint using the given list of URLs and write the results to
    a local file.
    :param ip_address: The IP address to crawl.
    :param port: The port where the application resides.
    :param hostname: The hostname to submit alongside all requests to the remote endpoint.
    :param use_ssl: Whether or not to use SSL to connect to the remote web service.
    :param use_sni: Whether or not to use SNI to connect to the remote web service.
    :param start_urls: A list of URLs to start crawling from.
    :param in_separate_process: Whether or not to spawn a separate process for the crawl. This
    enables us to call this method multiple times in the same process, as a Twisted reactor can
    only be started and stopped once per process.
    :return: A tuple containing (1) the local file path where crawling results are stored and
    (2) a ScrapyResultWrapper configured to process the contents of that file.
    """
    # Normalize start_urls here instead of using a mutable default argument.
    start_urls = start_urls if start_urls is not None else []
    temp_file_path = FilesystemHelper.get_temporary_file_path()
    local_file_path = "%s-%s-%s:%s" % (temp_file_path, self.bot_name, ip_address, port)
    spider_kwargs = {
        "input_ip_address": ip_address,
        "input_start_urls": start_urls,
        "input_file_path": local_file_path,
        "input_hostname": hostname,
        "input_use_ssl": use_ssl,
        "input_use_sni": use_sni,
        "input_port": port,
    }
    pipeline_settings = self.__get_local_storage_item_pipeline()
    requested_hostname = hostname if hostname is not None else ip_address
    settings = self.get_scrapy_settings(item_pipeline=pipeline_settings, hostname=requested_hostname)
    crawling_config = {
        "spider_kwargs": spider_kwargs,
        "settings": settings,
    }
    if in_separate_process:
        # Run the crawl in a child process so the Twisted reactor is started and stopped
        # independently of the current process.
        process = Process(target=self.__crawl, kwargs=crawling_config)
        process.start()
        process.join()
        process.terminate()
    else:
        self.__crawl(**crawling_config)
    return local_file_path, ScrapyResultWrapper.from_file(local_file_path)
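# Usage sketch (illustrative only): the class that owns crawl_endpoint_to_file is not shown
# in this excerpt, so the constructor name below ("CrawlRunner") and its arguments are
# assumptions, not the project's confirmed API; only the keyword arguments of
# crawl_endpoint_to_file itself come from the method signature above.
#
#     runner = CrawlRunner(bot_name="crawler-bot")  # hypothetical owning class
#     file_path, wrapper = runner.crawl_endpoint_to_file(
#         ip_address="203.0.113.10",
#         port=443,
#         hostname="example.com",
#         use_ssl=True,
#         use_sni=True,
#         start_urls=["https://example.com/"],
#         in_separate_process=True,
#     )
#     # file_path is the temporary results file; wrapper exposes its parsed contents.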