def crawl_endpoint_to_file(
    self,
    ip_address=None,
    port=None,
    hostname=None,
    use_ssl=False,
    use_sni=False,
    start_urls=None,
    in_separate_process=True,
):
    """
    Start crawling the given endpoint using the given list of URLs and write the results to
    a local file.
    :param ip_address: The IP address to crawl.
    :param port: The port where the application resides.
    :param hostname: The hostname to submit alongside all requests to the remote endpoint.
    :param use_ssl: Whether or not to use SSL to connect to the remote web service.
    :param use_sni: Whether or not to use SNI to connect to the remote web service.
    :param start_urls: A list of URLs to start crawling from.
    :param in_separate_process: Whether or not to spawn a separate process for the crawl. This
    enables us to call this method multiple times in the same process, as a Twisted reactor can
    only be started and stopped once per process.
    :return: A tuple containing (1) the local file path where crawling results are stored and
    (2) a ScrapyResultWrapper configured to process the contents of that file.
    """
    # Normalize start_urls here instead of using a mutable default argument.
    start_urls = start_urls if start_urls is not None else []
    temp_file_path = FilesystemHelper.get_temporary_file_path()
    local_file_path = "%s-%s-%s:%s" % (temp_file_path, self.bot_name, ip_address, port)
    spider_kwargs = {
        "input_ip_address": ip_address,
        "input_start_urls": start_urls,
        "input_file_path": local_file_path,
        "input_hostname": hostname,
        "input_use_ssl": use_ssl,
        "input_use_sni": use_sni,
        "input_port": port,
    }
    pipeline_settings = self.__get_local_storage_item_pipeline()
    requested_hostname = hostname if hostname is not None else ip_address
    settings = self.get_scrapy_settings(item_pipeline=pipeline_settings, hostname=requested_hostname)
    crawling_config = {
        "spider_kwargs": spider_kwargs,
        "settings": settings,
    }
    if in_separate_process:
        # Run the crawl in a child process so the Twisted reactor is started and stopped
        # independently of the current process.
        process = Process(target=self.__crawl, kwargs=crawling_config)
        process.start()
        process.join()
        process.terminate()
    else:
        self.__crawl(**crawling_config)
    return local_file_path, ScrapyResultWrapper.from_file(local_file_path)
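# Usage sketch (illustrative only): the class that owns crawl_endpoint_to_file is not shown
# in this excerpt, so the constructor name below ("CrawlRunner") and its arguments are
# assumptions, not the project's confirmed API; only the keyword arguments of
# crawl_endpoint_to_file itself come from the method signature above.
#
#     runner = CrawlRunner(bot_name="crawler-bot")  # hypothetical owning class
#     file_path, wrapper = runner.crawl_endpoint_to_file(
#         ip_address="203.0.113.10",
#         port=443,
#         hostname="example.com",
#         use_ssl=True,
#         use_sni=True,
#         start_urls=["https://example.com/"],
#         in_separate_process=True,
#     )
#     # file_path is the temporary results file; wrapper exposes its parsed contents.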