def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
评论列表
文章目录