def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
python类RLock()的实例源码
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, filename, table_name='data', fast_save=False, **options):
"""
:param filename: filename for database (without extension)
:param table_name: table name
:param fast_save: If it's True, then sqlite will be configured with
`"PRAGMA synchronous = 0;" <http://www.sqlite.org/pragma.html#pragma_synchronous>`_
to speedup cache saving, but be careful, it's dangerous.
Tests showed that insertion order of records can be wrong with this option.
"""
self.filename = filename
self.table_name = table_name
self.fast_save = fast_save
#: Transactions can be commited if this property is set to `True`
self.can_commit = True
self.serializer = Serializer()
self._bulk_commit = False
self._pending_connection = None
self._lock = threading.RLock()
with self.connection() as con:
con.execute("create table if not exists `%s` (key PRIMARY KEY, value)" % self.table_name)
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()
def __init__(self, url, timeout=None, num_workers=10, **kwargs):
"""
Initialise an instance.
:param url: The root URL to use for scraping.
:param timeout: The timeout, in seconds, to be applied to requests.
This defaults to ``None`` (no timeout specified).
:param num_workers: The number of worker threads you want to do I/O,
This defaults to 10.
:param kwargs: Passed to the superclass.
"""
super(SimpleScrapingLocator, self).__init__(**kwargs)
self.base_url = ensure_slash(url)
self.timeout = timeout
self._page_cache = {}
self._seen = set()
self._to_fetch = queue.Queue()
self._bad_hosts = set()
self.skip_externals = False
self.num_workers = num_workers
self._lock = threading.RLock()
# See issue #45: we need to be resilient when the locator is used
# in a thread, e.g. with concurrent.futures. We can't use self._lock
# as it is for coordinating our internal threads - the ones created
# in _prepare_threads.
self._gplock = threading.RLock()