def __init__(self, url, search_terms=None, *args, **kwargs):
    """Initialise start URLs, search terms, link extractors and scripts.

    Args:
        url: Either a path to a UTF-8 text file (recognised by a leading
            ``.`` or ``/``) containing one URL per line, or a
            whitespace-separated string of one or more URLs.
        search_terms: Stored unchanged on ``self.search_terms``.
        *args: Forwarded to the parent class constructor.
        **kwargs: Forwarded to the parent class constructor.
    """
    if url.startswith(('.', '/')):
        with Path(url).open('rt', encoding='utf8') as f:
            # Skip blank / whitespace-only lines (e.g. a trailing empty
            # line) so they do not become empty start URLs.
            urls = [line for line in map(str.strip, f) if line]
    else:
        # str.split() without a separator never yields empty strings.
        urls = url.split()
    self.start_urls = [add_http_if_no_scheme(_url) for _url in urls]
    self.search_terms = search_terms
    self._extra_search_terms = None  # lazy-loaded via extra_search_terms
    self._reset_link_extractors()
    # Extractor for <img src="..."> links; no extensions are denied and
    # URLs are not canonicalized.
    self.images_link_extractor = LinkExtractor(
        tags=['img'], attrs=['src'], deny_extensions=[],
        canonicalize=False)
    self.state = {}
    self.use_splash = None  # set up in start_requests
    self._screenshot_dest = None  # type: Path
    # Load headless horseman scripts
    self.lua_source = load_directive('headless_horseman.lua')
    self.js_source = load_directive('headless_horseman.js')
    # The parent constructor runs last, after the attributes above are set.
    super().__init__(*args, **kwargs)
评论列表
文章目录