def crawl(self, url, base_url):
    """Crawl an .html page and extract all URLs we think are part of the application.

    Downloads are parallelized using a thread pool.
    """
    resp = requests.get(url)
    # Follow any redirects so relative links are resolved against the final URL
    final_base_url = resp.url
    tree = lxml.html.fromstring(resp.content)
    elems = tree.cssselect("a")
    links = [urljoin(final_base_url, elem.attrib.get("href", "")) for elem in elems]
    links = [link for link in links if is_likely_app_part(link, base_url)]
    # Fetch all links in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {executor.submit(self.fetch_file, link, base_url): link for link in links}
        for future in concurrent.futures.as_completed(future_to_url):
            future.result()  # Re-raise in the main thread if a worker failed
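
The method relies on two helpers that are not shown here: a module-level is_likely_app_part() filter and a fetch_file() method on the same class. As a rough illustration only (not the original implementation), such a filter might keep links that live on the same host as the application and under the same path prefix as base_url:

from urllib.parse import urlparse

def is_likely_app_part(link, base_url):
    """Hypothetical sketch: keep only links that belong to the same application.

    Assumes "part of the application" means the link points to the same host
    and its path falls under the base URL's path prefix.
    """
    link_parts = urlparse(link)
    base_parts = urlparse(base_url)
    if link_parts.netloc != base_parts.netloc:
        return False  # Different host: treat as an external link
    return link_parts.path.startswith(base_parts.path)

With a filter along these lines, crawl() only schedules downloads for URLs that stay inside the application, while external links, CDNs, and the like are skipped.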