# requests_cache_link_crawler.py
import re
from datetime import timedelta
from urllib.parse import urljoin

import requests_cache

# Downloader (per-domain throttling, retries, proxy support) is assumed to be
# provided by the project's downloader module; the import path is an assumption.
from downloader import Downloader


def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 proxies=None, delay=3, max_depth=4, num_retries=2,
                 expires=timedelta(days=30)):
    """ Crawl from the given start URL following links matched by link_regex.
        In the current implementation, we do not actually scrape any information.

        args:
            start_url (str): web site to start the crawl from
            link_regex (str): regex to match for links
        kwargs:
            robots_url (str): url of the site's robots.txt
                              (default: start_url + /robots.txt)
            user_agent (str): user agent (default: wswp)
            proxies (list of dicts): a list of possible dicts for http / https proxies;
                                     for formatting, see the requests library
            delay (int): seconds to throttle between requests to one domain (default: 3)
            max_depth (int): maximum crawl depth, to avoid crawler traps (default: 4)
            num_retries (int): number of retries on 5xx errors (default: 2)
            expires (timedelta): timedelta for cache expiration (default: 30 days)
    """
    crawl_queue = [start_url]
    # keep track of which URLs have been seen before, and at what depth
    seen = {}
    # cache responses in Redis so repeat requests within `expires` are served locally
    requests_cache.install_cache(backend='redis', expire_after=expires)
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            html = D(url, num_retries=num_retries)
            if not html:
                continue
            # TODO: add actual data scraping here
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
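

# The helpers below are minimal sketches of pieces this crawler relies on but
# which are not defined in this file (they are assumed to live elsewhere in the
# project). The regex-based link extraction in get_links() is an assumption; the
# actual project may extract links differently.
from urllib import robotparser


def get_robots_parser(robots_url):
    """ Return a robots.txt parser initialized from robots_url. """
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp


def get_links(html):
    """ Return a list of href values found in the html via a simple regex. """
    webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
    return webpage_regex.findall(html)


# Illustrative usage only -- the target site and link regex below are assumptions.
if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', r'/(index|view)/',
                 user_agent='wswp', max_depth=2)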