import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError


def download(url, user_agent='wswp', num_retries=2, charset='utf-8', proxy=None):
    """Download a given URL and return the page content.

    args:
        url (str): URL
    kwargs:
        user_agent (str): user agent (default: wswp)
        charset (str): charset to use if the website does not declare one in its headers
        proxy (str): proxy URL, e.g. 'http://IP' (default: None)
        num_retries (int): number of retries if a 5xx error is seen (default: 2)
    """
    print('Downloading:', url)
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        if proxy:
            # route the request through the supplied HTTP proxy
            proxy_support = urllib.request.ProxyHandler({'http': proxy})
            opener = urllib.request.build_opener(proxy_support)
            urllib.request.install_opener(opener)
        resp = urllib.request.urlopen(request)
        # prefer the charset declared in the response headers,
        # falling back to the caller-supplied default
        cs = resp.headers.get_content_charset()
        if not cs:
            cs = charset
        html = resp.read().decode(cs)
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors, keeping the other settings
                return download(url, user_agent, num_retries - 1, charset, proxy)
    return html
Source file: advanced_link_crawler.py
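A minimal usage sketch of the function above. The URL and proxy address here are illustrative placeholders (the book's example site is commonly used), not values required by the crawler:

html = download('http://example.webscraping.com', user_agent='wswp')
if html:
    print(html[:100])  # first 100 characters of the downloaded page

# routing the same request through a hypothetical local HTTP proxy
# html = download('http://example.webscraping.com', proxy='http://127.0.0.1:8080')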