def follow_redirects(link, sites=None):
    """Follow redirects for ``link`` and return the resolved link.

    Redirects are followed only while the target's hostname is contained in
    ``sites``; when ``sites`` is None every hostname is accepted. All
    requests are issued with the HEAD method.

    Args:
        link: URL to resolve.
        sites: Container of hostnames whose redirects may be followed, or
            None to follow redirects on any host.

    Returns:
        ``link`` unchanged when its own hostname is not in ``sites``.
        Otherwise the ``url`` of the opened response, or -- if opening
        fails -- the last redirect target that was seen, falling back to
        ``link`` when no redirect was seen at all.
    """
    def follow(url):
        # Identity test: ``sites == None`` would call a custom ``__eq__``
        # on the container instead of simply checking for "no filter".
        return sites is None or urlparse.urlparse(url).hostname in sites

    class RedirectHandler(urllib2.HTTPRedirectHandler):
        """Redirect handler that remembers the latest redirect target."""

        def __init__(self):
            self.last_url = None

        def redirect_request(self, req, fp, code, msg, hdrs, newurl):
            # Record the target before deciding anything, so it can still
            # be reported when the redirect is not followed or fails.
            self.last_url = newurl
            if not follow(newurl):
                return None
            r = urllib2.HTTPRedirectHandler.redirect_request(
                self, req, fp, code, msg, hdrs, newurl)
            # The follow-up request must stay a HEAD request as well.
            r.get_method = lambda: 'HEAD'
            return r

    if not follow(link):
        return link

    redirect_handler = RedirectHandler()
    opener = urllib2.build_opener(redirect_handler)
    req = urllib2.Request(link)
    req.get_method = lambda: 'HEAD'
    try:
        with contextlib.closing(opener.open(req, timeout=1)) as site:
            return site.url
    except Exception:
        # Best effort: any request failure falls back to what is known so
        # far. ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        return redirect_handler.last_url or link
# Example source code for the Python class HTTPRedirectHandler()
def follow_redirects(link, sites=None):
    """Follow redirects for ``link`` and return the resolved link.

    A redirect is followed only if the hostname of its target is found in
    ``sites``. Passing ``sites=None`` disables the hostname filter. Every
    request is sent using the HEAD method.

    Args:
        link: URL to resolve.
        sites: Container of hostnames that may be followed, or None to
            allow any hostname.

    Returns:
        ``link`` itself when its hostname is not in ``sites``; otherwise
        the ``url`` attribute of the opened response. When opening fails,
        the most recently seen redirect target is returned, or ``link`` if
        no redirect was seen.
    """
    def follow(url):
        # ``is None`` avoids invoking ``__eq__`` on the sites container.
        if sites is None:
            return True
        return urlparse.urlparse(url).hostname in sites

    class RedirectHandler(urllib2.HTTPRedirectHandler):
        """Redirect handler that tracks the last redirect target seen."""

        def __init__(self):
            self.last_url = None

        def redirect_request(self, req, fp, code, msg, hdrs, newurl):
            # Remember the target even if it ends up not being followed.
            self.last_url = newurl
            if not follow(newurl):
                return None
            r = urllib2.HTTPRedirectHandler.redirect_request(
                self, req, fp, code, msg, hdrs, newurl)
            # Keep the HEAD method on the request for the redirect target.
            r.get_method = lambda: 'HEAD'
            return r

    if not follow(link):
        return link

    redirect_handler = RedirectHandler()
    opener = urllib2.build_opener(redirect_handler)
    req = urllib2.Request(link)
    req.get_method = lambda: 'HEAD'
    try:
        with contextlib.closing(opener.open(req, timeout=1)) as site:
            return site.url
    except Exception:
        # Deliberate best-effort fallback on any request error; catching
        # ``Exception`` instead of everything keeps Ctrl-C and
        # ``sys.exit()`` working while the request is in flight.
        return redirect_handler.last_url or link
def follow_redirects(link, sites=None):
    """Follow redirects for ``link`` and return the resolved link.

    Only redirects whose target hostname is in ``sites`` are followed;
    ``sites=None`` means no hostname restriction. HEAD is used for the
    initial request and for each followed redirect.

    Args:
        link: URL to resolve.
        sites: Container of followable hostnames, or None for no filter.

    Returns:
        ``link`` when its hostname is not followable. Otherwise the ``url``
        of the response that was opened; if opening raised, the last
        redirect target recorded, or ``link`` when none was recorded.
    """
    def follow(url):
        # Compare against None by identity, not with ``==``.
        return sites is None or urlparse.urlparse(url).hostname in sites

    class RedirectHandler(urllib2.HTTPRedirectHandler):
        """Redirect handler that stores the newest redirect target."""

        def __init__(self):
            self.last_url = None

        def redirect_request(self, req, fp, code, msg, hdrs, newurl):
            # Stored first so the caller can report it after a failure.
            self.last_url = newurl
            if not follow(newurl):
                return None
            r = urllib2.HTTPRedirectHandler.redirect_request(
                self, req, fp, code, msg, hdrs, newurl)
            # Force HEAD on the request built for the redirect target.
            r.get_method = lambda: 'HEAD'
            return r

    if not follow(link):
        return link

    redirect_handler = RedirectHandler()
    opener = urllib2.build_opener(redirect_handler)
    req = urllib2.Request(link)
    req.get_method = lambda: 'HEAD'
    try:
        with contextlib.closing(opener.open(req, timeout=1)) as site:
            return site.url
    except Exception:
        # Resolution is best effort, so request errors are absorbed here.
        # A bare ``except`` would additionally trap KeyboardInterrupt and
        # SystemExit, which must be allowed to propagate.
        return redirect_handler.last_url or link
def RedirectHandlerFactory(follow_redirects=None, validate_certs=True):
    """This is a class factory that closes over the value of
    ``follow_redirects`` so that the RedirectHandler class has access to
    that value without having to use globals, and potentially cause problems
    where ``open_url`` or ``fetch_url`` are used multiple times in a module.

    Args:
        follow_redirects: Redirect policy.
            ``'urllib2'`` delegates to the stock ``HTTPRedirectHandler``;
            ``'no'``, ``'none'`` or ``False`` raise ``HTTPError`` for every
            redirect; ``'all'``, ``'yes'`` or ``True`` follow any 3xx code;
            ``'safe'`` follows 3xx codes only for GET and HEAD requests.
            Any other value (including the default ``None``) results in
            ``HTTPError`` being raised.
        validate_certs: Forwarded to ``maybe_add_ssl_handler`` together
            with the redirect target.

    Returns:
        The ``RedirectHandler`` class (not an instance).
    """
    class RedirectHandler(urllib_request.HTTPRedirectHandler):
        """This is an implementation of a RedirectHandler to match the
        functionality provided by httplib2. It will utilize the value of
        ``follow_redirects`` that is passed into ``RedirectHandlerFactory``
        to determine how redirects should be handled in urllib2.
        """

        def redirect_request(self, req, fp, code, msg, hdrs, newurl):
            """Return the request for ``newurl`` or raise ``HTTPError``."""
            handler = maybe_add_ssl_handler(newurl, validate_certs)
            if handler:
                urllib_request._opener.add_handler(handler)

            if follow_redirects == 'urllib2':
                return urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, hdrs, newurl)
            elif follow_redirects in ['no', 'none', False]:
                raise urllib_error.HTTPError(newurl, code, msg, hdrs, fp)

            do_redirect = False
            if follow_redirects in ['all', 'yes', True]:
                do_redirect = (code >= 300 and code < 400)
            elif follow_redirects == 'safe':
                m = req.get_method()
                do_redirect = (code >= 300 and code < 400 and m in ('GET', 'HEAD'))

            if do_redirect:
                # be conciliant with URIs containing a space
                newurl = newurl.replace(' ', '%20')
                # Body-describing headers are not carried over to the new
                # request.
                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                try:
                    # Python 2-3.3
                    origin_req_host = req.get_origin_req_host()
                except AttributeError:
                    # Python 3.4+ removed the getter; only the attribute
                    # remains.
                    origin_req_host = req.origin_req_host
                return urllib_request.Request(newurl,
                                              headers=newheaders,
                                              origin_req_host=origin_req_host,
                                              unverifiable=True)
            else:
                raise urllib_error.HTTPError(req.get_full_url(), code, msg, hdrs, fp)

    return RedirectHandler
def RedirectHandlerFactory(follow_redirects=None, validate_certs=True):
    """Create a redirect-handler class bound to one redirect policy.

    The returned class closes over ``follow_redirects`` (and
    ``validate_certs``), so no module-level state is needed and several
    ``open_url`` / ``fetch_url`` calls in one module can each use their own
    policy.
    """
    class RedirectHandler(urllib_request.HTTPRedirectHandler):
        """RedirectHandler matching the behaviour provided by httplib2.

        How a redirect is treated is decided by the ``follow_redirects``
        value given to ``RedirectHandlerFactory``.
        """

        def redirect_request(self, req, fp, code, msg, hdrs, newurl):
            ssl_handler = maybe_add_ssl_handler(newurl, validate_certs)
            if ssl_handler:
                urllib_request._opener.add_handler(ssl_handler)

            # Stock urllib2 behaviour requested: defer to the base class.
            if follow_redirects == 'urllib2':
                return urllib_request.HTTPRedirectHandler.redirect_request(
                    self, req, fp, code, msg, hdrs, newurl)

            # Redirects switched off entirely.
            if follow_redirects in ('no', 'none', False):
                raise urllib_error.HTTPError(newurl, code, msg, hdrs, fp)

            if follow_redirects in ('all', 'yes', True):
                allowed = 300 <= code < 400
            elif follow_redirects == 'safe':
                method = req.get_method()
                allowed = 300 <= code < 400 and method in ('GET', 'HEAD')
            else:
                # Unrecognised policy values never redirect.
                allowed = False

            if not allowed:
                raise urllib_error.HTTPError(
                    req.get_full_url(), code, msg, hdrs, fp)

            # be conciliant with URIs containing a space
            target = newurl.replace(' ', '%20')

            # Copy every header except the ones describing a request body.
            forwarded = {}
            for name, value in req.headers.items():
                if name.lower() not in ("content-length", "content-type"):
                    forwarded[name] = value

            try:
                # Python 2-3.3
                origin = req.get_origin_req_host()
            except AttributeError:
                # Python 3.4+
                origin = req.origin_req_host

            return urllib_request.Request(
                target,
                headers=forwarded,
                origin_req_host=origin,
                unverifiable=True,
            )

    return RedirectHandler