import socket

import requests
from bs4 import BeautifulSoup


def can_access(url):
    mod = 'can_access'
    answer = "U"
    response = None
    try:
        response = requests.get(url, timeout=5)
        current_page = BeautifulSoup(response.text, 'lxml')  # parse the body with lxml
        answer = "SL"
    except requests.exceptions.ConnectionError:
        print("ERROR: Page is inaccessible, return U and move to next case.")
    except requests.exceptions.Timeout as e:
        print(e)
    except requests.TooManyRedirects as e:
        print(e)
    except requests.exceptions.ChunkedEncodingError as e:
        print(e)
    except socket.error as e:
        print(e)
    return answer, response, mod
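requests raises TooManyRedirects once a redirect chain exceeds Session.max_redirects (30 by default). Below is a minimal sketch of how the error can be provoked and handled the same way as in can_access; the httpbin.org endpoint is just a convenient public redirect generator.

import requests

session = requests.Session()
session.max_redirects = 2  # lower the default limit of 30 so the error triggers quickly

try:
    # /redirect/5 answers with five consecutive redirects, exceeding the limit above
    session.get('https://httpbin.org/redirect/5', timeout=5)
except requests.TooManyRedirects as e:
    print(e)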
Example source code for Python's TooManyRedirects() class
import sys

import requests


def can_access(url):
    mod = 'can_access'
    answer = "U"
    response = None
    try:
        response = requests.get(url, timeout=5)
        answer = "SL"
    except:
        # catch-all variant: log only the exception type and keep the "U" answer
        print(sys.exc_info()[0])
    """
    except requests.exceptions.ConnectionError as e:
        print(e)
    except requests.exceptions.Timeout as e:
        print(e)
    except requests.TooManyRedirects as e:
        print(e)
    except requests.exceptions.ChunkedEncodingError as e:
        print(e)
    except requests.exceptions.ContentDecodingError as e:
        print(e)
    except socket.error as e:
        print(e)
    """
    return answer, response, mod
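A short usage sketch for either can_access variant; the URL list below is purely illustrative.

urls = ['https://example.com', 'http://no-such-host.invalid']

for url in urls:
    answer, response, mod = can_access(url)
    if answer == "SL":
        print(mod, url, 'reachable with status', response.status_code)
    else:
        print(mod, url, 'unreachable ("U"), moving to the next case')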
import time
import urllib.request

import numpy
import requests
from bs4 import BeautifulSoup

# User_Agents (a list of request-header dicts) and page_parser() are expected to
# be defined elsewhere in the module.


def movie_spider(movie_tag):
    page_num = 0
    movie_list = list()
    try_times = 0
    while True:
        url = 'https://www.douban.com/tag/' + urllib.request.quote(movie_tag) + '/movie?start=' + str(page_num * 15)
        time.sleep(numpy.random.rand() * 5)  # Hang up the thread to avoid requesting too frequently
        try:
            req = requests.get(url, headers=User_Agents[page_num % len(User_Agents)], timeout=50)
            req.raise_for_status()
            req.encoding = req.apparent_encoding
            source_code = req.text
            plain_text = str(source_code)
        except (requests.HTTPError, requests.URLRequired, requests.Timeout, requests.TooManyRedirects) as error:
            print(error)
            continue
        soup = BeautifulSoup(plain_text, 'lxml')
        list_soup = soup.find('div', attrs={'class': 'mod movie-list'})
        try_times += 1
        if list_soup is None and try_times < 200:
            continue
        elif list_soup is None or len(list_soup) <= 1:
            break  # no information returned after 200 attempts
        for movie_info in list_soup.findAll('dd'):
            page_parser(movie_info, movie_list)
        try_times = 0  # reset to 0 once valid information is received
        page_num += 1
        print("Downloading Information From Tag: {1} Page: {0}".format(page_num, movie_tag))
    print('Finish Catching Tag -> {0}'.format(movie_tag))
    return movie_list
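A hedged usage sketch; it assumes the module-level User_Agents list and page_parser() exist, and the tag value is only an example.

if __name__ == '__main__':
    movies = movie_spider('科幻')  # example Douban tag: science fiction
    print('collected %d movies' % len(movies))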
import requests

# logger, infos_from_giturl() and raw_content_url() are defined elsewhere in the
# editolido module.


def get_latest_version_infos(url, filename='data/.editolido.cfg.json'):
    infos = infos_from_giturl(url)
    jsonurl = raw_content_url(url, filename, branch_or_tag=infos['branch'])
    logger.info('downloading %s' % jsonurl)
    try:
        r = requests.get(jsonurl, verify=True, timeout=(3.1, 27))
        r.raise_for_status()
        data = r.json()
        r.close()
    except requests.HTTPError:
        # noinspection PyUnboundLocalVariable
        logger.error('status code %s' % r.status_code)
        raise
    except requests.Timeout:  # pragma no cover
        logger.error('download timeout... aborting update')
        raise
    except requests.ConnectionError:  # pragma no cover
        logger.error('download connection error... aborting update')
        raise
    except requests.TooManyRedirects:  # pragma no cover
        logger.error('too many redirects... aborting update')
        raise
    except requests.exceptions.RequestException:  # pragma no cover
        logger.error('download fail... aborting update')
        raise
    return data
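A usage sketch, assuming the editolido helpers behave as above; the repository URL is only illustrative.

try:
    data = get_latest_version_infos('https://github.com/flyingeek/editolido')
except requests.RequestException:
    data = None  # update aborted, keep the currently installed version

if data is not None:
    print('remote config keys:', sorted(data))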
# url_request is a method of a crawler class: self.session, self.default_headers
# and self.logger belong to that class, while gl and HttpQuint come from the
# surrounding module; urllib.parse and requests are imported there.
def url_request(self, method_, url_, payloads_, headers_):
    """
    @params: method, http method, 'GET', 'POST', 'PUT', 'DELETE' or 'HEAD'
             url, string, absolute url of the resource on the target webserver
             payloads, dict or None, extra data to send when visiting the resource
             headers, dict or None, custom headers
    @return: if the http status is 200, returns an HttpQuint holding [data], url, payloads and headers;
             on a connection error or a non-200 status, returns None
    """
    try:
        header = self.default_headers if headers_ is None else headers_
        payloads = urllib.parse.urlencode(payloads_) if payloads_ is not None else None
        if payloads is not None:
            pass
        rqst = self.session.request(method=method_, url=url_, params=payloads, headers=header, timeout=10)
        if 'Set-Cookie' in rqst.headers or 'Set-Cookie2' in rqst.headers:
            self.session.cookies.save(ignore_discard=True)
        if rqst.status_code != 200:
            # retry once before giving up on this URL
            rqst = self.session.request(method=method_, url=url_, params=payloads, headers=header, timeout=10)
            if rqst.status_code != 200:
                gl.g_fail_url.warning('%s %s' % (url_, str(payloads_)))
                return None
        return HttpQuint(url_, headers_, payloads_, [rqst.content], rqst.headers)
        # return [[rqst.content], method_, url_, payloads_, headers_]
    except (requests.HTTPError, requests.Timeout, requests.ConnectionError, requests.TooManyRedirects) as e:
        tips = '%s when visiting %s' % (e, url_) if payloads_ is None \
            else '%s when visiting %s with data %s' % (e, url_, str(payloads_))
        self.logger.error(tips)
        gl.g_fail_url.warning('%s %s' % (url_, str(payloads_)))
        return None
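A usage sketch, assuming an instance of the surrounding crawler class is available; the client name and URL are illustrative.

result = client.url_request('GET', 'https://example.com/api/items', {'page': 1}, None)
if result is None:
    print('request failed twice; the URL went to the failure log')
else:
    print('got an HttpQuint wrapping the raw response body and headers')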
import time
import urllib.request

import numpy
import requests
from bs4 import BeautifulSoup

# User_Agents (a list of request-header dicts) is expected to be defined
# elsewhere in the module.


def book_spider(book_tag):
    page_num = 0
    book_list = list()
    try_times = 0
    while True:
        url = 'https://www.douban.com/tag/' + urllib.request.quote(book_tag) + '/book?start=' + str(page_num * 15)
        time.sleep(numpy.random.rand() * 5)  # Hang up the thread to avoid requesting too frequently
        try:
            source_code = requests.get(url, headers=User_Agents[page_num % len(User_Agents)], timeout=50).text
            plain_text = str(source_code)
        except (requests.HTTPError, requests.URLRequired, requests.Timeout, requests.TooManyRedirects) as error:
            print(error)
            continue
        soup = BeautifulSoup(plain_text, 'lxml')
        list_soup = soup.find('div', attrs={'class': 'mod book-list'})
        try_times += 1
        if list_soup is None and try_times < 200:
            continue
        elif list_soup is None or len(list_soup) <= 1:
            break  # no information returned after 200 attempts
        for book_info in list_soup.findAll('dd'):
            title = book_info.find('a', attrs={'class': 'title'}).string.strip()
            desc = book_info.find('div', attrs={'class': 'desc'}).string.strip()
            desc_list = desc.split('/')
            book_url = book_info.find('a', attrs={'class': 'title'}).get('href')
            try:
                author_info = '/'.join(desc_list[0:-3])
            except:
                author_info = ' ??'
            try:
                pub_info = '/'.join(desc_list[-3:])
            except:
                pub_info = ' ??'
            try:
                rating = book_info.find('span', {'class': 'rating_nums'}).string.strip()
            except:
                rating = '0.0'
            book_list.append([title, rating, author_info, pub_info])
        try_times = 0  # reset to 0 once valid information is received
        page_num += 1
        print("Downloading Information From Tag: {1} Page: {0}".format(page_num, book_tag))
    print('Finish Catching Tag -> {0}'.format(book_tag))
    return book_list
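As with movie_spider, a short illustrative usage; each result row is [title, rating, author_info, pub_info].

if __name__ == '__main__':
    books = book_spider('编程')  # example Douban tag: programming
    for title, rating, author_info, pub_info in books:
        print(title, rating, author_info, pub_info, sep=' | ')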
# deco_log is a method of a logging helper class: self.init() and
# self.fun_log_type() are defined on that class, and time, traceback and
# requests are imported at module level.
def deco_log(self, log_name, fun_name, check_error=False):
    """
    :param fun_name: name of the decorated function, used in the log messages
    :param log_name: log file name; e.g. sys.argv[0][0:-3] + '.log' gives every .py script its own log
    :param check_error: whether to catch and classify exceptions, True or False, default False
    :return: the decorator that wraps the target function
    """
    # initialise the log file for the current script
    self.init(log_name)
    # Python 2 has no nonlocal, so one-element lists let the closure update these values
    status = [1]
    msg = [1]
    if check_error:
        def log(func):
            def record(*args, **kwargs):
                try:
                    t0 = time.time()
                    back = func(*args, **kwargs)
                    # elapsed time of the wrapped call
                    run_time = time.time() - t0
                    # success: record an info-level entry with the run time
                    status[0] = 2
                    msg[0] = "%s finished normally in %s s" % (fun_name, run_time)
                    return back
                # sys._getframe().f_code.co_name would give the current function name
                except IndexError as e:
                    status[0] = 3
                    msg[0] = "Function %s raised IndexError\nError: %s\nTraceback:\n%s" % (fun_name, e, traceback.format_exc())
                except requests.ConnectionError as e:
                    status[0] = 4
                    msg[0] = "Function %s hit a network problem such as a DNS failure or refused connection\nError: %s\nTraceback:\n%s" % (fun_name, e, traceback.format_exc())
                except requests.TooManyRedirects as e:
                    status[0] = 4
                    msg[0] = "Function %s exceeded the maximum number of redirects\nError: %s\nTraceback:\n%s" % (fun_name, e, traceback.format_exc())
                except requests.HTTPError as e:
                    status[0] = 4
                    msg[0] = "Function %s received a non-200 HTTP response\nError: %s\nTraceback:\n%s" % (fun_name, e, traceback.format_exc())
                except requests.RequestException as e:
                    status[0] = 4
                    msg[0] = "Function %s raised another requests exception\nError: %s\nTraceback:\n%s" % (fun_name, e, traceback.format_exc())
                except Exception as e:
                    status[0] = 5
                    msg[0] = "Function %s raised an unexpected exception\nError: %s\nTraceback:\n%s" % (fun_name, e, traceback.format_exc())
                finally:
                    self.fun_log_type(status[0], msg[0])
            return record
    else:
        def log(func):
            return func
    return log
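A sketch of how the decorator might be applied, assuming an instance of the logging helper called my_logger; the fetch() function and URL are illustrative.

import sys

import requests

log_file = sys.argv[0][0:-3] + '.log'  # one log file per script, as suggested above

@my_logger.deco_log(log_file, 'fetch', check_error=True)
def fetch(url):
    return requests.get(url, timeout=5).text

fetch('https://example.com')  # successes and classified failures both land in the log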