Example usages of the Python class requests.TooManyRedirects()

can_access.py 文件源码 项目:phishing-detection 作者: mjkim610 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def can_access(url):
    """Probe *url* with a 5-second GET and classify its reachability.

    :param url: absolute URL to test
    :return: tuple ``(answer, response, mod)`` where *answer* is ``"SL"``
             when the page was fetched successfully and ``"U"`` (unknown /
             unreachable) otherwise, *response* is the ``requests.Response``
             or ``None``, and *mod* is this module's tag for the caller.
    """
    mod = 'can_access'
    answer = "U"
    response = None
    try:
        response = requests.get(url, timeout=5)
        # NOTE: the original also built an unused tuple
        # ``(response.text, 'lxml')`` here — dead code (apparently leftover
        # BeautifulSoup arguments), removed.
        answer = "SL"
    except requests.exceptions.ConnectionError:
        print("ERROR: Page is inaccessible, return U and move to next case.")
    except requests.exceptions.Timeout as e:
        print(e)
    except requests.TooManyRedirects as e:
        print(e)
    except requests.exceptions.ChunkedEncodingError as e:
        print(e)
    except socket.error as e:
        print(e)
    return answer, response, mod
can_access.py 文件源码 项目:phishing-detection 作者: mjkim610 项目源码 文件源码 阅读 34 收藏 0 点赞 0 评论 0
def can_access(url):
    """Probe *url* with a 5-second GET and classify its reachability.

    Best-effort variant: any request failure (connection error, timeout,
    redirect loop, decoding error, ...) is logged and leaves *answer* as
    ``"U"``.

    :param url: absolute URL to test
    :return: tuple ``(answer, response, mod)`` — see the sibling
             ``can_access`` implementation for the meaning of each field.
    """
    mod = 'can_access'
    answer = "U"
    response = None
    try:
        response = requests.get(url, timeout=5)
        answer = "SL"
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate; the exception class is printed as before.
        print(sys.exc_info()[0])
    return answer, response, mod
DouBanMovie.py 文件源码 项目:DouBanCrawls 作者: SimonCqk 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def movie_spider(movie_tag):
    """Scrape douban.com movie listings for *movie_tag*.

    Fetches tag pages 15 entries at a time, handing each ``<dd>`` entry to
    ``page_parser`` which appends parsed records to the result list.
    Gives up after 200 consecutive attempts without usable data.

    :param movie_tag: douban tag string (URL-quoted before use)
    :return: the accumulated ``movie_list``
    """
    page_num = 0
    movie_list = list()
    try_times = 0
    while True:
        url = 'https://www.douban.com/tag/' + urllib.request.quote(movie_tag) + '/movie?start=' + str(page_num * 15)
        time.sleep(numpy.random.rand() * 5)  # Hang up the thread to avoid requesting too frequently
        try:
            req = requests.get(url, headers=User_Agents[page_num % len(User_Agents)], timeout=50)
            req.raise_for_status()
            req.encoding = req.apparent_encoding
            source_code = req.text
            plain_text = str(source_code)
        except (requests.HTTPError, requests.URLRequired, requests.Timeout, requests.TooManyRedirects) as error:
            print(error)
            # Count failed requests as attempts too; the original retried
            # forever here, which hangs on a permanently failing host.
            try_times += 1
            if try_times >= 200:
                break
            continue

        soup = BeautifulSoup(plain_text, 'lxml')
        list_soup = soup.find('div', attrs={'class': 'mod movie-list'})
        try_times += 1
        if list_soup is None and try_times < 200:
            continue  # empty page — retry (presumably rate-limiting; TODO confirm)
        elif list_soup is None or len(list_soup) <= 1:
            break  # No information returned after 200-time requesting

        for movie_info in list_soup.findAll('dd'):
            page_parser(movie_info, movie_list)
            try_times = 0  # set 0 when got valid information
        page_num += 1
        print("Downloading Information From Tag: {1} Page: {0} ".format(page_num, movie_tag))
    print('Finish Catching Tag -> {0}'.format(movie_tag))
    return movie_list
core.py 文件源码 项目:editolido 作者: flyingeek 项目源码 文件源码 阅读 44 收藏 0 点赞 0 评论 0
def get_latest_version_infos(url, filename='data/.editolido.cfg.json'):
    """Download and decode the JSON version-info file for a git repo URL.

    :param url: repository URL, parsed by ``infos_from_giturl``
    :param filename: path of the JSON file inside the repository
    :return: the decoded JSON payload
    :raises: re-raises any ``requests`` exception after logging it
    """
    infos = infos_from_giturl(url)
    json_url = raw_content_url(url, filename, branch_or_tag=infos['branch'])
    logger.info('downloading %s' % json_url)
    try:
        resp = requests.get(json_url, verify=True, timeout=(3.1, 27))
        resp.raise_for_status()
        payload = resp.json()
        resp.close()
    except requests.HTTPError:
        # raise_for_status() only fires once resp is bound
        # noinspection PyUnboundLocalVariable
        logger.error('status code %s' % resp.status_code)
        raise
    except requests.Timeout:  # pragma no cover
        logger.error('download timeout... aborting update')
        raise
    except requests.ConnectionError:  # pragma no cover
        logger.error('download connection error... aborting update')
        raise
    except requests.TooManyRedirects:  # pragma no cover
        logger.error('too many redirects... aborting update')
        raise
    except requests.exceptions.RequestException:  # pragma no cover
        # catch-all must stay last: the clauses above are its subclasses
        logger.error('download fail... aborting update')
        raise
    return payload
HttpClient.py 文件源码 项目:ZhihuScrapy 作者: wcsjtu 项目源码 文件源码 阅读 33 收藏 0 点赞 0 评论 0
def url_request(self, method_, url_, payloads_, headers_):
        """Issue an HTTP request through the shared session, retrying once.

        Python 2 code (``urllib.urlencode``, ``except ..., e`` syntax).

        :param method_: HTTP method string: 'GET', 'POST', 'PUT', 'DELETE' or 'HEAD'
        :param url_: absolute URL of the target resource
        :param payloads_: dict of extra request parameters, or None
        :param headers_: dict of custom headers, or None to use self.default_headers
        :return: ``HttpQuint(url, headers, payloads, [body], response_headers)``
                 on a 200 response; None on a caught request error or when the
                 status is still non-200 after one retry
        """
        try:
            # Fall back to the session's default headers when none are given.
            header = self.default_headers if headers_ is None else headers_
            payloads = urllib.urlencode(payloads_) if payloads_ is not None else None
            if payloads is not None:
                pass  # NOTE(review): dead branch — looks like a leftover debug hook
            rqst = self.session.request(method=method_, url=url_, params=payloads, headers=header, timeout=10) 
            # Persist cookies whenever the server sets them.
            if 'Set-Cookie' in rqst.headers or 'Set-Cookie2' in rqst.headers:
                self.session.cookies.save(ignore_discard=True)
            if rqst.status_code != 200:
                # One blind retry on any non-200 status; record and give up
                # if the second attempt also fails.
                rqst = self.session.request(method=method_, url=url_, params=payloads, headers=header, timeout=10)
                if rqst.status_code != 200:
                    gl.g_fail_url.warning('%s %s'%(url_, str(payloads_)))
                    return None
            return HttpQuint(url_, headers_, payloads_, [rqst.content], rqst.headers)
            #return [[rqst.content], method_, url_, payloads_, headers_]

        except (requests.HTTPError, requests.Timeout, requests.ConnectionError, requests.TooManyRedirects), e: 
            # Log the failure and record the URL in the global fail list.
            tips = '%s when visit %s '%(e, url_) if payloads_ is None else '%s when \
                    visit %s with data %s'%(e, url_, str(payloads_).decode('unicode_escape'))
            self.logger.error(tips)
            gl.g_fail_url.warning('%s %s'%(url_, str(payloads_)))
            return None
DouBanReading.py 文件源码 项目:DouBanCrawls 作者: SimonCqk 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def book_spider(book_tag):
    """Scrape douban.com book listings for *book_tag*.

    Fetches tag pages 15 entries at a time and extracts
    ``[title, rating, author_info, pub_info]`` records from each ``<dd>``.
    Gives up after 200 consecutive attempts without usable data.

    :param book_tag: douban tag string (URL-quoted before use)
    :return: the accumulated ``book_list``
    """
    page_num = 0
    book_list = list()
    try_times = 0
    while True:
        url = 'https://www.douban.com/tag/' + urllib.request.quote(book_tag) + '/book?start=' + str(page_num * 15)
        time.sleep(numpy.random.rand() * 5)  # Hang up the thread to avoid requesting too frequently
        try:
            source_code = requests.get(url, headers=User_Agents[page_num % len(User_Agents)], timeout=50).text
            plain_text = str(source_code)
        except (requests.HTTPError, requests.URLRequired, requests.Timeout, requests.TooManyRedirects) as error:
            print(error)
            # Count failed requests as attempts too; the original retried
            # forever here, which hangs on a permanently failing host.
            try_times += 1
            if try_times >= 200:
                break
            continue

        soup = BeautifulSoup(plain_text, 'lxml')
        list_soup = soup.find('div', attrs={'class': 'mod book-list'})
        try_times += 1
        if list_soup is None and try_times < 200:
            continue  # empty page — retry (presumably rate-limiting; TODO confirm)
        elif list_soup is None or len(list_soup) <= 1:
            break  # No information returned after 200-time requesting

        for book_info in list_soup.findAll('dd'):
            title = book_info.find('a', attrs={'class': 'title'}).string.strip()
            desc = book_info.find('div', attrs={'class': 'desc'}).string.strip()
            desc_list = desc.split('/')
            # NOTE: the unused local ``book_url`` from the original was removed.
            # Best-effort field extraction: find() may return None and .string
            # may be None, so narrow to the errors those actually raise.
            try:
                author_info = '/'.join(desc_list[0:-3])
            except (AttributeError, TypeError):
                author_info = ' ??'
            try:
                pub_info = '/'.join(desc_list[-3:])
            except (AttributeError, TypeError):
                pub_info = ' ??'
            try:
                rating = book_info.find('span', {'class': 'rating_nums'}).string.strip()
            except (AttributeError, TypeError):
                rating = '0.0'

            book_list.append([title, rating, author_info, pub_info])
            try_times = 0  # set 0 when got valid information
        page_num += 1
        print("Downloading Information From Tag: {1} Page: {0} ".format(page_num, book_tag))
    print('Finish Catching Tag -> {0}'.format(book_tag))
    return book_list
mylog.py 文件源码 项目:SpiderConfig 作者: brady-chen 项目源码 文件源码 阅读 33 收藏 0 点赞 0 评论 0
def deco_log(self, log_name, fun_name, check_error=False):
        """Build a logging decorator (Python 2 code — ``except X, e`` syntax).

        The original docstring and comments were mojibake; the descriptions
        below are inferred from the code — confirm against the project.

        :param log_name: log file name passed to ``self.init`` before anything
                         is logged (callers presumably derive it from
                         ``sys.argv[0]`` so each script gets its own log)
        :param fun_name: human-readable name of the decorated function,
                         interpolated into every log message
        :param check_error: when True, wrap the function so exceptions are
                            mapped to a log status/message; when False the
                            decorator returns the function unchanged
        :return: a decorator
        """
        # Initialise the log file for this script before any logging happens.
        self.init(log_name)
        # Python 2 has no ``nonlocal``: one-element lists emulate mutable
        # closure variables for status code and message.
        status = [1]
        msg = [1]
        if check_error:
            def log(func):
                def record(*args, **kwargs):
                    try:
                        t0 = time.time()
                        back = func(*args, **kwargs)
                        # Success path: status 2, message includes elapsed time.
                        run_time = time.time() - t0
                        status[0] = 2
                        msg[0] = "%s?????????%s?" %(fun_name, run_time)
                        return back
                    # Each exception class maps to a status: 3 for IndexError,
                    # 4 for the requests family, 5 for anything else.
                    except IndexError, e:
                        status[0] = 3
                        msg[0] = "???:%s???????\n???:%s\n?????:\n%s" % (fun_name, e, traceback.format_exc())
                    except requests.ConnectionError, e:
                        status[0] = 4
                        msg[0] = "???:%s???????????DNS??????????\n???:%s\n?????:\n%s" % (fun_name, e, traceback.format_exc())
                    except requests.TooManyRedirects, e:
                        status[0] = 4
                        msg[0] = "???:%s????????????????\n???:%s\n?????:\n%s" % (fun_name, e, traceback.format_exc())
                    except requests.HTTPError, e:
                        status[0] = 4
                        msg[0] = "???:%s????200?????\n???:%s\n?????:\n%s" % (fun_name, e, traceback.format_exc())
                    except requests.RequestException, e:
                        status[0] = 4
                        msg[0] = "???:%s???requests????\n???:%s\n?????:\n%s" % (fun_name, e, traceback.format_exc())
                    except Exception, e:
                        status[0] = 5
                        msg[0] = "???:%s????????\n???:%s\n?????:\n%s" % (fun_name, e, traceback.format_exc())
                    finally:
                        # Always emit the accumulated status/message, success or not.
                        self.fun_log_type(status[0], msg[0])
                return record
        else:
            def log(func):
                return func
        return log


问题


面经


文章

微信
公众号

扫码关注公众号