python类get_encoding_from_headers()的实例源码

light_daemon_maindomain.py 文件源码 项目:crawler_old 作者: salmonx 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def procdata_getencoding(seed,headers,content):

    code = utils.get_encoding_from_headers(headers)
    if code:
        if code.lower() == 'gbk' or code.lower() == 'gb2312':
            code = 'gbk'
        elif code.lower() == 'utf-8':
            code = 'utf-8'
        else:
            code = None

    if code == None:
        code = utils.get_encodings_from_content(content)
        print "content",seed,code
        if code:
            code = code[0]
            if code.lower() == 'gbk' or code.lower() == 'gb2312':
                code = 'gbk'

    return code
daemon.py 文件源码 项目:crawler_old 作者: salmonx 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def procdata_getencoding(seed,headers,content):

    code = utils.get_encoding_from_headers(headers)
    if code:
        if code.lower() == 'gbk' or code.lower() == 'gb2312':
            code = 'gbk'
        elif code.lower() == 'utf-8':
            code = 'utf-8'
        else:
            code = None

    if code == None:
        code = utils.get_encodings_from_content(content)
        print "content",seed,code
        if code:
            code = code[0]
            if code.lower() == 'gbk' or code.lower() == 'gb2312':
                code = 'gbk'

    return code
daemon.py 文件源码 项目:crawler_old 作者: salmonx 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def procdata_getencoding(seed,headers,content):

    code = utils.get_encoding_from_headers(headers)
    if code:
        if code.lower() == 'gbk' or code.lower() == 'gb2312':
            code = 'gbk'
        elif code.lower() == 'utf-8':
            code = 'utf-8'
        else:
            code = None

    if code == None:
        code = utils.get_encodings_from_content(content)
        print "content",seed,code
        if code:
            code = code[0]
            if code.lower() == 'gbk' or code.lower() == 'gb2312':
                code = 'gbk'

    return code
utils.py 文件源码 项目:rets 作者: opendoor-labs 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def make_response(status_code: int = 200,
                  content: bytes = b'',
                  headers: dict = None,
                  reason: str = None,
                  encoding: str = None,
                  ) -> Response:
    """Build a ``Response`` object in memory from the given parts.

    :param status_code: value stored on ``response.status_code``.
    :param content: raw body stored on ``response._content``.
    :param headers: header mapping; ``None`` is treated as no headers. It is
        wrapped in a ``CaseInsensitiveDict``.
    :param reason: value stored on ``response.reason``.
    :param encoding: explicit encoding for the response. When falsy, the
        encoding is taken from the headers via ``get_encoding_from_headers``.
    :return: the populated ``Response``.
    """
    response = Response()
    response.status_code = status_code
    response._content = content
    # The body is supplied up front, so flag it as already consumed.
    response._content_consumed = True
    response.headers = CaseInsensitiveDict(headers or {})
    # Resolve the charset through the case-insensitive mapping rather than the
    # caller's raw dict, so a header spelled 'Content-Type' is found as well.
    # NOTE(review): assumes get_encoding_from_headers looks the header up by
    # its lower-case name, as the requests helper does -- confirm.
    response.encoding = encoding or get_encoding_from_headers(response.headers)
    response.reason = reason
    return response
response.py 文件源码 项目:fulmar 作者: tylderen 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def encoding(self):
        """
        Encoding of ``Response.content``.

        The answer is computed once and cached on ``self._encoding``; later
        calls return the cached value.  Unicode content short-circuits to
        ``'unicode'`` (not cached).  Otherwise the encoding is looked for, in
        order, in the headers, in the content itself, and through chardet
        when that module is available.  ``'utf-8'`` is used when every step
        comes up empty.
        """
        if hasattr(self, '_encoding'):
            return self._encoding

        # Already-decoded content has no byte encoding to detect.
        if isinstance(self.content, six.text_type):
            return 'unicode'

        # Step 1: charset from the headers; 'ISO-8859-1' counts as "unknown".
        guessed = get_encoding_from_headers(self.headers)
        if guessed == 'ISO-8859-1':
            guessed = None

        # Step 2: charset declared inside the content.
        if not guessed and get_encodings_from_content:
            if six.PY3:
                # On Python 3 only the first 100 items of content are inspected.
                sample = utils.pretty_unicode(self.content[:100])
                candidates = get_encodings_from_content(sample)
            else:
                candidates = get_encodings_from_content(self.content)
            guessed = (candidates[0] or None) if candidates else None

        # Step 3: statistical detection, when chardet is available.
        if not guessed and chardet is not None:
            guessed = chardet.detect(self.content)['encoding']

        # gb2312 is replaced by gb18030.
        if guessed and guessed.lower() == 'gb2312':
            guessed = 'gb18030'

        self._encoding = guessed or 'utf-8'
        return self._encoding
worker_filter.py 文件源码 项目:crawler_old 作者: salmonx 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def filter_encoding(self,seed, headers,content):
        """Tell whether a fetched page should be kept as a Chinese page.

        A page is accepted when its charset is GBK/GB2312 (from the headers
        or declared in the content), or when it is UTF-8 and the decoded
        content contains at least one character in U+4E00..U+9FA5.

        :param seed: value identifying the page; only used in the message
            printed when decoding fails.
        :param headers: response headers handed to
            ``utils.get_encoding_from_headers``.
        :param content: raw page body; decoded as UTF-8 for the character scan.
        :return: ``True`` when the page is accepted, ``False`` otherwise
            (including when the content cannot be decoded).
        """
        gbk_names = ('gbk', 'gb2312')
        utf8_names = ('utf-8', 'utf8')

        code = utils.get_encoding_from_headers(headers)
        if code:
            lowered = code.lower()
            if lowered in gbk_names:
                return True
            # UTF-8 still has to be confirmed by scanning the content; any
            # other header charset ('ISO-8859-1' and so on) is ignored.
            code = 'utf8' if lowered in utf8_names else None

        # Chinese sites may omit the charset in the headers, so inspect the
        # charsets declared in the content as well.
        if code is None:
            codes = utils.get_encodings_from_content(content)
            for declared in codes or ():
                lowered = declared.lower()
                if lowered in gbk_names:
                    return True
                if lowered in utf8_names:
                    code = 'utf8'
                    break

        if code != 'utf8':
            return False

        # UTF-8: accept only if some character falls in U+4E00..U+9FA5.
        try:
            ucon = content.decode('utf8')
        except Exception as e:
            # Best effort: report the failing seed and reject the page. The
            # handler used to print an undefined name `url`, which turned
            # every decode failure into a NameError.
            print("%s %s" % (seed, e))
            return False
        return any(0x4e00 <= ord(uchar) <= 0x9fa5 for uchar in ucon)
deprecated.py 文件源码 项目:flickr_downloader 作者: Denisolt 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def get_unicode_from_response(response):
    """Return the requested content back in unicode.

    This will first attempt to retrieve the encoding from the response
    headers. If that fails, it will use
    :func:`requests_toolbelt.utils.deprecated.get_encodings_from_content`
    to determine encodings from HTML elements. If no candidate decodes the
    body cleanly, the header encoding is retried with undecodable bytes
    replaced, and ``response.text`` is the last resort.

    An encoding name that Python has no codec for is treated like a failed
    decode: the next candidate is tried instead of raising ``LookupError``.

    .. code-block:: python

        import requests
        from requests_toolbelt.utils import deprecated

        r = requests.get(url)
        text = deprecated.get_unicode_from_response(r)

    :param response: Response object to get unicode content from.
    :type response: requests.models.Response
    :returns: the decoded body.
    """
    tried_encodings = set()

    # Try charset from content-type
    encoding = utils.get_encoding_from_headers(response.headers)

    if encoding:
        try:
            return str(response.content, encoding)
        except (UnicodeError, LookupError):
            # LookupError: the announced charset is not a codec Python knows.
            tried_encodings.add(encoding.lower())

    encodings = get_encodings_from_content(response.content)

    for _encoding in encodings:
        _encoding = _encoding.lower()
        if _encoding in tried_encodings:
            # Already failed once; do not decode the body with it again.
            continue
        try:
            return str(response.content, _encoding)
        except (UnicodeError, LookupError):
            tried_encodings.add(_encoding)

    # Fall back: decode leniently with the header charset, if there was one.
    if encoding:
        try:
            return str(response.content, encoding, errors='replace')
        except (TypeError, LookupError):
            pass
    return response.text
deprecated.py 文件源码 项目:Liljimbo-Chatbot 作者: chrisjim316 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def get_unicode_from_response(response):
    """Return the requested content back in unicode.

    This will first attempt to retrieve the encoding from the response
    headers. If that fails, it will use
    :func:`requests_toolbelt.utils.deprecated.get_encodings_from_content`
    to determine encodings from HTML elements. If no candidate decodes the
    body cleanly, the header encoding is retried with undecodable bytes
    replaced, and ``response.text`` is the last resort.

    An encoding name that Python has no codec for is treated like a failed
    decode: the next candidate is tried instead of raising ``LookupError``.

    .. code-block:: python

        import requests
        from requests_toolbelt.utils import deprecated

        r = requests.get(url)
        text = deprecated.get_unicode_from_response(r)

    :param response: Response object to get unicode content from.
    :type response: requests.models.Response
    :returns: the decoded body.
    """
    tried_encodings = set()

    # Try charset from content-type
    encoding = utils.get_encoding_from_headers(response.headers)

    if encoding:
        try:
            return str(response.content, encoding)
        except (UnicodeError, LookupError):
            # LookupError: the announced charset is not a codec Python knows.
            tried_encodings.add(encoding.lower())

    encodings = get_encodings_from_content(response.content)

    for _encoding in encodings:
        _encoding = _encoding.lower()
        if _encoding in tried_encodings:
            # Already failed once; do not decode the body with it again.
            continue
        try:
            return str(response.content, _encoding)
        except (UnicodeError, LookupError):
            tried_encodings.add(_encoding)

    # Fall back: decode leniently with the header charset, if there was one.
    if encoding:
        try:
            return str(response.content, encoding, errors='replace')
        except (TypeError, LookupError):
            pass
    return response.text
deprecated.py 文件源码 项目:OctoFusion 作者: tapnair 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def get_unicode_from_response(response):
    """Return the requested content back in unicode.

    This will first attempt to retrieve the encoding from the response
    headers. If that fails, it will use
    :func:`requests_toolbelt.utils.deprecated.get_encodings_from_content`
    to determine encodings from HTML elements. If no candidate decodes the
    body cleanly, the header encoding is retried with undecodable bytes
    replaced, and ``response.text`` is the last resort.

    An encoding name that Python has no codec for is treated like a failed
    decode: the next candidate is tried instead of raising ``LookupError``.

    .. code-block:: python

        import requests
        from requests_toolbelt.utils import deprecated

        r = requests.get(url)
        text = deprecated.get_unicode_from_response(r)

    :param response: Response object to get unicode content from.
    :type response: requests.models.Response
    :returns: the decoded body.
    """
    tried_encodings = set()

    # Try charset from content-type
    encoding = utils.get_encoding_from_headers(response.headers)

    if encoding:
        try:
            return str(response.content, encoding)
        except (UnicodeError, LookupError):
            # LookupError: the announced charset is not a codec Python knows.
            tried_encodings.add(encoding.lower())

    encodings = get_encodings_from_content(response.content)

    for _encoding in encodings:
        _encoding = _encoding.lower()
        if _encoding in tried_encodings:
            # Already failed once; do not decode the body with it again.
            continue
        try:
            return str(response.content, _encoding)
        except (UnicodeError, LookupError):
            tried_encodings.add(_encoding)

    # Fall back: decode leniently with the header charset, if there was one.
    if encoding:
        try:
            return str(response.content, encoding, errors='replace')
        except (TypeError, LookupError):
            pass
    return response.text


问题


面经


文章

微信
公众号

扫码关注公众号