def procdata_getencoding(seed,headers,content):
code = utils.get_encoding_from_headers(headers)
if code:
if code.lower() == 'gbk' or code.lower() == 'gb2312':
code = 'gbk'
elif code.lower() == 'utf-8':
code = 'utf-8'
else:
code = None
if code == None:
code = utils.get_encodings_from_content(content)
print "content",seed,code
if code:
code = code[0]
if code.lower() == 'gbk' or code.lower() == 'gb2312':
code = 'gbk'
return code
# Python example code using get_encoding_from_headers()
def procdata_getencoding(seed,headers,content):
code = utils.get_encoding_from_headers(headers)
if code:
if code.lower() == 'gbk' or code.lower() == 'gb2312':
code = 'gbk'
elif code.lower() == 'utf-8':
code = 'utf-8'
else:
code = None
if code == None:
code = utils.get_encodings_from_content(content)
print "content",seed,code
if code:
code = code[0]
if code.lower() == 'gbk' or code.lower() == 'gb2312':
code = 'gbk'
return code
def procdata_getencoding(seed,headers,content):
code = utils.get_encoding_from_headers(headers)
if code:
if code.lower() == 'gbk' or code.lower() == 'gb2312':
code = 'gbk'
elif code.lower() == 'utf-8':
code = 'utf-8'
else:
code = None
if code == None:
code = utils.get_encodings_from_content(content)
print "content",seed,code
if code:
code = code[0]
if code.lower() == 'gbk' or code.lower() == 'gb2312':
code = 'gbk'
return code
def make_response(status_code: int = 200,
                  content: bytes = b'',
                  headers: dict = None,
                  reason: str = None,
                  encoding: str = None,
                  ) -> Response:
    """Build a pre-consumed requests Response for stubbing/testing.

    The body is marked as already consumed so accessing ``.text`` /
    ``.content`` never touches the network.  When *encoding* is not
    given, it is derived from the Content-Type header.
    """
    resp = Response()
    resp.status_code = status_code
    resp._content = content
    resp._content_consumed = True
    resp.headers = CaseInsensitiveDict(headers or {})
    if encoding:
        resp.encoding = encoding
    else:
        resp.encoding = get_encoding_from_headers(headers or {})
    resp.reason = reason
    return resp
def encoding(self):
    """
    encoding of Response.content.

    If Response.encoding is None the codec is guessed, in order, from
    the Content-Type header, charsets declared inside the content, and
    chardet (when installed).  The result is cached on the instance.
    """
    if hasattr(self, '_encoding'):
        return self._encoding

    # Content that is already unicode needs no codec at all.
    if isinstance(self.content, six.text_type):
        return 'unicode'

    # 1) charset from the Content-Type header.  requests reports
    #    ISO-8859-1 as a blind HTTP default, so treat it as unknown.
    guess = get_encoding_from_headers(self.headers)
    if guess == 'ISO-8859-1':
        guess = None

    # 2) charset declared inside the content (e.g. <meta charset=...>).
    if not guess and get_encodings_from_content:
        if six.PY3:
            declared = get_encodings_from_content(
                utils.pretty_unicode(self.content[:100]))
        else:
            declared = get_encodings_from_content(self.content)
        guess = declared[0] if declared and declared[0] else None

    # 3) statistical detection as the last resort.
    if not guess and chardet is not None:
        guess = chardet.detect(self.content)['encoding']

    # gb18030 is a superset of gb2312 and decodes more pages safely.
    if guess and guess.lower() == 'gb2312':
        guess = 'gb18030'

    self._encoding = guess or 'utf-8'
    return self._encoding
def filter_encoding(self,seed, headers,content):
code = utils.get_encoding_from_headers(headers)
if code:
if code.lower() == 'gbk' or code.lower() == 'gb2312':
code = 'gbk'
return True
elif code.lower() == 'utf-8' or code.lower() == 'utf8':
code = 'utf8'
# as for utf8, we should check the content
else: # 'ISO-8859-1' and so on,
code = None
# chinese website may also miss the content-encoding header, so detect the content
if code == None:
codes = utils.get_encodings_from_content(content)
if codes:
for code in codes:
if code.lower() in [ 'gbk','gb2312']:
return True
elif code.lower() == 'utf8' or code.lower() == 'utf-8':
code = 'utf8'
break
if code != 'utf8':
return False
# here handle utf8
# to detect any chinese char win
try:
ucon = content.decode('utf8')
for uchar in ucon:
i = ord(uchar)
if i >= 0x4e00 and i <= 0x9fa5:
return True
except Exception, e:
print url, e
pass
return False
def get_unicode_from_response(response):
    """Return the requested content back in unicode.

    The charset from the Content-Type header is tried first.  Failing
    that, each charset declared inside the HTML content (via
    :func:`requests_toolbelt.utils.deprecated.get_encodings_from_content`)
    is tried in order.  As a last resort the header charset is retried
    with ``errors='replace'``, then ``response.text`` is returned.

    :param response: Response object to get unicode content from.
    :type response: requests.models.Response
    """
    tried = set()
    # Candidate #1: charset advertised in the Content-Type header.
    encoding = utils.get_encoding_from_headers(response.headers)
    if encoding:
        try:
            return str(response.content, encoding)
        except UnicodeError:
            tried.add(encoding.lower())
    # Candidates #2..n: charsets declared inside the document itself.
    for candidate in get_encodings_from_content(response.content):
        candidate = candidate.lower()
        if candidate not in tried:
            try:
                return str(response.content, candidate)
            except UnicodeError:
                tried.add(candidate)
    # Last resort: lossy decode with the header charset.
    if encoding:
        try:
            return str(response.content, encoding, errors='replace')
        except TypeError:
            pass
    return response.text
def get_unicode_from_response(response):
    """Decode ``response.content`` to unicode.

    Tries, in order: the header charset (strict), each charset declared
    in the document content (strict), the header charset with
    replacement characters, and finally ``response.text``.

    .. code-block:: python
        import requests
        from requests_toolbelt.utils import deprecated
        r = requests.get(url)
        text = deprecated.get_unicode_from_response(r)

    :param response: Response object to get unicode content from.
    :type response: requests.models.Response
    """
    data = response.content
    failed = set()

    def attempt(codec):
        # Strict decode; remember the failure and signal it with None.
        try:
            return str(data, codec)
        except UnicodeError:
            failed.add(codec.lower())
            return None

    header_codec = utils.get_encoding_from_headers(response.headers)
    if header_codec:
        text = attempt(header_codec)
        if text is not None:
            return text
    for codec in get_encodings_from_content(data):
        codec = codec.lower()
        if codec in failed:
            continue
        text = attempt(codec)
        if text is not None:
            return text
    # Fall back: lossy decode, then whatever requests itself decided.
    if header_codec:
        try:
            return str(data, header_codec, errors='replace')
        except TypeError:
            pass
    return response.text
def get_unicode_from_response(response):
    """Return the response body as unicode text.

    Decoding proceeds in four phases: the Content-Type header charset,
    charsets declared inside the document
    (:func:`requests_toolbelt.utils.deprecated.get_encodings_from_content`),
    a lossy retry of the header charset, and finally ``response.text``.

    :param response: Response object to get unicode content from.
    :type response: requests.models.Response
    """
    raw = response.content
    exhausted = set()

    # Phase 1: charset from the Content-Type header, strict decode.
    charset = utils.get_encoding_from_headers(response.headers)
    if charset:
        try:
            return str(raw, charset)
        except UnicodeError:
            exhausted.add(charset.lower())

    # Phase 2: charsets declared inside the document, strict decode.
    for enc in [e.lower() for e in get_encodings_from_content(raw)]:
        if enc in exhausted:
            continue
        try:
            return str(raw, enc)
        except UnicodeError:
            exhausted.add(enc)

    # Phase 3: lossy decode with the header charset.
    if charset:
        try:
            return str(raw, charset, errors='replace')
        except TypeError:
            pass

    # Phase 4: whatever requests itself decided.
    return response.text