def _get_html(cls, html, url, html_etree, params, **kwargs):
if html:
html = etree.HTML(html)
elif url:
if not kwargs.get('headers', None):
kwargs['headers'] = {
"User-Agent": get_random_user_agent()
}
response = requests.get(url, params, **kwargs)
response.raise_for_status()
content = response.content
charset = cchardet.detect(content)
text = content.decode(charset['encoding'])
html = etree.HTML(text)
elif html_etree is not None:
return html_etree
else:
raise ValueError("html(url or html_etree) is expected")
return html
python类detect()的实例源码
def encoding_detect(byte_content):
    """
    Guess the text encoding of ``byte_content``.

    The checks run in this order:

    1. if the module-level ``force_decode_remote_using_encode`` is not
       ``None`` it is returned as-is;
    2. each entry of ``possible_charsets`` is tried in turn and the first
       one that decodes ``byte_content`` without an error is returned;
    3. if ``cchardet_available`` is truthy, the ``'encoding'`` value
       reported by ``c_chardet`` is returned.

    :param byte_content: raw bytes whose encoding is wanted
    :type byte_content: bytes
    :return: the encoding name, or None when no step produced one
    :rtype: Union[str, None]
    """
    if force_decode_remote_using_encode is not None:
        return force_decode_remote_using_encode

    if possible_charsets:
        for charset in possible_charsets:
            try:
                byte_content.decode(encoding=charset)
            except Exception:
                # Best-effort probe: any failure simply means "not this
                # charset". Unlike a bare ``except:``, this lets
                # KeyboardInterrupt and SystemExit propagate.
                continue
            return charset

    if cchardet_available:  # detect the encoding using cchardet (if we have)
        return c_chardet(byte_content)['encoding']

    return None
def clone_url(url):
    """Download a page and return its HTML text.

    When the decoded text contains no ``<base`` substring, a
    ``<base href="...">`` tag pointing at ``url`` is inserted right after
    every literal ``<head>`` occurrence.

    :param url: url to clone; ``http://`` is prepended when it contains no
        ``://``
    :return: the decoded HTML text of the page
    """
    # get html
    if '://' not in url:
        url = 'http://' + url
    r = requests.get(url)
    # We don't trust requests encoding so we use cchardet
    # to detect real encoding
    # Without it we got decode error (for example: baidu.com)
    #
    # cchardet reports None when it cannot make a guess (for example on an
    # empty body); decode(None) would raise TypeError, so default to UTF-8.
    r.encoding = cchardet.detect(r.content)['encoding'] or 'utf-8'
    html = r.content.decode(r.encoding)
    # set relative url rule
    if '<base' not in html:
        html = html.replace('<head>', '<head><base href="%s" />' % url)
    return html
def extract(filename,key_part=['# ??']):
    """Build a ``Document`` holding word counts for selected lines of a file.

    ``get_text(filename)`` supplies the lines. For every entry of
    ``key_part`` the line that directly follows the first occurrence of that
    entry is decoded as UTF-8, split on whitespace for counting, and also
    appended (followed by one space) to a running text.

    NOTE(review): the original comments and the default key were garbled
    ('?') in this copy of the file, so their intended meaning is unknown.
    The default list is shared between calls but is never mutated here.

    :param filename: passed to ``get_text`` and on to ``Document``
    :param key_part: marker lines whose following line is extracted
    :return: ``Document(word_counts, filename, text)``
    :raises ValueError: from ``list.index`` when a marker is not present
    """
    content = get_text(filename)
    tokens = []
    joined = ''
    for marker in key_part:
        following = content[content.index(marker) + 1]
        decoded = following.decode('utf-8')
        tokens.extend(decoded.split())
        joined = joined + decoded + ' '

    # Tally how many times each token occurs.
    counts = {}
    for token in tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return Document(counts, filename, joined)
def extract(lines,filename,key_part=['# ??']):
    """Build a ``Document`` holding word counts for selected entries of ``lines``.

    For every entry of ``key_part`` the position of its first occurrence in
    ``lines`` is printed, and the entry that directly follows it is decoded
    as UTF-8 and split on whitespace. Any exception raised while handling a
    marker is printed and that marker is skipped.

    NOTE(review): the original comments and the default key were garbled
    ('?') in this copy of the file, so their intended meaning is unknown.
    The default list is shared between calls but is never mutated here.

    :param lines: sequence searched with ``lines.index``
    :param filename: passed through to ``Document``
    :param key_part: marker entries whose following entry is extracted
    :return: ``Document(word_counts, filename)``
    """
    tokens = []
    for marker in key_part:
        try:
            position = lines.index(marker)
            print(position)
            tokens.extend(lines[position + 1].decode('utf-8').split())
        except Exception as err:
            print(err)

    # Tally how many times each token occurs.
    counts = {}
    for token in tokens:
        counts[token] = 1 + counts.get(token, 0)
    return Document(counts, filename)
def requests_target_fetch(url):
    """
    Download ``url`` and return its body decoded to text.

    The request carries a ``user-agent`` header from
    ``get_random_user_agent()``. The body is decoded with the encoding that
    ``cchardet.detect`` reports. Any exception is logged through
    ``LOGGER.exception`` and turned into a ``None`` result.

    :param url: address to fetch
    :return: the decoded text, or None when anything failed
    """
    try:
        ua_headers = {'user-agent': get_random_user_agent()}
        # NOTE(review): verify=False is passed to requests.get, which turns
        # off TLS certificate verification -- confirm this is intended.
        resp = requests.get(url=url, headers=ua_headers, verify=False)
        resp.raise_for_status()
        raw = resp.content
        detected = cchardet.detect(raw)
        return raw.decode(detected['encoding'])
    except Exception as e:
        LOGGER.exception(e)
        return None
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``cchardet.detect(s)``."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``chardet.detect(s)``."""
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``cchardet.detect(s)``."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``chardet.detect(s)``."""
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``cchardet.detect(s)``."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``chardet.detect(s)``."""
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``cchardet.detect(s)``."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``chardet.detect(s)``."""
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``cchardet.detect(s)``."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``chardet.detect(s)``."""
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``cchardet.detect(s)``."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``chardet.detect(s)``."""
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``cchardet.detect(s)``."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``chardet.detect(s)``."""
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``cchardet.detect(s)``."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``chardet.detect(s)``."""
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``cchardet.detect(s)``."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``chardet.detect(s)``."""
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``cchardet.detect(s)``."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``chardet.detect(s)``."""
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``cchardet.detect(s)``."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``chardet.detect(s)``."""
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``cchardet.detect(s)``."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``chardet.detect(s)``."""
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1