def filter_encoding(self, seed, headers, content):
    """Return True when the page appears to be Chinese, else False.

    The declared character encoding is checked first, the body second:

    * a GBK / GB2312 encoding, declared either in ``headers`` or inside
      ``content``, is accepted immediately;
    * a UTF-8 encoding is accepted only if the decoded body contains at
      least one character in the range U+4E00..U+9FA5;
    * any other encoding, or no detectable encoding, is rejected.

    Args:
        seed: Identifies the page being checked (presumably its URL or seed
            record -- confirm against the caller). It is only used in the
            message printed when the body cannot be decoded.
        headers: Response headers, handed to
            ``utils.get_encoding_from_headers``.
        content: Page body. Byte strings are decoded as UTF-8; anything else
            is scanned as already-decoded text.

    Returns:
        bool: True if the page is considered Chinese, False otherwise.
    """
    gb_names = ('gbk', 'gb2312')
    utf8_names = ('utf8', 'utf-8')

    declared = utils.get_encoding_from_headers(headers)
    # An empty value is treated the same as a missing header.
    encoding = declared.lower() if declared else None

    if encoding in gb_names:
        return True

    if encoding not in utf8_names:
        # Chinese sites often omit the charset header or send a generic one
        # such as ISO-8859-1, so fall back to the encodings declared inside
        # the body itself. The first recognised declaration wins.
        encoding = None
        for candidate in utils.get_encodings_from_content(content) or ():
            candidate = candidate.lower()
            if candidate in gb_names:
                return True
            if candidate in utf8_names:
                encoding = candidate
                break
        if encoding is None:
            return False

    # UTF-8 says nothing about the language, so look for an actual Chinese
    # character in the body.
    if not content:
        return False

    # ``bytes`` is an alias of ``str`` on Python 2, so this test separates raw
    # byte strings from already-decoded text on both major versions.
    if isinstance(content, bytes):
        try:
            text = content.decode('utf8')
        except UnicodeError as exc:
            # The original handler printed an undefined name ``url``, which
            # turned every decode failure into a NameError; report the seed.
            print('%s %s' % (seed, exc))
            return False
    else:
        text = content

    return any(0x4e00 <= ord(char) <= 0x9fa5 for char in text)
评论列表
文章目录