def decode_html(html_string):
    """Decode raw HTML markup into text using bs4's ``UnicodeDammit``.

    ``UnicodeDammit`` is asked to try the GB2312, GBK and GB18030 encodings
    first, then falls back to its own detection. Detection is heuristic, so
    the result is not guaranteed to be correct for every document.

    Args:
        html_string: The raw markup to decode. When the ISO-8859-2
            workaround below is taken, ``.decode`` is called on it, so it
            is expected to be ``bytes`` in that case.

    Returns:
        The decoded document text.

    Raises:
        UnicodeDecodeError: If ``UnicodeDammit`` produced no text, or if the
            forced UTF-8 decode in the ISO-8859-2 workaround fails.
    """
    dammit = UnicodeDammit(
        html_string,
        ['GB2312', 'GBK', 'GB18030'],
        smart_quotes_to="html",
        is_html=True,
    )
    doc = dammit.unicode_markup

    # Compare case-insensitively: the reported codec name may be upper- or
    # lower-case depending on the detector / bs4 version in use.
    detected = (dammit.original_encoding or "").lower()

    if detected == 'iso-8859-2':
        # FIXME: a document reported as ISO-8859-2 is treated as a
        # misdetection and decoded as UTF-8 instead.
        # NOTE(review): UTF-8 is hard-coded here; confirm whether a detected
        # encoding (e.g. from get_encoding) should be used instead.
        doc = html_string.decode("utf-8")
    elif not doc:
        # bs4 exposes ``tried_encodings`` as (encoding, errors) tuples; the
        # BeautifulSoup 3 spelling ``triedEncodings`` is kept as a fallback.
        tried = (
            getattr(dammit, "tried_encodings", None)
            or getattr(dammit, "triedEncodings", None)
            or []
        )
        names = [
            str(entry[0]) if isinstance(entry, tuple) else str(entry)
            for entry in tried
        ]
        # UnicodeDecodeError requires (encoding, object, start, end, reason)
        # and the object must be bytes-like.
        if isinstance(html_string, (bytes, bytearray)):
            raw = bytes(html_string)
        else:
            raw = b""
        raise UnicodeDecodeError(
            "unknown",
            raw,
            0,
            len(raw),
            "Failed to detect encoding, tried [%s]" % ', '.join(names),
        )
    return doc
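# 评论列表 ("comment list") -- navigation text from the source web page, not code
# 文章目录 ("table of contents") -- navigation text from the source web page, not code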