def translate_non_sgml_chars(data, enc='utf-8'):
    # type: (bytes, str) -> bytes
    """Replace control characters in encoded text and return it re-encoded.

    ``data`` is decoded with ``enc`` (undecodable input is replaced rather
    than raising). Every character in the ranges ``\\x00-\\x08``, ``\\x0b``,
    ``\\x0c``, ``\\x0e-\\x1f`` and ``\\x7f-\\x9f`` is then substituted:

    * code points 127..159 are turned into a single byte and decoded as
      windows-1252; if that decode succeeds the decoded character is used;
    * everything else, including bytes windows-1252 cannot decode,
      becomes U+FFFD (REPLACEMENT CHARACTER).

    The result is encoded back with ``enc``; characters the target
    encoding cannot represent are replaced rather than raising.
    """
    replacement_char = u'\ufffd'  # Unicode Character 'REPLACEMENT CHARACTER'

    def _substitute(match):
        # type: (Match) -> str
        value = ord(match.group(0))
        # Only the 127..159 range is reinterpreted as windows-1252.
        if value < 127 or value > 159:
            return replacement_char
        try:
            # NOTE(review): int2byte is defined outside this block --
            # presumably six.int2byte (int -> one-byte string); confirm.
            mapped = int2byte(value).decode('windows-1252')
        except UnicodeDecodeError:
            return replacement_char
        return mapped

    decoded = data.decode(enc, 'replace')
    pattern = unistr(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]')
    cleaned = re.sub(pattern, _substitute, decoded)
    return cleaned.encode(enc, 'replace')
评论列表
文章目录