def unescape(text):
    """Replace HTML/XML character references and entities in a string.

    Numeric references (``&#233;``, ``&#xE9;``) and named entities
    (``&eacute;``) are converted to the character they stand for. This
    includes ``&amp;``, ``&gt;`` and ``&lt;``, which are NOT preserved.
    A reference that cannot be decoded (malformed number, code point out of
    range, unknown entity name) is logged through the module-level
    ``logger`` and left in the text unchanged.

    The substitution is a single pass: replacement text is not rescanned,
    so ``&amp;lt;`` becomes ``&lt;`` rather than ``<``.

    On Python 2 each replacement is a UTF-8 encoded byte string; on
    Python 3 it is a plain ``str`` character.

    Based on Fredrik Lundh's recipe:
    http://effbot.org/zone/re-sub.htm#unescape-html

    Args:
        text: The string to process.

    Returns:
        ``text`` with every decodable reference replaced.
    """
    # Resolved once per call instead of once per match. The entity table
    # lives in a module that was renamed between Python 2 and Python 3.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint

    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    # True only on Python 2, where ``str`` is the byte-string type and the
    # replacement therefore has to be encoded before it is spliced in.
    encode_result = str is bytes

    def decode(codepoint):
        char = to_char(codepoint)
        return char.encode("utf-8") if encode_result else char

    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[:3] == "&#x":
                    return decode(int(ref[3:-1], 16))
                return decode(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number, or code point out of range.
                # OverflowError: number too large for unichr()/chr().
                logger.info("error de valor")
        else:
            # named entity
            try:
                return decode(name2codepoint[ref[1:-1]])
            except KeyError:
                logger.info("keyerror")
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
# Converts HTML entity codes such as "&ntilde;" into the matching character ("ñ"), UTF-8 encoded
# Comment list
# Article table of contents