def unescape(text):
"""
Removes HTML or XML character references and entities from a text string.
:param text The HTML (or XML) source text.
:return The plain text, as a Unicode string, if necessary.
"""
def fixup(m):
text = m.group(0)
code = m.group(1)
try:
if text[1] == "#": # character reference
if text[2] == "x":
return chr(int(code[1:], 16))
else:
return chr(int(code))
else: # named entity
return chr(name2codepoint[code])
except:
return text # leave as is
return re.sub("&#?(\w+);", fixup, text)
# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
WikiExtractor.py 文件源码
python
阅读 23
收藏 0
点赞 0
评论 0
评论列表
文章目录