def handle_entityref(self, ref):
    """Handle a named entity reference found in element content.

    Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
    The resolved text is appended to the text pieces of the innermost open
    element (``self.elementstack[-1][2]``). Nothing happens when no element
    is open.

    :param ref: Entity name without the leading '&' and the trailing ';'.
    """
    if not self.elementstack:
        return
    if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
        # These five stay in their escaped '&name;' form.
        text = '&%s;' % ref
    elif ref in self.entities:
        # Entity declared on this parser: use its replacement text. A value
        # that is itself a numeric reference ('&#169;') is kept verbatim;
        # re-dispatching it as an entity *name* used to emit '&&#169;;'.
        text = self.entities[ref]
    elif ref in name2codepoint:
        # Keep this a str: encoding to UTF-8 here would put bytes next to
        # the str pieces appended by the other branches.
        text = chr(name2codepoint[ref])
    else:
        # Unknown entity: reconstruct the original reference.
        text = '&%s;' % ref
    self.elementstack[-1][2].append(text)
# Example source code using Python's name2codepoint
def unescape(text):
    """
    Replace HTML/XML character references and named entities in *text*.

    References that cannot be decoded (unknown entity name, malformed or
    out-of-range number) are left in the text unchanged.

    :param text The HTML (or XML) source text.
    :return The text with every decodable reference replaced.
    """
    def fix_up(m):
        text_ = m.group(0)
        code = m.group(1)
        try:
            if text_[1] == "#":  # character reference
                if code[0] in "xX":  # hexadecimal, either prefix case
                    return chr(int(code[1:], 16))
                return chr(int(code))
            return chr(name2codepoint[code])  # named entity
        except (KeyError, ValueError, OverflowError):
            # chr() raises OverflowError (not ValueError) for numbers too
            # large for a C int, so it has to be listed explicitly.
            return text_  # leave as is

    return re.sub(r"&#?(\w+);", fix_up, text)
# Match HTML comments
def handle_entityref(self, ref):
    """Handle a named entity reference found in element content.

    Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
    The resolved text is appended to the text pieces of the innermost open
    element (``self.elementstack[-1][2]``). Nothing happens when no element
    is open.

    :param ref: Entity name without the leading '&' and the trailing ';'.
    """
    if not self.elementstack:
        return
    if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
        # These five stay in their escaped '&name;' form.
        text = '&%s;' % ref
    elif ref in self.entities:
        # Entity declared on this parser: use its replacement text. A value
        # that is itself a numeric reference ('&#169;') is kept verbatim;
        # re-dispatching it as an entity *name* used to emit '&&#169;;'.
        text = self.entities[ref]
    elif ref in name2codepoint:
        # Keep this a str: encoding to UTF-8 here would put bytes next to
        # the str pieces appended by the other branches.
        text = chr(name2codepoint[ref])
    else:
        # Unknown entity: reconstruct the original reference.
        text = '&%s;' % ref
    self.elementstack[-1][2].append(text)
def handle_entityref(self, ref):
    """Handle a named entity reference found in element content.

    Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
    The resolved text is appended to the text pieces of the innermost open
    element (``self.elementstack[-1][2]``). Nothing happens when no element
    is open.

    :param ref: Entity name without the leading '&' and the trailing ';'.
    """
    if not self.elementstack:
        return
    if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
        # These five stay in their escaped '&name;' form.
        text = '&%s;' % ref
    elif ref in self.entities:
        # Entity declared on this parser: use its replacement text. A value
        # that is itself a numeric reference ('&#169;') is kept verbatim;
        # re-dispatching it as an entity *name* used to emit '&&#169;;'.
        text = self.entities[ref]
    elif ref in name2codepoint:
        # Keep this a str: encoding to UTF-8 here would put bytes next to
        # the str pieces appended by the other branches.
        text = chr(name2codepoint[ref])
    else:
        # Unknown entity: reconstruct the original reference.
        text = '&%s;' % ref
    self.elementstack[-1][2].append(text)
def handle_entityref(self, ref):
    """Handle a named entity reference found in element content.

    Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
    The resolved text is appended to the text pieces of the innermost open
    element (``self.elementstack[-1][2]``). Nothing happens when no element
    is open.

    :param ref: Entity name without the leading '&' and the trailing ';'.
    """
    if not self.elementstack:
        return
    if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
        # These five stay in their escaped '&name;' form.
        text = '&%s;' % ref
    elif ref in self.entities:
        # Entity declared on this parser: use its replacement text. A value
        # that is itself a numeric reference ('&#169;') is kept verbatim;
        # re-dispatching it as an entity *name* used to emit '&&#169;;'.
        text = self.entities[ref]
    elif ref in name2codepoint:
        # Keep this a str: encoding to UTF-8 here would put bytes next to
        # the str pieces appended by the other branches.
        text = chr(name2codepoint[ref])
    else:
        # Unknown entity: reconstruct the original reference.
        text = '&%s;' % ref
    self.elementstack[-1][2].append(text)
def handle_entityref(self, ref):
    """Handle a named entity reference found in element content.

    Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
    The resolved text is appended to the text pieces of the innermost open
    element (``self.elementstack[-1][2]``). Nothing happens when no element
    is open.

    :param ref: Entity name without the leading '&' and the trailing ';'.
    """
    if not self.elementstack:
        return
    if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
        # These five stay in their escaped '&name;' form.
        text = '&%s;' % ref
    elif ref in self.entities:
        # Entity declared on this parser: use its replacement text. A value
        # that is itself a numeric reference ('&#169;') is kept verbatim;
        # re-dispatching it as an entity *name* used to emit '&&#169;;'.
        text = self.entities[ref]
    elif ref in name2codepoint:
        # Keep this a str: encoding to UTF-8 here would put bytes next to
        # the str pieces appended by the other branches.
        text = chr(name2codepoint[ref])
    else:
        # Unknown entity: reconstruct the original reference.
        text = '&%s;' % ref
    self.elementstack[-1][2].append(text)
def unescape(text):
    """
    Replace HTML/XML character references and named entities in *text*.

    Named entities are looked up in ``htmlentitydefs.name2codepoint``.
    References that cannot be decoded (unknown name, malformed or
    out-of-range number) are left in the text unchanged.

    :param text The HTML (or XML) source text.
    :return The text with every decodable reference replaced.
    """
    try:
        to_char = unichr
    except NameError:  # Python 3 has no unichr(); chr() takes its place
        to_char = chr

    def fixup(m):
        ref = m.group(0)
        try:
            if ref[:2] == "&#":
                # character reference
                if ref[:3] == "&#x":
                    return to_char(int(ref[3:-1], 16))
                return to_char(int(ref[2:-1]))
            # named entity
            return to_char(htmlentitydefs.name2codepoint[ref[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # OverflowError: the number does not fit in a C int.
            return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
def htmlentity_transform(entity):
    """Transform an HTML entity (given without '&' and ';') to a character.

    Named entities are tried first, then numeric references of the form
    '#NNN' or '#xHHH'. Anything else is returned as the literal '&entity;'.
    """
    # Known non-numeric HTML entity; any failure here falls through to the
    # numeric handling below.
    try:
        known_names = compat_html_entities.name2codepoint
        if entity in known_names:
            return compat_chr(known_names[entity])
    except Exception:
        pass

    numeric = re.match(r'#(x?[0-9A-Fa-f]+)', entity)
    if numeric is not None:
        digits = numeric.group(1)
        radix = 10
        if digits.startswith(u'x'):
            radix = 16
            # int() accepts the '0x' prefix when the base is 16.
            digits = u'0%s' % digits
        try:
            return compat_chr(int(digits, radix))
        except Exception:
            printExc()

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    References that cannot be decoded (unknown entity name, malformed or
    out-of-range number) are left in the text unchanged.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        whole = m.group(0)
        code = m.group(1)
        try:
            if whole[1] == "#":  # character reference
                if code[0] in "xX":  # hexadecimal, either prefix case
                    return chr(int(code[1:], 16))
                return chr(int(code))
            return chr(name2codepoint[code])  # named entity
        except (KeyError, ValueError, OverflowError):
            # Only decoding failures are swallowed; a bare ``except`` would
            # also hide KeyboardInterrupt and programming errors.
            return whole  # leave as is

    return re.sub(r"&#?(\w+);", fixup, text)
# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
def name2cp(k):
    """Return the Unicode code point of the HTML entity named *k*.

    'apos' is answered directly without consulting ``htmlentitydefs``.
    """
    if k == 'apos':
        return ord("'")
    if not hasattr(htmlentitydefs, "name2codepoint"):
        # name2codepoint requires Python 2.3; without it, fall back to
        # entitydefs, whose value is either a '&#NNN;' reference or a
        # Latin-1 encoded character.
        definition = htmlentitydefs.entitydefs[k]
        is_numeric_ref = definition.startswith("&#") and definition.endswith(";")
        if is_numeric_ref:
            return int(definition[2:-1])  # not in latin-1
        return ord(codecs.latin_1_decode(definition)[0])
    return htmlentitydefs.name2codepoint[k]
def handle_entityref(self, name):
    """Push the entity *name* as a decimal character reference.

    Does nothing when *name* is not in ``name2codepoint`` or when
    ``self.hide_output`` is set.
    """
    if name not in name2codepoint or self.hide_output:
        return
    codepoint = name2codepoint[name]
    self.push_text("".join(("&#", str(codepoint), ";")))
def entity2text(entitydef):
    """Convert an HTML entity reference into unicode.

    Handles '&#xHH;', '&#NN;' and '&name;' forms. Input that does not start
    with '&' is logged at debug level and returned unchanged.

    http://stackoverflow.com/a/58125/408556
    """
    if not entitydef.startswith('&'):
        logger.debug(entitydef)
        return entitydef

    if entitydef.startswith('&#x'):
        codepoint = int(entitydef[3:-1], 16)
    elif entitydef.startswith('&#'):
        codepoint = int(entitydef[2:-1])
    else:
        codepoint = name2codepoint[entitydef[1:-1]]

    # A zero code point is treated like "nothing to convert".
    if codepoint:
        return chr(codepoint)
    return entitydef
def handle_entityref(self, ref):
    """Reconstruct the original entity reference and append it to ``self.pieces``.

    Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
    Names found in ``name2codepoint`` (and 'apos') get the closing ';';
    any other name is emitted without it.
    """
    is_known = ref in name2codepoint or ref == 'apos'
    template = '&%s;' if is_known else '&%s'
    self.pieces.append(template % ref)
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    References that cannot be decoded (unknown entity name, malformed or
    out-of-range number) are left in the text unchanged.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        whole = m.group(0)
        code = m.group(1)
        try:
            if whole[1] == "#":  # character reference
                if code[0] in "xX":  # hexadecimal, either prefix case
                    return chr(int(code[1:], 16))
                return chr(int(code))
            return chr(name2codepoint[code])  # named entity
        except (KeyError, ValueError, OverflowError):
            # Only decoding failures are swallowed; a bare ``except`` would
            # also hide KeyboardInterrupt and programming errors.
            return whole  # leave as is

    return re.sub(r"&#?(\w+);", fixup, text)
# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
def un_escape(self, text):
    # Removes HTML or XML character references and entities from a text string.
    # source: http://effbot.org/zone/re-sub.htm#unescape-html
    #
    # @param text The HTML (or XML) source text.
    # @return The plain text, as a Unicode string; input that is not a string
    #         is returned unchanged.
    try:
        to_char, to_text, text_types = unichr, unicode, (str, unicode)
    except NameError:  # Python 3: str is the only text type
        to_char, to_text, text_types = chr, str, (str,)

    def fixup(m):
        ref = m.group(0)
        try:
            if ref[:2] == "&#":
                # character reference
                if ref[:3] == "&#x":
                    return to_char(int(ref[3:-1], 16))
                return to_char(int(ref[2:-1]))
            # named entity
            return to_char(name2codepoint[ref[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # OverflowError: the number does not fit in a C int.
            return ref  # leave as is

    if not isinstance(text, text_types):
        return text
    return to_text(re.sub(r"&#?\w+;", fixup, text))
def handle_entityref(self, name):
    """Append the character for the named entity *name* to ``self.text``.

    Entity names missing from ``name2codepoint`` are ignored.
    """
    try:
        to_char = unichr
    except NameError:  # Python 3 has no unichr(); chr() takes its place
        to_char = chr
    try:
        codepoint = name2codepoint[name]
    except KeyError:
        # Unknown entity: ignored. Only this lookup is guarded, so other
        # errors are no longer silently discarded.
        return
    self.text += to_char(codepoint)
def htmlentitydecode(s):
    """Replace named HTML entities (e.g. '&amp;') in *s* with their characters.

    Only names present in ``name2codepoint`` are replaced. Numeric
    references and unknown names are left untouched.
    """
    try:
        to_char = unichr
    except NameError:  # Python 3 has no unichr(); chr() takes its place
        to_char = chr

    def replace(m):
        try:
            return to_char(name2codepoint[m.group(1)])
        except KeyError:
            return m.group(0)  # not a known entity: keep the original text

    # Match any '&word;' and look the word up, instead of rebuilding an
    # alternation of every known entity name on each call.
    return re.sub(r'&(\w+);', replace, s)
def name2cp(k):
    """Return the Unicode code point of the HTML entity named *k*.

    'apos' is answered directly without consulting ``htmlentitydefs``.
    """
    if k == 'apos':
        return ord("'")
    if not hasattr(htmlentitydefs, "name2codepoint"):
        # name2codepoint requires Python 2.3; without it, fall back to
        # entitydefs, whose value is either a '&#NNN;' reference or a
        # Latin-1 encoded character.
        definition = htmlentitydefs.entitydefs[k]
        is_numeric_ref = definition.startswith("&#") and definition.endswith(";")
        if is_numeric_ref:
            return int(definition[2:-1])  # not in latin-1
        return ord(codecs.latin_1_decode(definition)[0])
    return htmlentitydefs.name2codepoint[k]
def name2cp(k):
    """Return the Unicode code point of the HTML entity named *k*.

    'apos' is answered directly without consulting ``htmlentitydefs``.
    """
    if k == 'apos':
        return ord("'")
    if not hasattr(htmlentitydefs, "name2codepoint"):
        # name2codepoint requires Python 2.3; without it, fall back to
        # entitydefs, whose value is either a '&#NNN;' reference or a
        # Latin-1 encoded character.
        definition = htmlentitydefs.entitydefs[k]
        is_numeric_ref = definition.startswith("&#") and definition.endswith(";")
        if is_numeric_ref:
            return int(definition[2:-1])  # not in latin-1
        return ord(codecs.latin_1_decode(definition)[0])
    return htmlentitydefs.name2codepoint[k]
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    References that cannot be decoded (unknown entity name, malformed or
    out-of-range number) are left in the text unchanged.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        whole = m.group(0)
        code = m.group(1)
        try:
            if whole[1] == "#":  # character reference
                if code[0] in "xX":  # hexadecimal, either prefix case
                    return chr(int(code[1:], 16))
                return chr(int(code))
            return chr(name2codepoint[code])  # named entity
        except (KeyError, ValueError, OverflowError):
            # Only decoding failures are swallowed; a bare ``except`` would
            # also hide KeyboardInterrupt and programming errors.
            return whole  # leave as is

    return re.sub(r"&#?(\w+);", fixup, text)
# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
def name2cp(k):
    """Return the Unicode code point of the HTML entity named *k*.

    'apos' is answered directly without consulting ``htmlentitydefs``.
    """
    if k == 'apos':
        return ord("'")
    if not hasattr(htmlentitydefs, "name2codepoint"):
        # name2codepoint requires Python 2.3; without it, fall back to
        # entitydefs, whose value is either a '&#NNN;' reference or a
        # Latin-1 encoded character.
        definition = htmlentitydefs.entitydefs[k]
        is_numeric_ref = definition.startswith("&#") and definition.endswith(";")
        if is_numeric_ref:
            return int(definition[2:-1])  # not in latin-1
        return ord(codecs.latin_1_decode(definition)[0])
    return htmlentitydefs.name2codepoint[k]
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    References that cannot be decoded (unknown entity name, malformed or
    out-of-range number) are left in the text unchanged.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        whole = m.group(0)
        code = m.group(1)
        try:
            if whole[1] == "#":  # character reference
                if code[0] in "xX":  # hexadecimal, either prefix case
                    return chr(int(code[1:], 16))
                return chr(int(code))
            return chr(name2codepoint[code])  # named entity
        except (KeyError, ValueError, OverflowError):
            # Only decoding failures are swallowed; a bare ``except`` would
            # also hide KeyboardInterrupt and programming errors.
            return whole  # leave as is

    return re.sub(r"&#?(\w+);", fixup, text)
# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
def handle_entityref(self, ref):
    """Reconstruct the original entity reference and append it to ``self.pieces``.

    Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
    Names found in ``name2codepoint`` (and 'apos') get the closing ';';
    any other name is emitted without it.
    """
    is_known = ref in name2codepoint or ref == 'apos'
    template = '&%s;' if is_known else '&%s'
    self.pieces.append(template % ref)
def handle_entityref(self, ref):
    """Reconstruct the original entity reference and append it to ``self.pieces``.

    Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
    Names found in ``name2codepoint`` (and 'apos') get the closing ';';
    any other name is emitted without it.
    """
    is_known = ref in name2codepoint or ref == 'apos'
    template = '&%s;' if is_known else '&%s'
    self.pieces.append(template % ref)
def html_entity_decode(s):
    """Decode named entities and decimal character references in *s*.

    Both kinds are handled in one pass, so text produced by decoding one
    reference is never decoded again ('&amp;#65;' becomes '&#65;', not 'A').
    References that cannot be decoded (unknown name, out-of-range number)
    are left unchanged.

    :param s: Text possibly containing '&name;' or '&#NNN;' references.
    :return: The decoded text.
    """
    def replace(m):
        name, number = m.group(1), m.group(2)
        try:
            if number is not None:
                return chr(int(number))
            return chr(name2codepoint[name])
        except (KeyError, ValueError, OverflowError):
            return m.group(0)  # leave as is

    # Decimal references of any length are accepted, not only 2-3 digits.
    return re.sub(r'&(?:(\w+)|#(\d+));', replace, s)
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    References that cannot be decoded (unknown entity name, malformed or
    out-of-range number) are left in the text unchanged.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        whole = m.group(0)
        code = m.group(1)
        try:
            if whole[1] == "#":  # character reference
                if code[0] in "xX":  # hexadecimal, either prefix case
                    return chr(int(code[1:], 16))
                return chr(int(code))
            return chr(name2codepoint[code])  # named entity
        except (KeyError, ValueError, OverflowError):
            # Only decoding failures are swallowed; a bare ``except`` would
            # also hide KeyboardInterrupt and programming errors.
            return whole  # leave as is

    return re.sub(r"&#?(\w+);", fixup, text)
# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
def unescape_html(text):
    """
    Removes HTML or XML character references and entities from a text string.

    References that cannot be decoded (unknown entity name, malformed or
    out-of-range number) are left in the text unchanged.

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string, if necessary.

    Source: http://effbot.org/zone/re-sub.htm#unescape-html
    """
    def fixup(m):
        ref = m.group(0)
        try:
            if ref[:2] == "&#":
                # character reference
                if ref[:3] == "&#x":
                    return chr(int(ref[3:-1], 16))
                return chr(int(ref[2:-1]))
            # named entity
            return chr(htmlentities.name2codepoint[ref[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # chr() raises OverflowError (not ValueError) for numbers too
            # large for a C int, so it has to be listed explicitly.
            return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
def name2cp(k):
    """Return the Unicode code point of the HTML entity named *k*.

    'apos' is answered directly without consulting ``htmlentitydefs``.
    """
    if k == 'apos':
        return ord("'")
    if not hasattr(htmlentitydefs, "name2codepoint"):
        # name2codepoint requires Python 2.3; without it, fall back to
        # entitydefs, whose value is either a '&#NNN;' reference or a
        # Latin-1 encoded character.
        definition = htmlentitydefs.entitydefs[k]
        is_numeric_ref = definition.startswith("&#") and definition.endswith(";")
        if is_numeric_ref:
            return int(definition[2:-1])  # not in latin-1
        return ord(codecs.latin_1_decode(definition)[0])
    return htmlentitydefs.name2codepoint[k]
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    References that cannot be decoded (unknown entity name, malformed or
    out-of-range number) are left in the text unchanged.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        whole = m.group(0)
        code = m.group(1)
        try:
            if whole[1] == "#":  # character reference
                if code[0] in "xX":  # hexadecimal, either prefix case
                    return chr(int(code[1:], 16))
                return chr(int(code))
            return chr(name2codepoint[code])  # named entity
        except (KeyError, ValueError, OverflowError):
            # Only decoding failures are swallowed; a bare ``except`` would
            # also hide KeyboardInterrupt and programming errors.
            return whole  # leave as is

    return re.sub(r"&#?(\w+);", fixup, text)
# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
def name2cp(k):
    """Return the Unicode code point of the HTML entity named *k*.

    'apos' is answered directly without consulting ``htmlentitydefs``.
    """
    if k == 'apos':
        return ord("'")
    if not hasattr(htmlentitydefs, "name2codepoint"):
        # name2codepoint requires Python 2.3; without it, fall back to
        # entitydefs, whose value is either a '&#NNN;' reference or a
        # Latin-1 encoded character.
        definition = htmlentitydefs.entitydefs[k]
        is_numeric_ref = definition.startswith("&#") and definition.endswith(";")
        if is_numeric_ref:
            return int(definition[2:-1])  # not in latin-1
        return ord(codecs.latin_1_decode(definition)[0])
    return htmlentitydefs.name2codepoint[k]