python类name2codepoint()的实例源码

feedparser.py 文件源码 项目:SublimeRSS 作者: JaredMHall 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '©', ref will be 'copy'
        if not self.elementstack:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities:
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try:
                name2codepoint[ref]
            except KeyError:
                text = '&%s;' % ref
            else:
                text = chr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)
wikiextractor.py 文件源码 项目:false-friends 作者: pln-fing-udelar 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def unescape(text):
    def fix_up(m):
        text_ = m.group(0)
        code = m.group(1)
        try:
            if text_[1] == "#":  # character reference
                if text_[2] == "x":
                    return chr(int(code[1:], 16))
                else:
                    return chr(int(code))
            else:  # named entity
                return chr(name2codepoint[code])
        except (KeyError, ValueError):
            return text_  # leave as is

    return re.sub("&#?(\w+);", fix_up, text)


# Match HTML comments
feedparser.py 文件源码 项目:textnews 作者: qznc 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '©', ref will be 'copy'
        if not self.elementstack:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities:
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try:
                name2codepoint[ref]
            except KeyError:
                text = '&%s;' % ref
            else:
                text = chr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)
feedparser.py 文件源码 项目:MIT-6.0001-Problem-sets-solution 作者: cantell 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '©', ref will be 'copy'
        if not self.elementstack:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities:
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try:
                name2codepoint[ref]
            except KeyError:
                text = '&%s;' % ref
            else:
                text = chr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)
mixin.py 文件源码 项目:machine-learning-python 作者: pspxiaochen 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '©', ref will be 'copy'
        if not self.elementstack:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities:
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try:
                name2codepoint[ref]
            except KeyError:
                text = '&%s;' % ref
            else:
                text = chr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)
mixin.py 文件源码 项目:machine-learning-python 作者: pspxiaochen 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '©', ref will be 'copy'
        if not self.elementstack:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities:
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try:
                name2codepoint[ref]
            except KeyError:
                text = '&%s;' % ref
            else:
                text = chr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)
read.py 文件源码 项目:k-clique-graphs-dense-subgraphs 作者: giannisnik 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def unescape(text):
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text # leave as is
    return re.sub("&#?\w+;", fixup, text)
utils.py 文件源码 项目:crossplatform_iptvplayer 作者: j00zek 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    try:
        if entity in compat_html_entities.name2codepoint:
            return compat_chr(compat_html_entities.name2codepoint[entity])
    except Exception: pass

    mobj = re.match(r'#(x?[0-9A-Fa-f]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            ret = compat_chr(int(numstr, base))
            return ret
        except Exception:
            printExc()
    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
WikiExtractor.py 文件源码 项目:WikiExtractor_To_the_one_text 作者: j-min 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                else:
                    return chr(int(code))
            else:  # named entity
                return chr(name2codepoint[code])
        except:
            return text  # leave as is

    return re.sub("&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
html2text.py 文件源码 项目:nstock 作者: ybenitezf 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def name2cp(k):
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])
html2tele.py 文件源码 项目:progrobot 作者: petr-kalinin 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def handle_entityref(self, name):
        if name in name2codepoint and not self.hide_output:
            code = name2codepoint[name]
            self.push_text("&#" + str(code) + ";")
parsers.py 文件源码 项目:riko 作者: nerevu 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def entity2text(entitydef):
    """Convert an HTML entity reference into unicode.
    http://stackoverflow.com/a/58125/408556
    """
    if entitydef.startswith('&#x'):
        cp = int(entitydef[3:-1], 16)
    elif entitydef.startswith('&#'):
        cp = int(entitydef[2:-1])
    elif entitydef.startswith('&'):
        cp = name2codepoint[entitydef[1:-1]]
    else:
        logger.debug(entitydef)
        cp = None

    return chr(cp) if cp else entitydef
feedparser.py 文件源码 项目:SublimeRSS 作者: JaredMHall 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '©', ref will be 'copy'
        # Reconstruct the original entity reference.
        if ref in name2codepoint or ref == 'apos':
            self.pieces.append('&%s;' % ref)
        else:
            self.pieces.append('&%s' % ref)
WikiExtractor.py 文件源码 项目:wikipedia_multilang 作者: ivanvladimir 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                else:
                    return chr(int(code))
            else:  # named entity
                return chr(name2codepoint[code])
        except:
            return text  # leave as is

    return re.sub("&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
DataTreeGrab.py 文件源码 项目:DataTree 作者: tvgrabbers 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def un_escape(self, text):
        # Removes HTML or XML character references and entities from a text string.
        # source: http://effbot.org/zone/re-sub.htm#unescape-html
        #
        # @param text The HTML (or XML) source text.
        # @return The plain text, as a Unicode string

        def fixup(m):
            text = m.group(0)
            if text[:2] == "&#":
                # character reference
                try:
                    if text[:3] == "&#x":
                        return unichr(int(text[3:-1], 16))

                    else:
                        return unichr(int(text[2:-1]))

                except ValueError:
                    pass

            else:
                # named entity
                try:
                    text = unichr(name2codepoint[text[1:-1]])

                except KeyError:
                    pass

            return text # leave as is

        if not isinstance(text,(str, unicode)):
            return text

        return unicode(re.sub("&#?\w+;", fixup, text))
DataTreeGrab.py 文件源码 项目:DataTree 作者: tvgrabbers 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def handle_entityref(self, name):
        try:
            c = unichr(name2codepoint[name])
            self.text += c

        except:
            pass
util.py 文件源码 项目:twittershade 作者: nicolavic98 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def htmlentitydecode(s):
    return re.sub(
        '&(%s);' % '|'.join(name2codepoint),
        lambda m: unichr(name2codepoint[m.group(1)]), s)
html2text.py 文件源码 项目:ExptWizNote 作者: Ext4FAT 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def name2cp(k):
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])
htmlcleaner.py 文件源码 项目:catchup4kodi 作者: catchup4kodi 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def name2cp(k):
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])
WikiExtractor.py 文件源码 项目:easygen 作者: markriedl 项目源码 文件源码 阅读 102 收藏 0 点赞 0 评论 0
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                else:
                    return chr(int(code))
            else:  # named entity
                return chr(name2codepoint[code])
        except:
            return text  # leave as is

    return re.sub("&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
html2text.py 文件源码 项目:localdocindex 作者: stcioc 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def name2cp(k):
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])
WikiExtractor.py 文件源码 项目:langpred 作者: chrisdamba 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                else:
                    return chr(int(code))
            else:  # named entity
                return chr(name2codepoint[code])
        except:
            return text  # leave as is

    return re.sub("&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
feedparser.py 文件源码 项目:textnews 作者: qznc 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '©', ref will be 'copy'
        # Reconstruct the original entity reference.
        if ref in name2codepoint or ref == 'apos':
            self.pieces.append('&%s;' % ref)
        else:
            self.pieces.append('&%s' % ref)
feedparser.py 文件源码 项目:MIT-6.0001-Problem-sets-solution 作者: cantell 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '©', ref will be 'copy'
        # Reconstruct the original entity reference.
        if ref in name2codepoint or ref == 'apos':
            self.pieces.append('&%s;' % ref)
        else:
            self.pieces.append('&%s' % ref)
string_util.py 文件源码 项目:hq 作者: rbwinslow 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def html_entity_decode(s):
    result = re.sub('&(%s);' % '|'.join(name2codepoint), lambda m: str(unichr(name2codepoint[m.group(1)])), s)
    result = re.sub(r'&#(\d{2,3});', lambda m: chr(int(m.group(1))), result)
    return result
WikiExtractor.py 文件源码 项目:Word2VecAndTsne 作者: jeffThompson 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                else:
                    return chr(int(code))
            else:  # named entity
                return chr(name2codepoint[code])
        except:
            return text  # leave as is

    return re.sub("&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
utils.py 文件源码 项目:aiosolr 作者: TigorC 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def unescape_html(text):
    """
    Removes HTML or XML character references and entities from a text string.

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string, if necessary.

    Source: http://effbot.org/zone/re-sub.htm#unescape-html
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return chr(int(text[3:-1], 16))
                else:
                    return chr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = chr(htmlentities.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return re.sub("&#?\w+;", fixup, text)
html2text.py 文件源码 项目:script.reddit.reader 作者: gedisony 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def name2cp(k):
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])
WikiExtractor.py 文件源码 项目:wiki_zh_vec 作者: zhouhoo 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                else:
                    return chr(int(code))
            else:  # named entity
                return chr(name2codepoint[code])
        except:
            return text  # leave as is

    return re.sub("&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
html2text.py 文件源码 项目:googMeow 作者: aaaddress1 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def name2cp(k):
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])


问题


面经


文章

微信
公众号

扫码关注公众号