scrapertools.py 文件源码

python
阅读 25 收藏 0 点赞 0 评论 0

项目:tvalacarta 作者: tvalacarta 项目源码 文件源码
def unescape(text):
    """Replace HTML/XML character references and named entities in *text*.

    Numeric references (``&#241;``, ``&#xF1;``, ``&#XF1;``) and named
    entities (``&ntilde;``, and also ``&amp;``, ``&lt;``, ``&gt;``) are
    replaced by the character they stand for.  Anything that cannot be
    resolved (unknown entity name, malformed or out-of-range number) is
    logged through the module-level ``logger`` and left untouched.

    When *text* is a byte string (the Python 2 ``str`` case) each
    replacement is UTF-8 encoded, as before.  When *text* is a text string
    the replacement is inserted as a character, so the result keeps the
    type of the input.

    Based on Fredrik Lundh's recipe:
    http://effbot.org/zone/re-sub.htm#unescape-html
    """
    # Resolve the interpreter-specific names once per call rather than once
    # per matched entity.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3
    # Type of a decoded character: ``unicode`` on Python 2, ``str`` on 3.
    text_type = type(to_char(65))

    def fixup(m):
        entity = m.group(0)
        try:
            if entity[:2] == "&#":
                # Numeric character reference, decimal or hexadecimal.
                if entity[2:3] in ("x", "X"):
                    codepoint = int(entity[3:-1], 16)
                else:
                    codepoint = int(entity[2:-1])
            else:
                # Named entity.
                codepoint = name2codepoint[entity[1:-1]]
            char = to_char(codepoint)
        except (ValueError, OverflowError):
            # Not a number, or a number no character exists for.  Very large
            # values raise OverflowError instead of ValueError.
            logger.info("error de valor")
            return entity  # leave as is
        except KeyError:
            logger.info("keyerror")
            return entity  # leave as is

        if isinstance(entity, text_type):
            # Text input: an encoded replacement could not be joined back
            # into the surrounding text.
            return char
        return char.encode("utf-8")

    return re.sub(r"&#?\w+;", fixup, text)

    # Converts HTML entity codes such as "&ntilde;" and replaces them with the corresponding character ("ñ"), UTF-8 encoded
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号