Example source code using Python's HTMLParser() class

input.py (project: ome-model, author: ome)
def HTML(text, encoding=None):
    """Parse the given HTML source and return a markup stream.

    Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be
    iterated over multiple times:

    >>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8')
    >>> print(html)
    <body><h1>Foo</h1></body>
    >>> print(html.select('h1'))
    <h1>Foo</h1>
    >>> print(html.select('h1/text()'))
    Foo

    :param text: the HTML source
    :return: the parsed XML event stream
    :raises ParseError: if the HTML text is not well-formed, and error recovery
                        fails
    """
    if isinstance(text, unicode):
        # If it's unicode text the encoding should be set to None.
        # The option to pass in an incorrect encoding is for ease
        # of writing doctests that work in both Python 2.x and 3.x.
        return Stream(list(HTMLParser(StringIO(text), encoding=None)))
    return Stream(list(HTMLParser(BytesIO(text), encoding=encoding)))
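
The docstring's reusability claim follows from the `list(...)` call: the parser's events are materialized once, so the resulting `Stream` can be serialized repeatedly. A hedged sketch (this helper matches the one in Genshi's `genshi.input` module; the import below assumes Genshi is installed):

from genshi.input import HTML

stream = HTML(u'<body><h1>Foo</h1></body>', encoding='utf-8')
first = str(stream)   # serializes the buffered event list
second = str(stream)  # serializing again yields the same markup
assert first == second == '<body><h1>Foo</h1></body>'
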
_htmlparser.py (project: svg-animation-tools, author: parallax)
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
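
Spelled out standalone, the workaround amounts to the following (Python 2, matching the snippet; on Python 3 use `chr` in place of `unichr`):

def decode_charref(name):
    # An 'x'/'X' prefix marks a hexadecimal reference like &#x41;;
    # plain digits are decimal. Equivalent to the lstrip() form above
    # for well-formed references.
    if name.startswith(('x', 'X')):
        codepoint = int(name[1:], 16)
    else:
        codepoint = int(name)
    try:
        return unichr(codepoint)
    except (ValueError, OverflowError):
        # Out-of-range code points degrade to U+FFFD instead of crashing.
        return u"\N{REPLACEMENT CHARACTER}"

assert decode_charref('x41') == u'A'  # &#x41;
assert decode_charref('65') == u'A'   # &#65;
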
_htmlparser.py (project: svg-animation-tools, author: parallax)
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
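
For context, gating such a monkey-patch on the interpreter version might look like the hedged sketch below (illustrative only; the actual bs4 patch code is not shown in this listing):

import sys

# Apply the backported 3.2.3 methods only where the parser bug exists.
needs_patch = (3, 2, 0) <= sys.version_info[:3] < (3, 2, 3)
if needs_patch:
    pass  # monkey-patch HTMLParser with the fixed 3.2.3 methods here
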
handle_html.py (project: DIS_MeituanReptile, author: myvary)
def downloader_html(self, url):
        '''
        :param url: the URL of the page to download
        :return: the unescaped page source, or the string '0' if the download fails
        '''
        try:
            print url
            BAIDU_UA = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'  # Baiduspider User-Agent
            headers = {'User-Agent': BAIDU_UA}  # spoof the User-Agent so the site serves the page as it would to the Baidu crawler
            data = requests.get(url, headers=headers)  # fetch the page
            html_parser = HTMLParser.HTMLParser()
            data = html_parser.unescape(data.text)  # decode HTML entities in the response body
            return data
        except Exception:
            print 'failed to download page:', url
            return '0'
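
On modern Python 3 the unescape helper is gone from HTMLParser; `html.unescape` replaces it. A hedged sketch of the same downloader under Python 3 (the function name and the 10-second timeout are illustrative additions):

import html
import requests

BAIDU_UA = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'

def downloader_html_py3(url):
    """Fetch url as Baiduspider and return the unescaped body, or '0' on failure."""
    try:
        resp = requests.get(url, headers={'User-Agent': BAIDU_UA}, timeout=10)
        return html.unescape(resp.text)
    except requests.RequestException:
        return '0'
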
_http.py (project: pelisalacarta-ce, author: pelisalacarta-ce)
def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, self._allow_xhtml):
            try:
                try:
                    html_headers = parse_head(response,
                                              self.head_parser_class())
                finally:
                    response.seek(0)
            except (HTMLParser.HTMLParseError,
                    sgmllib.SGMLParseError):
                pass
            else:
                for hdr, val in html_headers:
                    # add a header
                    http_message.dict[hdr.lower()] = val
                    text = hdr + ": " + val
                    for line in text.split("\n"):
                        http_message.headers.append(line + "\n")
        return response
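
The `parse_head` call above runs an HTMLParser-based head parser over the response to pull `<meta http-equiv>` pairs into the HTTP message. A minimal, hedged sketch of that idea with the stdlib parser (mechanize's real `head_parser_class` handles more cases):

try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3

class MetaHeaderParser(HTMLParser):
    """Collect (http-equiv, content) pairs from <meta> tags in a document."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.http_equiv = []

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attrs = dict(attrs)
            if 'http-equiv' in attrs and 'content' in attrs:
                self.http_equiv.append((attrs['http-equiv'], attrs['content']))

parser = MetaHeaderParser()
parser.feed('<head><meta http-equiv="refresh" content="5"></head>')
assert parser.http_equiv == [('refresh', '5')]
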
parsers.py (project: crestify, author: crestify)
def __init__(self, file_name, user_id):
        with open(file_name, 'r') as self.opened_file:
            self.html = self.opened_file.read()
        self.soup = BeautifulSoup4(self.html)
        self.user = user_id
        self.urls = dict()
        self.check_duplicates = dict()
        self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                            Bookmark.deleted == False).all()
        for bmark in self.check_duplicates_query:
            self.check_duplicates[bmark.main_url] = bmark
        self.tags_dict = dict()
        self.tags_set = set()
        self.html_parser = HTMLParser.HTMLParser()
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
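
The `valid_url` pattern accepts scheme://host targets where the host may be a domain name, a dotted IPv4 address, or a bracketed IPv6 literal, followed by an optional port and path. A quick hedged self-check (the pattern below repeats the one above verbatim; the sample URLs are illustrative):

import re

valid_url = re.compile(
    r'^(?:[a-z0-9\.\-]*)://'
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
    r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
    r'(?::\d+)?'
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

assert valid_url.match('https://example.com/bookmarks')
assert valid_url.match('http://192.168.0.1:8080/')
assert valid_url.match('not-a-url') is None
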
parsers.py (project: crestify, author: crestify)
def __init__(self, file_name, user_id):
        with open(file_name, 'r') as self.opened_file:
            self.data = self.opened_file.read()
        self.user = user_id
        self.data = ujson.loads(self.data)
        self.urls = dict()
        self.tags_dict = dict()
        self.tags_set = set()
        self.check_duplicates = dict()
        self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                            Bookmark.deleted == False).all()
        for x in self.check_duplicates_query:
            self.check_duplicates[x.main_url] = x
        self.html_parser = HTMLParser.HTMLParser()
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
_http.py (project: plugin.video.streamondemand-pureita, author: orione7)
def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, self._allow_xhtml):
            try:
                try:
                    html_headers = parse_head(response,
                                              self.head_parser_class())
                finally:
                    response.seek(0)
            except (HTMLParser.HTMLParseError,
                    sgmllib.SGMLParseError):
                pass
            else:
                for hdr, val in html_headers:
                    # add a header
                    http_message.dict[hdr.lower()] = val
                    text = hdr + ": " + val
                    for line in text.split("\n"):
                        http_message.headers.append(line + "\n")
        return response
_htmlparser.py (project: Alexa-Chatter, author: ekt1701)
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
_htmlparser.py (project: Alexa-Chatter, author: ekt1701)
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
rtbf.py (project: plugin.video.rtbfauvio, author: Gaet81)
def play_video(self, datas):
        url = datas.get('url')
        data = channel.get_url(url)
        regex = r"""src="(https://www.rtbf.be/auvio/embed/media[^"]+)"""
        iframe_url = re.findall(regex, data)[0]
        data = channel.get_url(iframe_url)
        regex = r"""data-media="([^"]+)"""
        media = re.findall(regex, data)[0]        
        h = HTMLParser.HTMLParser()
        media_json = h.unescape(media)
        regex = r""""high":"([^"]+)"""
        all_url = re.findall(regex, media_json)
        if len(all_url) > 0:
            video_url = all_url[0]
        else:
            regex = r"""url&quot;:&quot;([^&]+)"""
            iframe_url = re.findall(regex, data)[0]
            video_url = iframe_url.replace("\\", "")    
        channel.playUrl(video_url)
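
The unescape call is what turns the HTML-escaped data-media attribute back into parseable JSON. A minimal illustration (Python 2, matching the snippet; the URL is a placeholder):

import HTMLParser

h = HTMLParser.HTMLParser()
escaped = '{&quot;high&quot;:&quot;http://example.com/video.mp4&quot;}'
assert h.unescape(escaped) == '{"high":"http://example.com/video.mp4"}'
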
_htmlparser.py (project: amazon_order_history_scraper, author: drewctate)
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
_htmlparser.py (project: amazon_order_history_scraper, author: drewctate)
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
belvedere_artists.py (project: toollabs, author: multichill)
def getBelvedereArtistsGenerator():
    """
    Generator to yield Belvedere artist records (id, name, description, url)

    """
    htmlparser = HTMLParser.HTMLParser()

    basesearchurl=u'http://digital.belvedere.at/people/%s'
    urlregex = u'\<h3\>\<a href\=\"\/people\/(?P<id>\d+)\/[^\"]+\"\>(?P<name>[^\<]+)\<\/a\>\<\/h3\>\<div\>(?P<description>[^\<]+)\<\/div\>'

    # Just loop over the pages
    for i in string.ascii_lowercase:
        searchurl = basesearchurl % (i,)
        print searchurl
        searchPage = requests.get(searchurl)

        matches = re.finditer(urlregex, searchPage.text)
        for match in matches:
            artist = {}
            artist[u'id'] = match.group(u'id')
            artist[u'name'] = htmlparser.unescape(match.group(u'name'))
            artist[u'description'] = htmlparser.unescape(match.group(u'description'))
            artist[u'url'] = u'http://digital.belvedere.at/people/%s/' % (match.group(u'id'),)
            yield artist
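
A hedged usage sketch (Python 2, matching the snippet): drain the generator and print one line per artist record.

for artist in getBelvedereArtistsGenerator():
    print u'%(id)s: %(name)s (%(description)s)' % artist
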
_htmlparser.py (project: kekescan, author: xiaoxiaoleo)
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
_htmlparser.py (project: kekescan, author: xiaoxiaoleo)
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
_htmlparser.py (project: mibici-tools, author: regenhans)
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

