def __init__(self):
super(EmuApi, self).__init__()
self.service = 'EmuParadise'
self.base_url = 'https://www.emuparadise.me'
self.referrer = None
self._parser = HTMLParser.HTMLParser()
self.endpoints = ENDPOINTS
self.response = self.get_response()
    self.search_regex = (r'<div class="roms">'
                         r'<a .*?href="(.*?)">(.*?)</a>.*?'
                         r'<a href="/roms/roms\.php\?sysid=(\d+)".*?class="sysname">'
                         r'(.*?)</a>.*?<b>Size:</b> (.*?) .*?</div>')
self.download_url = 'http://direct.emuparadise.me/roms/get-download.php?gid={download_id}' \
'&token={token}' \
'&mirror_available=true'
self.requires_arguments = True
self.token = '211217baa2d87c57b360b9a673a12cfd'
Example source code for the Python HTMLParser() class
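All of the snippets collected here target the Python 2 HTMLParser module; on Python 3 the class lives in html.parser, and from 3.4 onward plain unescaping is done with html.unescape. A minimal compatibility shim (illustrative only, not taken from any of the projects quoted below) might look like this:

# Illustrative compatibility helper, not part of any project quoted on this page.
try:
    # Python 2: module and class share the name HTMLParser
    from HTMLParser import HTMLParser
    unescape = HTMLParser().unescape
except ImportError:
    # Python 3.4+: unescape lives in the html module
    from html import unescape

# unescape('&amp;') returns '&' on either interpreter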
def getXKCDImageTitle ( html ):
comicBlock = find_last_between( html, 'div id="comic"', "</div>")
if not comicBlock:
return None
imageTitle = find_last_between( comicBlock, "alt=", ">" )
    # Drop the srcset= attribute if it is present
imageTitle = imageTitle.split('srcset=')[0]
h = HTMLParser()
imageTitle = h.unescape(imageTitle)
imageTitle = imageTitle.replace('"', '').strip()
imageTitle = imageTitle.replace('/', '').strip()
return imageTitle
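A rough usage sketch for getXKCDImageTitle; the sample HTML is made up and the exact result depends on how find_last_between (defined elsewhere in that project) slices its arguments:

# Hypothetical input; the real caller passes the full xkcd page source.
sample = '<div id="comic"><img src="//imgs.xkcd.com/comics/x.png" alt="Some &quot;Title&quot;" /></div>'
title = getXKCDImageTitle(sample)
# With the usual find_last_between semantics this yields: Some Title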
# Garfield Minus Garfield Methods
def handle_charref(self, name):
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed in all supported versions.
# http://bugs.python.org/issue13633
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
real_name = int(name.lstrip('X'), 16)
else:
real_name = int(name)
try:
data = unichr(real_name)
    except (ValueError, OverflowError) as e:
data = u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
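For context, the workaround above resolves numeric character references by hand. A standalone sketch of the same conversion, outside any parser class (Python 2 spelling):

# Illustrative only: '&#x2019;' reaches handle_charref as 'x2019', '&#8217;' as '8217'.
def charref_to_text(name):
    if name.lower().startswith('x'):
        codepoint = int(name[1:], 16)
    else:
        codepoint = int(name)
    try:
        return unichr(codepoint)
    except (ValueError, OverflowError):
        return u"\N{REPLACEMENT CHARACTER}"

# charref_to_text('x2019') == charref_to_text('8217') == u'\u2019'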
def feed(self, markup):
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
try:
parser.feed(markup)
    except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
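The warning above is why most projects pass an explicit parser name to Beautiful Soup rather than relying on the HTMLParser fallback; a short illustration (assumes bs4 is installed, lxml optional):

# Sketch: naming the tree builder explicitly when constructing a soup.
from bs4 import BeautifulSoup

markup = "<p>One<p>Two"                        # sloppy HTML
soup = BeautifulSoup(markup, "html.parser")    # stdlib fallback parser
# soup = BeautifulSoup(markup, "lxml")         # preferred when lxml is available
print(soup.find_all("p"))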
def get_steps(protocol_id):
"""
Get steps of a protocol
:param protocol_id: int, protocol id
:return: list, list of unresolved steps
"""
step_list = []
steps = Protocol.objects.filter(parent=protocol_id).order_by('step_order')
html_parser = HTMLParser()
workspace_path = settings['env']['workspace']
for index, step in enumerate(steps):
# priority for self-compiled tool
        software_path = os.path.join(workspace_path, str(step.user_id), 'bin',
                                     str(step.software))
if os.path.exists(software_path) and os.path.isfile(software_path):
step.software = software_path
step_list.append({
'id': index,
'parameter': html_parser.unescape(str(step.software).rstrip() + " " + str(step.parameter)),
'specify_output': step.specify_output,
'hash': step.hash,
})
return step_list
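The unescape call matters because step parameters are stored HTML-escaped; a small illustration of what it undoes (the stored value below is invented):

# Hypothetical stored parameter; get_steps() turns it back into a shell-ready string.
from HTMLParser import HTMLParser    # Python 2; use html.unescape on Python 3.4+

stored = 'samtools view -q 30 -o &quot;out.bam&quot; &amp;&amp; echo done'
print(HTMLParser().unescape(stored))
# samtools view -q 30 -o "out.bam" && echo done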
def twitter_url(match, bot=None):
# Find the tweet ID from the URL
tweet_id = match.group(1)
# Get the tweet using the tweepy API
api = get_api(bot)
if not api:
return
try:
tweet = api.get_status(tweet_id)
user = tweet.user
except tweepy.error.TweepError:
return
    # Format and return the text of the tweet
text = " ".join(tweet.text.split())
if user.verified:
prefix = u"\u2713"
else:
prefix = ""
time = timesince.timesince(tweet.created_at, datetime.utcnow())
h = HTMLParser()
return u"{}@\x02{}\x02 ({}): {} ({} ago)".format(prefix, user.screen_name, user.name, h.unescape(text), time)
def insert_to(project_url, destination, find_what, indent=0):
url = ('%smagic/%s' % (project_url, destination)).replace('\\', '/')
response = urllib2.urlopen(url)
if response.getcode() == 200:
with open(destination, 'r') as dest:
dest_contents = dest.readlines()
lines = ''.join(dest_contents)
content = HTMLParser().unescape(response.read())
if content.replace(' ', '') in lines.replace(' ', ''):
print_out('IGNORED', destination)
return
generated = []
for line in dest_contents:
generated.append(line)
if line.lower().find(find_what.lower()) >= 0:
spaces = len(line) - len(line.lstrip())
for l in content.split('\n'):
if l:
generated.append('%s%s\n' % (' ' * (spaces + indent), l))
with open(destination, 'w') as dest:
for line in generated:
dest.write(line)
print_out('INSERT', destination)
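A hedged usage sketch for insert_to; the project URL, destination file, and anchor text below are placeholders, and print_out is assumed to be the project's own logging helper:

# Hypothetical call: fetch <project_url>magic/app.yaml and splice its unescaped
# contents into the local app.yaml just below the line containing 'libraries:'.
insert_to('https://example.com/project/', 'app.yaml', 'libraries:', indent=2)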
def process(keyword,page):
url='https://www.google.com/search?q=%s&start=%s&num=100'%(keyword,page*100)
urlinfos=[]
#urlinfo1={"url":"http://www.baidu.com/link?url=966OdUyxuwFJoAYx_XGYq7_FiVLcej4qEA3Q84e-lLAtLPRGGHA6tsNFNsTN9zka&wd=&eqid=a64931cc000026c3000000035994fd9e","title":"python Django?? ?????????????????..._???","info":'? W3School,???????????????? jQuery ??? jQuery ??jQuery ???? ?W3School,???????? jQuery ????????????? jQuery...'}
page = ct.crawlerTool.getPage(url)
#print page
#print url
segments = ct.crawlerTool.getXpath('//div[@class="g"]',page)
#print segments
for segment in segments:
#print segment
try:
            urlinfo = {}
            urlinfo['url'] = ct.crawlerTool.getXpath('//h3/a/@href', segment)[0]
            urlinfo['title'] = ct.crawlerTool.getXpath('//h3/a/text()', segment)[0]
            # info: the result snippet text shown under the title
            urlinfo['info'] = HTMLParser().unescape(
                ct.crawlerTool.extractorText(ct.crawlerTool.getXpath('//div[@class="s"]', segment)))
            # print urlinfo['url'], urlinfo['title'], urlinfo['info']
urlinfos.append(urlinfo)
except:
print('error')
traceback.print_exc()
return {"urlinfos":urlinfos}
def ParseGTestXML(xml_content):
"""Parse gtest XML result."""
results = []
html = HTMLParser.HTMLParser()
# TODO(jbudorick): Unclear how this handles crashes.
testsuites = xml.etree.ElementTree.fromstring(xml_content)
for testsuite in testsuites:
suite_name = testsuite.attrib['name']
for testcase in testsuite:
case_name = testcase.attrib['name']
result_type = base_test_result.ResultType.PASS
log = []
for failure in testcase:
result_type = base_test_result.ResultType.FAIL
log.append(html.unescape(failure.attrib['message']))
results.append(base_test_result.BaseTestResult(
'%s.%s' % (suite_name, TestNameWithoutDisabledPrefix(case_name)),
result_type,
int(float(testcase.attrib['time']) * 1000),
log=('\n'.join(log) if log else '')))
return results
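A minimal input sketch for ParseGTestXML; the XML below mimics gtest's report format, and base_test_result comes from the surrounding test framework:

# Hypothetical gtest XML report; times are in seconds, messages may contain entities.
sample_xml = """<testsuites>
  <testsuite name="FooTest">
    <testcase name="Passes" time="0.004" />
    <testcase name="Fails" time="0.010">
      <failure message="Expected &quot;a&quot; == &quot;b&quot;" />
    </testcase>
  </testsuite>
</testsuites>"""

for result in ParseGTestXML(sample_xml):
    print(result)    # BaseTestResult objects for FooTest.Passes and FooTest.Fails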
def unescape_html(html_):
"""
Replace HTML entities (e.g. `£`) in a string.
:param html_: The escaped HTML.
    :return: The input string with entities replaced.
"""
# http://stackoverflow.com/a/2360639
if sys.version_info.major == 2: # 2.7
# noinspection PyUnresolvedReferences,PyCompatibility
from HTMLParser import HTMLParser
return HTMLParser().unescape(html_)
if sys.version_info.minor == 3: # 3.3
# noinspection PyCompatibility
from html.parser import HTMLParser
# noinspection PyDeprecation
return HTMLParser().unescape(html_)
# 3.4+
# noinspection PyCompatibility
import html
return html.unescape(html_)
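For example (same result on 2.7, 3.3 and 3.4+):

print(unescape_html('&quot;Fish &amp; Chips&quot;'))
# "Fish & Chips"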
def feeds(page_url):
"""Search the given URL for possible feeds, returning a list of them."""
# If the URL is a feed, there's no need to scan it for links.
if is_feed(page_url):
return [page_url]
data = fetch_url(page_url)
parser = FeedFinder(page_url)
try:
parser.feed(data)
except HTMLParser.HTMLParseError:
pass
found = parser.urls()
# Return only feeds that feedparser can understand.
return [feed for feed in found if is_feed(feed)]
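FeedFinder is defined elsewhere in that module; a plausible sketch of it, as an HTMLParser subclass that collects <link rel="alternate"> feed URLs (an assumption, not the original implementation):

# Sketch only: one way FeedFinder could be built on the Python 2 HTMLParser module.
import HTMLParser
import urlparse

FEED_TYPES = ('application/rss+xml', 'application/atom+xml')

class FeedFinder(HTMLParser.HTMLParser):
    def __init__(self, base_url):
        HTMLParser.HTMLParser.__init__(self)
        self.base_url = base_url
        self._urls = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'link' and attrs.get('type') in FEED_TYPES and attrs.get('href'):
            self._urls.append(urlparse.urljoin(self.base_url, attrs['href']))

    def urls(self):
        return self._urls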
def _provider_auth(self, url, qs, username, password, html):
url += '?sid=0'
# prepare auth
r = self.session.post(url + '&id=tve&option=credential', proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})
# authenticate
post_data = {
'option': 'credential',
'urlRedirect': url,
'Ecom_User_ID': username,
'Ecom_Password': password,
}
r1 = self.session.post(url, data=post_data, proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})
r2 = self.session.get(url, proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})
try:
html_parser = HTMLParser.HTMLParser()
redirurl = re.findall(r'<form method=\"POST\" enctype=\"application/x-www-form-urlencoded\" action=\"(.*)\">', r2.text)[0]
argsre = dict([(match.group(1), html_parser.unescape(match.group(2))) for match in re.finditer(r'<input type=\"hidden\" name=\"(\w+)\" value=\"([^\"]+)\"/>', r2.text)])
return self.session.post(redirurl, data=argsre, proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})
    except Exception:
raise Exception('Invalid user name or password.')
def get_url(domain,port,timeout):
url_list = []
    if port == 443:
surl = 'https://' + domain
else:
surl = 'http://' + domain
res = urllib2.urlopen(surl, timeout=timeout)
html = res.read()
root_url = res.geturl()
m = re.findall("<(?:img|link|script)[^>]*?(?:src|href)=('|\")(.*?)\\1", html, re.I)
if m:
for url in m:
ParseResult = urlparse.urlparse(url[1])
if ParseResult.netloc and ParseResult.scheme:
if domain == ParseResult.hostname:
url_list.append(HTMLParser.HTMLParser().unescape(url[1]))
elif not ParseResult.netloc and not ParseResult.scheme:
url_list.append(HTMLParser.HTMLParser().unescape(urlparse.urljoin(root_url, url[1])))
return list(set(url_list))
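Usage sketch for get_url (the host is a placeholder); only same-host or relative asset URLs are returned, unescaped and resolved against the final URL after redirects:

# Hypothetical call: collect asset links from the front page of example.com over HTTP.
for u in get_url('example.com', 80, timeout=10):
    print(u)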
def ParseGTestXML(xml_content):
"""Parse gtest XML result."""
results = []
html = HTMLParser.HTMLParser()
# TODO(jbudorick): Unclear how this handles crashes.
testsuites = xml.etree.ElementTree.fromstring(xml_content)
for testsuite in testsuites:
suite_name = testsuite.attrib['name']
for testcase in testsuite:
case_name = testcase.attrib['name']
result_type = base_test_result.ResultType.PASS
log = []
for failure in testcase:
result_type = base_test_result.ResultType.FAIL
log.append(html.unescape(failure.attrib['message']))
results.append(base_test_result.BaseTestResult(
'%s.%s' % (suite_name, case_name),
result_type,
int(float(testcase.attrib['time']) * 1000),
log=('\n'.join(log) if log else '')))
return results
def test_cdata_with_closing_tags(self):
# see issue #13358
# make sure that HTMLParser calls handle_data only once for each CDATA.
# The normal event collector normalizes the events in get_events,
# so we override it to return the original list of events.
class Collector(EventCollector):
def get_events(self):
return self.events
content = """<!-- not a comment --> ¬-an-entity-ref;
<a href="" /> </p><p> & <span></span></style>
'</script' + '>' </html> </head> </scripter>!"""
for element in [' script', 'script ', ' script ',
'\nscript', 'script\n', '\nscript\n']:
s = u'<script>{content}</{element}>'.format(element=element,
content=content)
self._run_check(s, [("starttag", "script", []),
("data", content),
("endtag", "script")],
collector=Collector)
def lrc2dict(lrc):
time_stamps = re.findall(r'\[[^\]]+\]', lrc)
html_parser = HTMLParser.HTMLParser()
if time_stamps:
lyric = lrc
for tplus in time_stamps:
lyric = lyric.replace(tplus, '').replace('\r', '').replace('\n', '').replace('????','').replace('???','').replace('?????','').replace('???','').replace('??','').replace('??','').replace('??','').replace('??','')
lyric = lyric.replace('???', '').replace('??', '').replace('????', '').replace('???', '').replace('??', '').replace('???', '')
# tplus: [02:31.79]
# t 02:31.79
# print lyric
print html_parser.unescape(lyric)
return html_parser.unescape(lyric)
else:
return ''
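A usage sketch for lrc2dict with an invented two-line LRC fragment; the time tags are stripped and HTML entities in the remaining text are unescaped:

# Hypothetical LRC input.
sample_lrc = u"[00:01.00]Rock &amp; Roll \n[00:05.50]all night long"
lyric = lrc2dict(sample_lrc)
# lyric == u'Rock & Roll all night long' (time tags removed, '&amp;' unescaped)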