def getNodeDetails(self, node):
    """Classify a BeautifulSoup node and return its details as a tuple.

    The first item of the result is a ``_base`` node-type constant; the
    remaining items depend on which check matched first:

    * ``BeautifulSoup`` -> ``(_base.DOCUMENT,)``
    * ``Declaration``   -> ``(_base.DOCTYPE, name, publicId, systemId)``
    * ``Comment``       -> ``(_base.COMMENT, text)``
    * ``unicode``       -> ``(_base.TEXT, node)``
    * ``Tag``           -> ``(_base.ELEMENT, namespaces["html"], name,
      attribute items, contents)``
    * anything else     -> ``(_base.UNKNOWN, class name)``

    The checks are performed in exactly this order.

    Raises AssertionError when a Declaration's text is not matched by
    ``self.doctype_regexp``.
    """
    # Document or DocumentFragment
    if isinstance(node, BeautifulSoup):
        return (_base.DOCUMENT,)

    # DocumentType
    if isinstance(node, Declaration):
        text = unicode(node.string)
        # Some versions of BeautifulSoup/Python add the surrounding markup
        # during unicode conversion; strip it only when it is present.
        has_markup = text.startswith('<!') and text.endswith('>')
        if has_markup:
            text = text[2:-1]
        # BeautifulSoup stores the doctype as a single string, but the
        # separate parts are wanted here. The regexp approach is fragile:
        # it should work for trees created by html5lib itself but may be
        # wrong if the tree has been modified. Feeding the string to an
        # html5lib tokenizer would be an alternative.
        match = self.doctype_regexp.match(text)
        assert match is not None, "DOCTYPE did not match expected format"
        name = match.group('name')
        publicId = match.group('publicId')
        # The system id lives in a different group depending on whether a
        # public id was matched.
        system_group = 'systemId2' if publicId is None else 'systemId1'
        systemId = match.group(system_group)
        return _base.DOCTYPE, name, publicId or "", systemId or ""

    if isinstance(node, Comment):
        text = unicode(node.string)
        # Remove the comment delimiters if the conversion included them.
        if text.startswith('<!--') and text.endswith('-->'):
            text = text[4:-3]
        return _base.COMMENT, text

    # TextNode
    if isinstance(node, unicode):
        return _base.TEXT, node

    # Element
    if isinstance(node, Tag):
        attributes = dict(node.attrs).items()
        return (_base.ELEMENT, namespaces["html"], node.name,
                attributes, node.contents)

    return _base.UNKNOWN, node.__class__.__name__
python类Comment()的实例源码
def commentClass(self, data):
    """Create a comment node for ``data``.

    ``data`` is wrapped in a ``Comment``, which is then wrapped in a
    ``TextNode`` together with ``self.soup``.
    """
    comment = Comment(data)
    return TextNode(comment, self.soup)
def expand_html(html, cdict=None):
    """Expand the links found in the text nodes of an HTML string.

    ``html`` is parsed with BeautifulSoup and every comment node is removed.
    Each remaining text node then has its ``regex_link`` matches replaced by
    ``expand_one(matched_text, cdict)``, and the node is swapped for the
    re-parsed result. Text whose parent tag is ``a``, ``script``, ``pre``,
    ``code``, ``embed``, ``object``, ``audio`` or ``video`` is left as is.

    Returns ``str(soup)`` for the modified tree.

    Raises RuntimeError when ``have_soup`` is false.
    """
    if not have_soup:
        raise RuntimeError("Missing BeautifulSoup")

    # Text directly inside these tags must not be rewritten.
    untouched_parents = ('a', 'script', 'pre', 'code',
                         'embed', 'object', 'audio', 'video')

    def substitute(match):
        # Replacement callback: expand the whole matched text.
        return expand_one(match.group(0), cdict)

    soup = BeautifulSoup(html)

    # Remove every comment node before processing the text nodes.
    found_comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in found_comments:
        comment.extract()

    for text_node in soup.findAll(text=True):
        if text_node.parent.name in untouched_parents:
            continue
        expanded = regex_link.sub(substitute, text_node)
        text_node.replaceWith(BeautifulSoup(expanded))

    return str(soup)
def expand_html(html, cdict=None):
    """Expand the links found in the text nodes of an HTML string.

    ``html`` is parsed with BeautifulSoup and every comment node is removed.
    Each remaining text node then has its ``regex_link`` matches replaced by
    ``expand_one(matched_text, cdict)``, and the node is swapped for the
    re-parsed result. Text whose parent tag is ``a``, ``script``, ``pre``,
    ``code``, ``embed``, ``object``, ``audio`` or ``video`` is left as is.

    Returns ``str(soup)`` for the modified tree.

    Raises RuntimeError when ``have_soup`` is false.
    """
    if not have_soup:
        raise RuntimeError("Missing BeautifulSoup")

    # Text directly inside these tags must not be rewritten.
    untouched_parents = ('a', 'script', 'pre', 'code',
                         'embed', 'object', 'audio', 'video')

    def substitute(match):
        # Replacement callback: expand the whole matched text.
        return expand_one(match.group(0), cdict)

    soup = BeautifulSoup(html)

    # Remove every comment node before processing the text nodes.
    found_comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in found_comments:
        comment.extract()

    for text_node in soup.findAll(text=True):
        if text_node.parent.name in untouched_parents:
            continue
        expanded = regex_link.sub(substitute, text_node)
        text_node.replaceWith(BeautifulSoup(expanded))

    return str(soup)
def expand_html(html, cdict=None):
    """Expand the links found in the text nodes of an HTML string.

    ``html`` is parsed with BeautifulSoup and every comment node is removed.
    Each remaining text node then has its ``regex_link`` matches replaced by
    ``expand_one(matched_text, cdict)``, and the node is swapped for the
    re-parsed result. Text whose parent tag is ``a``, ``script``, ``pre``,
    ``code``, ``embed``, ``object``, ``audio`` or ``video`` is left as is.

    Returns ``str(soup)`` for the modified tree.

    Raises RuntimeError when ``have_soup`` is false.
    """
    if not have_soup:
        raise RuntimeError("Missing BeautifulSoup")

    # Text directly inside these tags must not be rewritten.
    untouched_parents = ('a', 'script', 'pre', 'code',
                         'embed', 'object', 'audio', 'video')

    def substitute(match):
        # Replacement callback: expand the whole matched text.
        return expand_one(match.group(0), cdict)

    soup = BeautifulSoup(html)

    # Remove every comment node before processing the text nodes.
    found_comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in found_comments:
        comment.extract()

    for text_node in soup.findAll(text=True):
        if text_node.parent.name in untouched_parents:
            continue
        expanded = regex_link.sub(substitute, text_node)
        text_node.replaceWith(BeautifulSoup(expanded))

    return str(soup)
def expand_html(html, cdict=None):
    """Expand the links found in the text nodes of an HTML string.

    ``html`` is parsed with BeautifulSoup and every comment node is removed.
    Each remaining text node then has its ``regex_link`` matches replaced by
    ``expand_one(matched_text, cdict)``, and the node is swapped for the
    re-parsed result. Text whose parent tag is ``a``, ``script``, ``pre``,
    ``code``, ``embed``, ``object``, ``audio`` or ``video`` is left as is.

    Returns ``str(soup)`` for the modified tree.

    Raises RuntimeError when ``have_soup`` is false.
    """
    if not have_soup:
        raise RuntimeError("Missing BeautifulSoup")

    # Text directly inside these tags must not be rewritten.
    untouched_parents = ('a', 'script', 'pre', 'code',
                         'embed', 'object', 'audio', 'video')

    def substitute(match):
        # Replacement callback: expand the whole matched text.
        return expand_one(match.group(0), cdict)

    soup = BeautifulSoup(html)

    # Remove every comment node before processing the text nodes.
    found_comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in found_comments:
        comment.extract()

    for text_node in soup.findAll(text=True):
        if text_node.parent.name in untouched_parents:
            continue
        expanded = regex_link.sub(substitute, text_node)
        text_node.replaceWith(BeautifulSoup(expanded))

    return str(soup)
def expand_html(html, cdict=None):
    """Expand the links found in the text nodes of an HTML string.

    ``html`` is parsed with BeautifulSoup and every comment node is removed.
    Each remaining text node then has its ``regex_link`` matches replaced by
    ``expand_one(matched_text, cdict)``, and the node is swapped for the
    re-parsed result. Text whose parent tag is ``a``, ``script``, ``pre``,
    ``code``, ``embed``, ``object``, ``audio`` or ``video`` is left as is.

    Returns ``str(soup)`` for the modified tree.

    Raises RuntimeError when ``have_soup`` is false.
    """
    if not have_soup:
        raise RuntimeError("Missing BeautifulSoup")

    # Text directly inside these tags must not be rewritten.
    untouched_parents = ('a', 'script', 'pre', 'code',
                         'embed', 'object', 'audio', 'video')

    def substitute(match):
        # Replacement callback: expand the whole matched text.
        return expand_one(match.group(0), cdict)

    soup = BeautifulSoup(html)

    # Remove every comment node before processing the text nodes.
    found_comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in found_comments:
        comment.extract()

    for text_node in soup.findAll(text=True):
        if text_node.parent.name in untouched_parents:
            continue
        expanded = regex_link.sub(substitute, text_node)
        text_node.replaceWith(BeautifulSoup(expanded))

    return str(soup)
def expand_html(html, cdict=None):
    """Expand the links found in the text nodes of an HTML string.

    ``html`` is parsed with BeautifulSoup and every comment node is removed.
    Each remaining text node then has its ``regex_link`` matches replaced by
    ``expand_one(matched_text, cdict)``, and the node is swapped for the
    re-parsed result. Text whose parent tag is ``a``, ``script``, ``pre``,
    ``code``, ``embed``, ``object``, ``audio`` or ``video`` is left as is.

    Returns ``str(soup)`` for the modified tree.

    Raises RuntimeError when ``have_soup`` is false.
    """
    if not have_soup:
        raise RuntimeError("Missing BeautifulSoup")

    # Text directly inside these tags must not be rewritten.
    untouched_parents = ('a', 'script', 'pre', 'code',
                         'embed', 'object', 'audio', 'video')

    def substitute(match):
        # Replacement callback: expand the whole matched text.
        return expand_one(match.group(0), cdict)

    soup = BeautifulSoup(html)

    # Remove every comment node before processing the text nodes.
    found_comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in found_comments:
        comment.extract()

    for text_node in soup.findAll(text=True):
        if text_node.parent.name in untouched_parents:
            continue
        expanded = regex_link.sub(substitute, text_node)
        text_node.replaceWith(BeautifulSoup(expanded))

    return str(soup)
def expand_html(html, cdict=None):
    """Expand the links found in the text nodes of an HTML string.

    ``html`` is parsed with BeautifulSoup and every comment node is removed.
    Each remaining text node then has its ``regex_link`` matches replaced by
    ``expand_one(matched_text, cdict)``, and the node is swapped for the
    re-parsed result. Text whose parent tag is ``a``, ``script``, ``pre``,
    ``code``, ``embed``, ``object``, ``audio`` or ``video`` is left as is.

    Returns ``str(soup)`` for the modified tree.

    Raises RuntimeError when ``have_soup`` is false.
    """
    if not have_soup:
        raise RuntimeError("Missing BeautifulSoup")

    # Text directly inside these tags must not be rewritten.
    untouched_parents = ('a', 'script', 'pre', 'code',
                         'embed', 'object', 'audio', 'video')

    def substitute(match):
        # Replacement callback: expand the whole matched text.
        return expand_one(match.group(0), cdict)

    soup = BeautifulSoup(html)

    # Remove every comment node before processing the text nodes.
    found_comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in found_comments:
        comment.extract()

    for text_node in soup.findAll(text=True):
        if text_node.parent.name in untouched_parents:
            continue
        expanded = regex_link.sub(substitute, text_node)
        text_node.replaceWith(BeautifulSoup(expanded))

    return str(soup)
def testSerializer(element):
    """Dump the tree rooted at ``element`` as text, one node per line.

    Output lines, by node type (checked in this order):

    * ``Declaration``   -> ``|<pad><!DOCTYPE name>`` or, when a public or
      system id was matched, ``|<pad><!DOCTYPE name "public" "system">``
    * ``BeautifulSoup`` -> ``#document-fragment`` when its name is
      ``[document_fragment]``, otherwise ``#document``
    * ``Comment``       -> ``|<pad><!-- text -->``
    * ``unicode``       -> ``|<pad>"text"``
    * anything else     -> ``|<pad><name>`` followed by one line per
      attribute (sorted), indented two more spaces

    Children (``element.contents``, when that attribute exists) are dumped
    recursively with the indent increased by two.

    Returns the lines joined with newlines.

    Raises AssertionError when a Declaration's string does not match the
    expected DOCTYPE format.
    """
    import re

    # The pattern is constant, so compile it once for the whole dump.
    doctype_pattern = re.compile(
        r'DOCTYPE\s+(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?')
    lines = []

    def serializeElement(element, indent=0):
        pad = ' '*indent

        if isinstance(element, Declaration):
            match = doctype_pattern.match(element.string)
            assert match is not None, "DOCTYPE did not match expected format"
            name = match.group('name')
            publicId = match.group('publicId')
            # With a public id the system id defaults to "", so the long
            # form is always emitted; without one it may stay None.
            if publicId is None:
                systemId = match.group('systemId2')
            else:
                systemId = match.group('systemId1') or ""

            if publicId is None and systemId is None:
                lines.append("|%s<!DOCTYPE %s>"%(pad, name))
            else:
                lines.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
                             (pad, name, publicId or "", systemId or ""))
        elif isinstance(element, BeautifulSoup):
            is_fragment = element.name == "[document_fragment]"
            lines.append("#document-fragment" if is_fragment else "#document")
        elif isinstance(element, Comment):
            lines.append("|%s<!-- %s -->"%(pad, element.string))
        elif isinstance(element, unicode):
            lines.append("|%s\"%s\"" %(pad, element))
        else:
            lines.append("|%s<%s>"%(pad, element.name))
            if element.attrs:
                attr_pad = ' '*(indent+2)
                for name, value in sorted(element.attrs):
                    lines.append('|%s%s="%s"' % (attr_pad, name, value))

        # Children of any node that has them go one level (two spaces) deeper.
        if hasattr(element, "contents"):
            for child in element.contents:
                serializeElement(child, indent + 2)

    serializeElement(element, 0)
    return "\n".join(lines)