def text(self, target=None, ignore_pureascii_words=False):
    """
    Get all visible text in the HTML, skipping scripts, styles and comments.

    Text nodes are also skipped when their direct parent carries an inline
    style that sets ``display:none`` or ``font-size:0px``.

    :param target: the BeautifulSoup object to walk, default self.b
    :param ignore_pureascii_words: if True, only return strings that contain
        at least one non-ASCII character (e.g. Chinese); may be useful for
        English-version websites
    :return: list of str, one whitespace-stripped entry per non-empty text node
    """
    from bs4 import Comment
    from bs4.element import NavigableString, Doctype

    def hides_text(style):
        """Return True if an inline style sets display:none or font-size:0px."""
        for declaration in style.split(";"):
            prop, _, value = declaration.partition(":")
            prop = prop.strip().lower()
            # Drop a trailing "!important" and all whitespace before comparing,
            # so "display: none" and "DISPLAY:none !important" both match.
            value = "".join(value.partition("!")[0].split()).lower()
            # Compare whole declarations: a bare substring test for "none"
            # would also hit "text-decoration:none" or "border:none".
            if (prop, value) in (("display", "none"), ("font-size", "0px")):
                return True
        return False

    if target is None:
        target = self.b

    result = []
    for descendant in target.descendants:
        # Tags themselves are skipped; only string nodes carry text.
        if not isinstance(descendant, NavigableString):
            continue
        # Comments and doctype declarations are string nodes but not page text.
        if isinstance(descendant, (Comment, Doctype)):
            continue
        parent = descendant.parent
        if parent.name in ("script", "style"):
            continue
        if hides_text(parent.get("style", "")):
            continue
        data = descendant.strip()
        if not data:
            continue
        if ignore_pureascii_words and not any(ord(ch) > 127 for ch in data):
            continue
        # NOTE(review): encode() without an argument relies on the
        # interpreter's default encoding -- confirm that it can represent
        # non-ASCII text on Python 2.
        result.append(data.encode() if PY2 else data)
    return result
python类Doctype()的实例源码
def text(self, target=None, ignore_pureascii_words=False):
    """
    Get all visible text in the HTML, skipping scripts, styles and comments.

    Text nodes are also skipped when their direct parent carries an inline
    style that sets ``display:none`` or ``font-size:0px``.

    :param target: the BeautifulSoup object to walk, default self.b
    :param ignore_pureascii_words: if True, only return strings that contain
        at least one non-ASCII character (e.g. Chinese); may be useful for
        English-version websites
    :return: list of str, one whitespace-stripped entry per non-empty text node
    """
    from bs4 import Comment
    from bs4.element import NavigableString, Doctype

    def hides_text(style):
        """Return True if an inline style sets display:none or font-size:0px."""
        for declaration in style.split(";"):
            prop, _, value = declaration.partition(":")
            prop = prop.strip().lower()
            # Drop a trailing "!important" and all whitespace before comparing,
            # so "display: none" and "DISPLAY:none !important" both match.
            value = "".join(value.partition("!")[0].split()).lower()
            # Compare whole declarations: a bare substring test for "none"
            # would also hit "text-decoration:none" or "border:none".
            if (prop, value) in (("display", "none"), ("font-size", "0px")):
                return True
        return False

    if target is None:
        target = self.b

    result = []
    for descendant in target.descendants:
        # Tags themselves are skipped; only string nodes carry text.
        if not isinstance(descendant, NavigableString):
            continue
        # Comments and doctype declarations are string nodes but not page text.
        if isinstance(descendant, (Comment, Doctype)):
            continue
        parent = descendant.parent
        if parent.name in ("script", "style"):
            continue
        if hides_text(parent.get("style", "")):
            continue
        data = descendant.strip()
        if not data:
            continue
        if ignore_pureascii_words and not any(ord(ch) > 127 for ch in data):
            continue
        # NOTE(review): encode() without an argument relies on the
        # interpreter's default encoding -- confirm that it can represent
        # non-ASCII text on Python 2.
        result.append(data.encode() if PY2 else data)
    return result
def is_doctype(self):
    """Report whether the wrapped context node is a ``Doctype`` instance."""
    node = self.context
    return isinstance(node, Doctype)
def _test_doctype(self, doctype_fragment):
    """Parse markup that opens with the given doctype and assert on the result.

    Builds ``<!DOCTYPE ...>`` from ``doctype_fragment``, appends a small
    paragraph, parses it with ``self.soup`` and checks the doctype node's
    class, its value, and how the document serializes.
    """
    declaration = '<!DOCTYPE %s>' % doctype_fragment
    parsed = self.soup(declaration + '<p>foo</p>')

    # The doctype is expected to be the first node of the parsed document.
    first_node = parsed.contents[0]
    self.assertEqual(first_node.__class__, Doctype)
    self.assertEqual(first_node, doctype_fragment)

    # Serializing the document must start with the exact declaration.
    rendered_prefix = str(parsed)[:len(declaration)]
    self.assertEqual(rendered_prefix, declaration)

    # The doctype must not have swallowed the rest of the markup: the
    # paragraph that follows it still has to be parsed into the tree.
    self.assertEqual(parsed.p.contents[0], 'foo')