def __init__(self, data, autoclose_tags=None, print_tags = False, output = sys.stdout, warnaction = "default", warngoal = sys.stderr, caller_id = 0):
    """
    Parse ``data`` as HTML and build the node tree below ``self.root``.

    data           -- the HTML text to parse; it is searched for tag
                      strings and handed to HTMLParser.feed().
    autoclose_tags -- list stored on the instance as ``autoclose_tags``
                      (presumably tags to close implicitly -- confirm
                      against the tag handlers). Defaults to a new
                      empty list for every instance.
    print_tags     -- flag stored on the instance as ``print_tags``.
    output, warnaction, warngoal, caller_id
                   -- passed unchanged to DATAtree.__init__().

    When parsing raises an exception a dtDataWarning is issued through
    self.warn() and ``start_node`` is set to a NULLnode instead of the
    tree root.
    """
    HTMLParser.__init__(self)
    DATAtree.__init__(self, output, warnaction, warngoal, caller_id)
    with self.tree_lock:
        self.tree_type ='html'
        self.print_tags = print_tags
        # A literal [] default would be one list object shared by every
        # instance created without this argument, so create it here.
        # A list supplied by the caller is still stored by reference.
        self.autoclose_tags = [] if autoclose_tags is None else autoclose_tags
        self.is_tail = False
        self.root = HTMLnode(self, 'root')
        self.current_node = self.root
        self.last_node = None
        self.text = u''
        self.open_tags = {}
        self.count_tags(data)
        # read the html page into the tree
        try:
            # Cover for incomplete reads where the essential body part is
            # retrieved: append the closing tag when only the opening
            # tag is present in the data.
            for ctag in ('body', 'BODY', 'html', 'HTML', 'xml', 'XML'):
                if u'<%s>' % (ctag, ) in data and u'</%s>' % (ctag, ) not in data:
                    data = u'%s</%s>' % (data, ctag)

            self.feed(data)
            # HTMLParser.reset() drops any input still buffered by the
            # parser.
            self.reset()
            self.start_node = self.root

        except Exception:
            # Only real errors are turned into a warning; KeyboardInterrupt
            # and SystemExit are allowed to propagate to the caller.
            self.warn('Unable to parse the HTML data. Invalid dataset!', dtDataWarning, 1)
            self.start_node = NULLnode()
评论列表
文章目录