DataTreeGrab.py 文件源码-python代码片段

DataTreeGrab.py 文件源码

python

阅读 28 收藏 0 点赞 0 评论 0

项目：DataTree 作者: tvgrabbers 项目源码文件源码

def __init__(self, data, autoclose_tags=None, print_tags = False, output = sys.stdout, warnaction = "default", warngoal = sys.stderr, caller_id = 0):
        """Build an HTML tree by feeding ``data`` through the HTMLParser machinery.

        On success ``self.start_node`` is the root node of the tree. If parsing
        raises, a warning is issued through ``self.warn`` and
        ``self.start_node`` is set to a ``NULLnode`` instead.

        Parameters:
            data: the HTML text to parse. It is searched for unicode tag
                strings and then passed to ``self.feed``.
            autoclose_tags: stored as ``self.autoclose_tags``. ``None`` (the
                default) gives every instance its own empty list.
                Presumably the tag names that never get a closing tag --
                verify against the tag handlers.
            print_tags: stored as ``self.print_tags``. Presumably a debug
                switch -- verify against the tag handlers.
            output, warnaction, warngoal, caller_id: forwarded unchanged to
                ``DATAtree.__init__``.
        """
        HTMLParser.__init__(self)
        DATAtree.__init__(self, output, warnaction, warngoal, caller_id)
        with self.tree_lock:
            self.tree_type ='html'
            self.print_tags = print_tags
            # A literal [] default would be created once and shared by every
            # instance constructed without this argument. A caller-supplied
            # object is still stored as-is.
            self.autoclose_tags = [] if autoclose_tags is None else autoclose_tags
            self.is_tail = False
            self.root = HTMLnode(self, 'root')
            self.current_node = self.root
            self.last_node = None
            self.text = u''
            self.open_tags = {}
            self.count_tags(data)
            # read the html page into the tree
            try:
                # Cover for incomplete reads where the essential body part is retrieved:
                # when an attribute-less opening tag is present without its
                # closing tag, append the closing tag to the data.
                for ctag in ('body', 'BODY', 'html', 'HTML', 'xml', 'XML'):
                    if u'<%s>' % (ctag, ) in data and u'</%s>' % (ctag, ) not in data:
                        data = u'%s</%s>' % (data, ctag)

                self.feed(data)
                # NOTE: HTMLParser.reset() discards any input the parser is
                # still buffering; close() is not called here.
                self.reset()
                self.start_node = self.root

            except Exception:
                # Only genuine errors are reported as a bad dataset;
                # KeyboardInterrupt and SystemExit now propagate to the caller.
                self.warn('Unable to parse the HTML data. Invalid dataset!', dtDataWarning, 1)
                self.start_node = NULLnode()