lispider.py 文件源码

python
阅读 27 收藏 0 点赞 0 评论 0

项目:LiSpider 作者: jay7n 项目源码 文件源码
def ParseHtmlContent(self, html_content):
        """Match each configured hit-template element against an HTML document.

        For every entry in ``self.Config.HitTemplate['Elements']`` the entry is
        parsed into a template tree, ``html_content`` is parsed with the same
        bs4 parser, and every tag accepted by
        ``self._censorTagCandidateWithTemplate`` is then walked with
        ``self._parseTagRecursive``. Variables gathered along the way are
        handed to ``self._mergeTemplateVariablesWithCache``.

        Args:
            html_content: The HTML markup to search.

        Returns:
            None. Results are delivered through
            ``self._mergeTemplateVariablesWithCache``.
        """

        def _matches_template(candidate):
            # Filter callback given to ``find_all``. ``template_root`` is read
            # from the enclosing scope, so it refers to whichever template the
            # loop below is currently processing.
            captured = {}
            verdict = self._censorTagCandidateWithTemplate(candidate, template_root, captured)
            if verdict is not True:
                return verdict

            # Variables captured while censoring are merged as soon as the
            # candidate is accepted.
            self._mergeTemplateVariablesWithCache(captured)
            return verdict

        for raw_template in self.Config.HitTemplate['Elements']:
            template_soup = BeautifulSoup(
                self._stripWhitespaceAndReturnBeforeParsing(raw_template),
                self.bs4Parser,
            )

            # With 'html5lib' the template root is taken from inside <body>;
            # with any other parser it is the soup's first child.
            container = template_soup.body if self.bs4Parser == 'html5lib' else template_soup
            template_root = container.contents[0]

            if type(template_root) != element.Tag:
                # TODO: what do we do for this ?
                pass

            # NOTE(review): the document is stripped and re-parsed for every
            # template element; kept inside the loop to preserve behavior.
            document = BeautifulSoup(
                self._stripWhitespaceAndReturnBeforeParsing(html_content),
                self.bs4Parser,
            )

            for matched_tag in document.find_all(_matches_template):
                harvested = {}
                self._parseTagRecursive(matched_tag, template_root, harvested)

                # Only merge when the recursive parse actually collected something.
                if harvested:
                    self._mergeTemplateVariablesWithCache(harvested)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号