def ParseHtmlContent(self, html_content):
    """Match each configured hit template against ``html_content``.

    Every entry of ``self.Config.HitTemplate['Elements']`` is cleaned with
    ``self._stripWhitespaceAndReturnBeforeParsing`` and parsed with
    ``self.bs4Parser``; the first top-level node of the parsed template is
    used as the template root. ``html_content`` is then cleaned and parsed
    the same way, searched for tags that
    ``self._censorTagCandidateWithTemplate`` accepts, and each accepted tag
    is passed to ``self._parseTagRecursive`` together with the template
    root. Collected template variables are handed to
    ``self._mergeTemplateVariablesWithCache``.

    Args:
        html_content: Markup to search; it is passed to
            ``self._stripWhitespaceAndReturnBeforeParsing`` before parsing.

    Returns:
        None. Results are delivered through
        ``self._mergeTemplateVariablesWithCache``.
    """

    def _acceptsCandidate(tag):
        # Filter callback for find_all(). ``templateRoot`` is read from the
        # enclosing scope at call time, so it always refers to the template
        # of the loop iteration that is currently running.
        collected = {}
        verdict = self._censorTagCandidateWithTemplate(tag, templateRoot, collected)
        if verdict is True:
            # Variables gathered while checking an accepted tag are merged
            # right away, before the candidate is parsed again below.
            self._mergeTemplateVariablesWithCache(collected)
        return verdict

    # NOTE(review): the source reached review with its indentation stripped;
    # the page parsing and candidate loop are assumed to belong inside this
    # per-template loop -- confirm against the original file.
    for rawTemplate in self.Config.HitTemplate['Elements']:
        templateMarkup = self._stripWhitespaceAndReturnBeforeParsing(rawTemplate)
        templateSoup = BeautifulSoup(templateMarkup, self.bs4Parser)

        # With html5lib the template's first node is read from under
        # ``.body``; with any other parser it is read from the soup itself.
        container = templateSoup.body if self.bs4Parser == 'html5lib' else templateSoup
        templateRoot = container.contents[0]

        if not isinstance(templateRoot, element.Tag):
            # TODO: what do we do for this ?
            pass

        pageMarkup = self._stripWhitespaceAndReturnBeforeParsing(html_content)
        pageSoup = BeautifulSoup(pageMarkup, self.bs4Parser)

        for candidate in pageSoup.find_all(_acceptsCandidate):
            collected = {}
            self._parseTagRecursive(candidate, templateRoot, collected)
            if collected:
                self._mergeTemplateVariablesWithCache(collected)
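# 评论列表 (comment list) -- web-page navigation residue, not part of the code
# 文章目录 (article table of contents) -- web-page navigation residue, not part of the code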