lispider.py 文件源码

python
阅读 25 收藏 0 点赞 0 评论 0

项目:LiSpider 作者: jay7n 项目源码 文件源码
def _censorTagCandidateWithTemplate(self, candi_tag, template_tag, template_var_cache):
        """Decide whether ``candi_tag`` satisfies ``template_tag``.

        The candidate matches when both arguments are exactly ``element.Tag``
        objects (presumably bs4's ``Tag`` -- confirm against the file's
        imports), their tag names are equal, and every attribute of the
        template is satisfied by the candidate:

        * a template value of ``'%%'`` is an empty variable and is skipped;
        * a template value in which ``self.RegPattern`` finds a match is a
          variable: group 1 of the match is the variable name, and the
          candidate's value is handed to ``self._procTemplateVariable``
          together with ``template_var_cache``;
        * any other template value must equal the candidate's value.

        Multi-valued attributes (values delivered as a list, e.g. ``class``
        or ``rel``) are compared as single space-joined strings.

        :param candi_tag: the tag being tested.
        :param template_tag: the tag describing what is expected.
        :param template_var_cache: passed through unchanged to
            ``self._procTemplateVariable``.
        :return: ``True`` if the candidate matches the template, else ``False``.
        """
        # Exact type comparison is kept on purpose: subclasses of Tag are
        # rejected here, exactly as before.
        if type(candi_tag) is not element.Tag or type(template_tag) is not element.Tag:
            return False

        if candi_tag.name != template_tag.name:
            self.logger.debug('tag name inequality: \'%s\' is not equal to \'%s\'',
                              candi_tag.name, template_tag.name)
            return False

        for tmpAttrKey, tmpAttrValue in getDictIterItems(template_tag.attrs):
            # Normalise list-valued attributes to one string *before* any
            # comparison, so that the '%%' marker and the variable pattern
            # also work for attributes such as class="%%" or rel="%name%".
            # Checking the value's type (rather than the attribute name)
            # covers every multi-valued attribute and leaves plain strings
            # untouched.
            if isinstance(tmpAttrValue, (list, tuple)):
                tmpAttrValue = ' '.join(tmpAttrValue)

            if tmpAttrValue == '%%':
                # this means an empty variable,
                # indicating that it is expected to be ignored.
                continue

            if not candi_tag.has_attr(tmpAttrKey):
                self.logger.debug(candi_tag)
                self.logger.debug('tag attr not exsits: no attr \'%s\' in \'%s\'',
                                  tmpAttrKey, candi_tag.name)
                return False

            candiAttrValue = candi_tag[tmpAttrKey]
            if isinstance(candiAttrValue, (list, tuple)):
                candiAttrValue = ' '.join(candiAttrValue)

            matchObj = self.RegPattern.search(tmpAttrValue)

            if matchObj is not None:
                # Template value is a variable: record the candidate's value
                # under the captured variable name.
                varName = matchObj.group(1)
                self._procTemplateVariable(varName, candiAttrValue, template_var_cache)

            elif tmpAttrValue != candiAttrValue:
                self.logger.debug(candi_tag)
                self.logger.debug('tag attr inequality: \'%s\' is not equal to \'%s\' in \'%s\'',
                                  tmpAttrValue, candiAttrValue, candi_tag.name)
                return False

        return True
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号