def __call__(self, doc, encoding='UTF-8'):
    """Extract a list of tags from an HTML document.

    Args:
        doc: The document to inspect. Either markup as ``str`` or
            ``bytes``, or an already parsed ``HtmlElement``.
        encoding: Encoding used to encode ``str`` markup and handed to
            ``HTMLParser`` when markup has to be parsed.

    Returns:
        The filtered tag list produced by the first class in
        ``self.EXTRACTORS`` whose result is accepted, or ``None`` when
        ``doc`` is not an ``HtmlElement`` (after parsing) or no extractor
        yields an accepted, non-empty result.
    """
    # Normalise markup to bytes before parsing. ``bytes`` input is used
    # as-is: ``bytearray(<bytes>, encoding=...)`` raises TypeError on
    # Python 3, so only ``str`` input may be encoded.
    if isinstance(doc, bytes):
        markup = doc
    elif isinstance(doc, str):
        markup = doc.encode(encoding)
    else:
        markup = None

    if markup is not None:
        doc = fromstring(markup, parser=HTMLParser(encoding=encoding))

    if not isinstance(doc, HtmlElement):
        return None

    for cls in self.EXTRACTORS:
        extract = cls()
        candidates = extract(doc)
        if not candidates:
            continue

        tags = []
        for idx, tag in enumerate(candidates):
            if idx < 2 and len(tag) > 16:
                # One of the first two entries is longer than 16: reject
                # this extractor's whole result and try the next one.
                break
            elif len(tag) < 16:
                tags.append(tag)
            # NOTE(review): an entry of exactly 16 characters is neither
            # kept nor treated as a rejection signal -- confirm intent.
        else:
            # Runs only when the loop above finished without ``break``.
            if tags:
                logger.info('TagExtractor got tags %s',
                            tags)
                return tags

    return None
评论列表
文章目录