def _parse_tags(cls, html):
    """Collect the direct text of every eligible tag in an HTML document.

    Every tag found in ``html`` is visited. A tag is skipped when its name
    is in the exclusion list or when its dotted ancestor path contains the
    substring ``table``. For the remaining tags, the direct children that
    are text (comments excluded) are stripped and joined with newlines.
    The joined text is kept when it is longer than
    ``minimum_text_node_length`` characters, is still non-empty after
    ``cls._remove_chars`` and has not been recorded before.

    Args:
        html: HTML markup to parse.

    Returns:
        A tuple ``(x, text_data, tag_signatures)`` whose three members have
        the same length:

        * ``x`` -- NumPy array holding the length of each cleaned text.
        * ``text_data`` -- list of the cleaned texts, in document order.
        * ``tag_signatures`` -- list with the dotted ancestor path
          (outermost ancestor first, the tag's own name not included) of
          the tag each text came from.
    """
    excluded_tags = frozenset([
        'script', 'style', 'noscript', 'html', 'head', 'meta',
        'link', 'body', 'input', 'form', 'a',
    ])
    minimum_text_node_length = 8

    y_data = []
    text_data = []
    tag_signatures = []
    # Mirror of text_data used only for O(1) duplicate detection.
    seen_texts = set()

    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all():
        path = '.'.join(reversed([p.name for p in tag.parents if p]))
        if tag.name in excluded_tags or 'table' in path:
            continue

        pieces = []
        for child in tag.contents:
            if isinstance(child, Comment):
                continue
            try:
                stripped = child.strip()
                # Keep the text only if re-parsing it yields no tag at all.
                if BeautifulSoup(stripped, 'html.parser').find() is None:
                    pieces.append(stripped)
            except Exception:
                # Best effort: children that cannot be stripped or
                # re-parsed (presumably nested tags, which are visited on
                # their own by the outer loop) are ignored.
                continue

        raw_text = "\n".join(pieces)
        if len(raw_text) <= minimum_text_node_length:
            continue

        # Remove line returns and tabs
        cleaned_text = cls._remove_chars(raw_text)
        # Duplicates are detected on the cleaned text, because that is the
        # form stored in text_data.
        if not cleaned_text or cleaned_text in seen_texts:
            continue

        seen_texts.add(cleaned_text)
        y_data.append(len(cleaned_text))
        text_data.append(cleaned_text)
        # NOTE(review): the original also built ``path + '.' + tag.name``
        # but never used it; only the ancestor path is recorded. Confirm
        # that this is the intended signature.
        tag_signatures.append(path)

    x = np.array(y_data)
    return x, text_data, tag_signatures
评论列表
文章目录