main_text.py 文件源码-python代码片段

def _parse_tags(cls, html):
        """Collect the direct text of every eligible tag in an HTML document.

        Each tag found in ``html`` is visited. Tags whose name is in the
        exclusion list, or whose dotted ancestor path contains the substring
        ``'table'``, are skipped. For the remaining tags, the direct children
        that are plain text (comments excluded, and fragments that re-parse
        to markup excluded) are stripped and joined with newlines. Joined
        text longer than 8 characters is normalised with
        ``cls._remove_chars`` and recorded once.

        Args:
            cls: Object providing ``_remove_chars``; only used for that call.
            html: HTML markup handed to ``BeautifulSoup`` with the
                ``'html.parser'`` backend.

        Returns:
            A 3-tuple ``(x, text_data, tag_signatures)`` of parallel
            sequences:

            * ``x`` -- numpy array holding the length of each normalised text,
            * ``text_data`` -- list of the normalised texts,
            * ``tag_signatures`` -- list of dotted ancestor paths
              (outermost ancestor first) of the tags the texts came from.
        """
        excluded_tags = frozenset((
            'script', 'style', 'noscript', 'html', 'head', 'meta',
            'link', 'body', 'input', 'form', 'a',
        ))
        minimum_text_node_length = 8

        y_data = []
        text_data = []
        tag_signatures = []
        # Mirrors text_data for constant-time duplicate detection.
        seen_texts = set()

        soup = BeautifulSoup(html, 'html.parser')

        for tag in soup.find_all():
            # Dotted names of all ancestors, outermost first.
            # NOTE(review): the tag's own name is not part of the recorded
            # signature; the original computed "path.tag_name" but never used
            # it -- confirm that recording only the ancestor path is intended.
            path = '.'.join(reversed([p.name for p in tag.parents if p]))

            # Substring test on the joined path, so any ancestor whose name
            # contains "table" excludes the tag.
            if tag.name in excluded_tags or 'table' in path:
                continue

            fragments = []
            for child in tag.contents:
                if isinstance(child, Comment):
                    continue
                try:
                    fragment = child.strip()
                    # Keep the fragment only if re-parsing it yields no tag,
                    # i.e. it is text rather than embedded markup.
                    if BeautifulSoup(fragment, 'html.parser').find() is None:
                        fragments.append(fragment)
                except Exception:
                    # Best effort: children that cannot be stripped or
                    # re-parsed (presumably nested tags rather than strings)
                    # contribute no direct text and are skipped.
                    continue

            raw_text = "\n".join(fragments)
            if len(raw_text) <= minimum_text_node_length:
                continue

            # Remove line returns and tabs *before* the duplicate check so
            # the comparison is made against texts in the same normalised
            # form as the ones already stored.
            clean_text = cls._remove_chars(raw_text)
            if not clean_text or clean_text in seen_texts:
                continue

            seen_texts.add(clean_text)
            y_data.append(len(clean_text))
            text_data.append(clean_text)
            tag_signatures.append(path)

        x = np.array(y_data)
        return x, text_data, tag_signatures