main_text.py 文件源码

python
阅读 17 收藏 0 点赞 0 评论 0

项目:feature_engineering 作者: webeng 项目源码 文件源码
def _parse_tags(cls, html):
    """Collect candidate text nodes from an HTML document.

    Walks every tag in ``html``. For each tag whose name is not in the
    excluded list and whose ancestor path does not contain ``'table'``,
    the tag's direct text children are stripped, joined with newlines and
    kept if the joined text is longer than ``minimum_text_node_length``
    and has not been collected already.

    Args:
        html: HTML markup, parsed with ``BeautifulSoup(html, 'html.parser')``.

    Returns:
        A 3-tuple ``(x, text_data, tag_signatures)`` whose members have the
        same number of entries:

        * ``x`` -- NumPy array with the length of each kept text,
        * ``text_data`` -- the kept texts, after ``cls._remove_chars``,
        * ``tag_signatures`` -- for each kept text, the dot-joined names of
          the ancestors of the tag it came from (outermost first).
    """
    excluded_tags = frozenset([
        'script', 'style', 'noscript', 'html', 'head', 'meta',
        'link', 'body', 'input', 'form', 'a',
    ])
    minimum_text_node_length = 8

    y_data = []
    text_data = []
    tag_signatures = []
    # Mirrors text_data for O(1) duplicate checks.
    seen_texts = set()

    soup = BeautifulSoup(html, 'html.parser')

    for tag in soup.find_all():
        # Ancestor names, outermost first. The tag's own name is not part
        # of the path.
        # NOTE(review): the previous version also built
        # '.'.join([path, tag.name]) but never used it; the returned
        # signature has always been the ancestor path only. Confirm with
        # callers before switching to the longer form.
        path = '.'.join(reversed([p.name for p in tag.parents if p]))

        # 'table' is tested as a substring of the dotted path, so any
        # ancestor whose name merely contains 'table' also excludes the tag.
        if tag.name in excluded_tags or 'table' in path:
            continue

        pieces = []
        for child in tag.contents:
            if isinstance(child, Comment):
                continue
            try:
                stripped = child.strip()
                # Keep only strings that do not themselves parse into markup.
                if BeautifulSoup(stripped, 'html.parser').find() is None:
                    pieces.append(stripped)
            except Exception:
                # Deliberate best effort, as before: children that cannot
                # be stripped or parsed (presumably nested tags, which are
                # not strings -- TODO confirm) are skipped.
                continue

        raw_text = "\n".join(pieces)

        # The length threshold applies to the text *before* cleaning.
        if len(raw_text) <= minimum_text_node_length:
            continue
        if raw_text in seen_texts:
            continue

        # Remove line returns and tabs
        cleaned = cls._remove_chars(raw_text)

        # Also compare the cleaned form: that is what gets stored, so
        # checking only the raw text would miss repeated nodes whose text
        # is changed by the cleaning step.
        if cleaned and cleaned not in seen_texts:
            seen_texts.add(cleaned)
            y_data.append(len(cleaned))
            text_data.append(cleaned)
            tag_signatures.append(path)

    x = np.array(y_data)
    return x, text_data, tag_signatures
评论列表


问题


面经


文章

微信
公众号

扫码关注公众号