parser.py 文件源码-python代码片段

parser.py 文件源码

python

阅读 28 收藏 0 点赞 0 评论 0

def process_post_bodies(bodies: List[Tag]) -> (str, list):
    """Yield the text and the cited names for every post body.

    This is a generator: for each element of ``bodies`` it yields one
    ``(text, cites)`` tuple (the return annotation describes a single
    yielded item, not the generator itself).

    * ``cites`` holds the ``name`` attribute of every ``div`` with class
      ``cite`` found inside the body; it is an empty list when the body
      contains no such element. Each matching ``div`` is indexed with
      ``['name']``, so one without that attribute raises instead of
      being skipped.
    * ``text`` is the concatenation of the body's direct children whose
      tag name is neither ``div`` nor ``p``.

    :param bodies: iterable of parsed post-body elements.
    """
    for body in bodies:
        # An empty search result simply produces an empty list, so no
        # separate "nothing cited" branch is needed.
        cites = [cite['name'] for cite in body.find_all('div', {'class': 'cite'})]

        collect_text = []
        for tag in body:
            # TODO: This is a suboptimal (and partially wrong) solution to parse
            # cites in post body (a lot to improve here)
            if tag.name in ('div', 'p'):
                # Block-level children (which include the cite containers)
                # are left out of the collected text.
                continue
            if hasattr(tag, 'text'):
                collect_text.append(tag.text)
            elif isinstance(tag, NavigableString):
                # Fallback for string nodes that do not expose ``.text``.
                collect_text.append(str(tag))
            else:
                # NOTE(review): only reached for a node that has no ``.text``
                # and is not a NavigableString; possibly meant for line
                # breaks such as <br> -- confirm the intent.
                collect_text.append('\n')

        yield ''.join(collect_text), cites