def process_post_bodies(bodies: List[Tag]) -> (str, list):
    """Yield the plain text and the cited names for every post body.

    This is a generator: it produces one ``(text, cites)`` tuple per element
    of *bodies*. The return annotation describes that yielded tuple, not the
    generator object itself.

    Args:
        bodies: Parsed post-body elements. Each one must support
            ``findAll`` and iteration over its children.

    Yields:
        ``(text, cites)`` where *text* is the concatenated text of the
        children that are not ``div``/``p`` (each ``div``/``p`` child
        contributes a single newline instead), and *cites* is the list of
        ``name`` values of the elements returned by
        ``body.findAll('div', {'class': 'cite'})`` (empty when none match).
    """
    block_tags = ('div', 'p')

    for body in bodies:
        # An empty search result simply produces an empty list of cites,
        # so no separate "nothing found" branch is needed.
        cites = [c['name'] for c in body.findAll('div', {'class': 'cite'})]

        collect_text = []
        for tag in body:
            # TODO: This is a suboptimal (and partially wrong) solution to
            # parse cites in post body (a lot to improve here)
            if tag.name in block_tags:
                # div/p children are not copied into the text; each one
                # only marks a line break.
                collect_text.append('\n')
            elif hasattr(tag, 'text'):
                collect_text.append(tag.text)
            elif isinstance(tag, NavigableString):
                collect_text.append(str(tag))

        # The loop never breaks, so the result is emitted unconditionally
        # once all children have been visited.
        yield ''.join(collect_text), cites
# 评论列表 (comment list)
# 文章目录 (table of contents)