def parse_rsc_html(htmlstring):
    """Parse messy RSC HTML and wrap orphan text in paragraphs before creating a selector.

    The markup is passed through ``UnicodeDammit`` to detect its encoding and is then
    parsed with a recovering ``HTMLParser``. Afterwards the direct children of the element
    with id ``wrapper`` are scanned: non-blank text trailing a block element (its ``tail``)
    is moved into a new ``<p class="otherpara">``, and the following siblings are moved
    into that paragraph until a block element, an element whose id starts with ``sect``,
    or the last child is reached.

    :param htmlstring: The HTML markup to parse.
    :returns: The root element of the parsed tree, modified in place.
    :raises UnicodeDecodeError: If ``UnicodeDammit`` produces no unicode markup.
    """
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        # UnicodeDecodeError takes (encoding, object, start, end, reason); the former
        # single-argument call raised TypeError and never filled in the '%s'.
        # NOTE(review): assumes UnicodeDammit exposes ``tried_encodings`` (as bs4 does);
        # getattr keeps this safe if it does not.
        tried = ', '.join(
            str(attempt[0] if isinstance(attempt, tuple) else attempt)
            for attempt in getattr(converted, 'tried_encodings', ())
        )
        # The exception's ``object`` argument must be a byte string.
        raw = bytes(htmlstring) if isinstance(htmlstring, (bytes, bytearray)) else b''
        raise UnicodeDecodeError(
            'unknown', raw, 0, len(raw),
            'Failed to detect encoding, tried [%s]' % tried
        )
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    wrapper = root.get_element_by_id('wrapper')
    newp = None
    # Iterate over a snapshot: children are moved out of ``wrapper`` inside the loop.
    for child in list(wrapper):
        if newp is not None:
            # ``or ''`` guards against nodes whose get() returns None instead of the
            # default (presumably comment nodes in lxml -- TODO confirm).
            starts_section = (child.get('id', '') or '').startswith('sect')
            if child.tag in BLOCK_ELEMENTS or starts_section or child.getnext() is None:
                # Paragraph is complete: place it just before the element that ends it.
                child.addprevious(newp)
                newp = None
            else:
                # Inline sibling following the orphan text: move it into the paragraph.
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            # Non-blank text after a block element: start a new paragraph holding it.
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    if newp is not None:
        # The last child was a block element with trailing text; without this the
        # paragraph was never attached and its text was dropped from the tree.
        wrapper.append(newp)
    return root
评论列表
文章目录