def parse_html(html_file):
    """
    Convert HTML markup to Unicode and parse it into an lxml element tree.

    The input is decoded with Beautiful Soup's UnicodeDammit class (with
    Microsoft smart quotes converted to HTML entities) and the resulting
    text is handed to lxml's HTML parser.

    html_file -- the markup to parse. Despite the name, it is passed to
                 UnicodeDammit as-is; this function does not open or
                 read a file itself.

    Returns whatever lxml.html.fromstring() produces for the decoded
    markup.

    Raises ValueError if the character encoding could not be detected
    or if the markup is empty. Exceptions raised by UnicodeDammit or
    lxml themselves (e.g. TypeError, lxml.etree.LxmlError) are not
    caught here and propagate to the caller.
    """
    unicode_html = UnicodeDammit(html_file, smart_quotes_to="html",
                                 is_html=True)
    markup = unicode_html.unicode_markup
    # UnicodeDammit leaves unicode_markup as None when none of the
    # candidate encodings could decode the input.
    if markup is None:
        raise ValueError("could not detect character encoding")
    # An empty (but not None) result means the input itself was empty.
    if not markup:
        raise ValueError("no HTML provided")
    return lxml.html.fromstring(markup)
评论列表
文章目录