def html_to_markdown(html):
    """Render an HTML string as Markdown.

    Before conversion, inline styling carried by `<span>` and `<font>`
    elements is rewritten into semantic tags via `convert_span`, so that
    e.g. `<span style='font-weight:bold;'>foo</span>` is treated as
    `<strong>foo</strong>`. Highlighted code is normalised first by
    `clean_highlighted_code`.

    `<mark>` / `</mark>` tags are swapped for the `==` highlight marker
    before the text is handed to `to_md`.

    Returns the Markdown text with decoration-only lines and long runs
    of line breaks collapsed.
    """
    tree = fromstring(html)
    clean_highlighted_code(tree)

    # Collect every span/font element up front, then convert each one;
    # both lookups happen before any element is modified.
    styled_nodes = [
        node
        for path in ('.//span', './/font')
        for node in tree.findall(path)
    ]
    for node in styled_nodes:
        convert_span(node)

    serialized = tostring(tree).decode('utf-8')

    # Plain string substitution rather than tree manipulation:
    # crude, but sufficient for bare <mark> tags.
    for mark_tag in ('<mark>', '</mark>'):
        serialized = serialized.replace(mark_tag, '==')

    markdown = to_md(serialized)

    # The converter can emit a lot of stray whitespace. First blank out
    # lines made up solely of whitespace, asterisks and underscores,
    # then squeeze any streak of 3+ line breaks down to exactly two.
    markdown = re.sub(r'\n([\s\*_]+)\n', '\n\n', markdown)
    markdown = re.sub(r'\n{3,}', '\n\n', markdown)
    return markdown
评论列表
文章目录