def unicode_dammit(s, override_encodings=('utf-8', 'windows-1252', 'iso-8859-1', 'latin-1'), smart_quotes_to="ascii"):
    r"""Coerce text to unicode using ``bs4.UnicodeDammit``.

    Replaces (some) 'smart quotes' and fixes (some) mixed encodings. For what
    happens under the hood, see the docs (the bs4 source explains even more):
    https://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit

    This docstring is raw on purpose: the ``\x93``-style escapes below must
    reach doctest as written so they end up inside the bytes literal.

    >>> with_smart_quotes = b"I just \x93love\x94 your word processor\x92s smart quotes"
    >>> assert unicode_dammit(with_smart_quotes) == 'I just "love" your word processor\'s smart quotes'

    :param s: the text to coerce; passed straight through to ``UnicodeDammit``.
    :param override_encodings: encodings handed to ``UnicodeDammit`` as its
        ``override_encodings`` argument. Why these defaults: in short, they
        are commonly seen in the input texts this was tried on, whether mixed
        or not. Someday-maybe this can be configured with better control.
    :param smart_quotes_to: forwarded to ``UnicodeDammit`` as
        ``smart_quotes_to``; the default ``"ascii"`` is what produces the
        plain quotes in the example above.
    :return: the ``unicode_markup`` attribute of the ``UnicodeDammit`` result.
    """
    return UnicodeDammit(
        s,
        smart_quotes_to=smart_quotes_to,
        override_encodings=override_encodings,
    ).unicode_markup
# Comment list
# Article table of contents