def simplify_quotes(text):
    """Normalise typographic and LaTeX-style quotes in *text* to plain ASCII quotes.

    UnicodeDammit with smart_quotes_to="ascii" already handles many cases, but
    some smart quotes can still survive it.  On *output* we additionally want
    doubled backticks and doubled apostrophes folded into one double quote,
    because some NLTK tokenizers emit quotes in that form.  This function can
    therefore be used in the input cleaners chain, AFTER UnicodeDammit, and it
    can also be used from OutputProofreader.

    Returns a new string; every occurrence of each pattern is replaced.

    >>> text = b'Have some ``weird" “quotes” and curlies,” won’t you please. Quotes are ‘fun’'.decode('utf8')
    >>> print simplify_quotes(text)
    Have some "weird" "quotes" and curlies," won't you please. Quotes are 'fun'
    >>> print simplify_quotes(unichr(8220) + u"foo" + unichr(8221) + unichr(8216) + u"bar" + unichr(8217))
    "foo"'bar'
    >>> text = b'``weird" “quotes” aren’t very ‘fun’ I don’t think'.decode('utf8')
    >>> print simplify_quotes(text)
    "weird" "quotes" aren't very 'fun' I don't think
    """
    # (pattern, replacement) pairs, applied strictly in this order.  The
    # doubled-ASCII patterns run before the curly single quotes are converted,
    # so two adjacent curly single quotes end up as two apostrophes rather
    # than being merged into one double quote.
    substitutions = (
        (u"``", u'"'),
        (u"''", u'"'),
        (u'“', u'"'),
        (u'”', u'"'),
        (u'’', u"'"),
        (u'‘', u"'"),
    )
    cleaned = text
    for fancy, plain in substitutions:
        cleaned = cleaned.replace(fancy, plain)
    return cleaned
评论列表
文章目录