# clean.py - Python source code snippet

def simplify_quotes(text):
    """Normalise TeX-style and typographic ("smart") quotes to plain ASCII quotes.

    Intended as a follow-up to UnicodeDammit's smart_quotes_to="ascii", which
    handles many cases but can still leave some odd quotes behind.  Besides the
    curly quotes, the doubled forms `` and '' are turned into a single "
    (some NLTK tokenizers emit those), which matters when cleaning *output*.

    Per the original notes, this is meant to run in the input cleaners chain
    AFTER UnicodeDammit, and may also be called from OutputProofreader.

    The substitutions are applied one after another in a fixed order; doubled
    ASCII quotes are collapsed before curly single quotes are converted, so two
    adjacent curly apostrophes end up as '' rather than ".

    :param text: the text to clean (anything offering a ``replace`` method).
    :return: the text with the quote variants below replaced by " or '.

        >>> text = b'Have some ``weird" “quotes” and curlies,”  won’t you please. Quotes are ‘fun’'.decode('utf8')
        >>> print simplify_quotes(text)
        Have some "weird" "quotes" and curlies,"  won't you please. Quotes are 'fun'
        >>> print simplify_quotes(unichr(8220) + u"foo" + unichr(8221) + unichr(8216) + u"bar" + unichr(8217))
        "foo"'bar'
        >>> text = b'``weird" “quotes” aren’t very ‘fun’ I don’t think'.decode('utf8')
        >>> print simplify_quotes(text)
        "weird" "quotes" aren't very 'fun' I don't think
    """
    # (fancy form, ASCII replacement) pairs, listed in the order they are applied.
    substitutions = (
        (u"``", u'"'),
        (u"''", u'"'),
        (u'“', u'"'),
        (u'”', u'"'),
        (u'’', u"'"),
        (u'‘', u"'"),
    )

    simplified = text
    for fancy, plain in substitutions:
        simplified = simplified.replace(fancy, plain)
    return simplified