util.py 文件源码-python代码片段

def unirange(a, b):
    """Return a regular expression string to match the given non-BMP range.

    ``a`` and ``b`` are the integer code points bounding the range
    (inclusive); both must be at least 0x10000 and ``a`` must not exceed
    ``b``.

    On wide builds (``sys.maxunicode > 0xffff``) the result is a simple
    character class ``[a-b]``.  On narrow builds the result is a
    non-capturing group of alternatives written in terms of the two values
    returned by ``_surrogatepair`` for each bound (presumably the high and
    low UTF-16 surrogate code units).

    Raises ValueError if ``b < a`` or if either bound is below 0x10000.
    """
    if b < a:
        raise ValueError("Bad character range")
    if a < 0x10000 or b < 0x10000:
        raise ValueError("unirange is only defined for non-BMP ranges")

    if sys.maxunicode > 0xffff:
        # wide build
        return u'[%s-%s]' % (unichr(a), unichr(b))

    # narrow build stores surrogates, and the 're' module handles them
    # (incorrectly) as characters.  Since there is still ordering among
    # these characters, expand the range to one that it understands.  Some
    # background in http://bugs.python.org/issue3665 and
    # http://bugs.python.org/issue12749
    #
    # Additionally, the lower constants are using unichr rather than
    # literals because jython [which uses the wide path] can't load this
    # file if they are literals.
    ah, al = _surrogatepair(a)
    bh, bl = _surrogatepair(b)
    if ah == bh:
        # Both bounds share the first unit: a single class over the second.
        return u'(?:%s[%s-%s])' % (unichr(ah), unichr(al), unichr(bl))

    # 0xdc00..0xdfff is the full span of second (low surrogate) units.
    low_first = unichr(0xdc00)
    low_last = unichr(0xdfff)

    # Lowest first unit: from ``al`` up to the end of the low-surrogate span.
    buf = [u'%s[%s-%s]' % (unichr(ah), unichr(al), low_last)]
    if bh - ah > 1:
        # First units strictly between ``ah`` and ``bh`` are fully covered,
        # so any second unit matches.
        buf.append(u'[%s-%s][%s-%s]' %
                   (unichr(ah + 1), unichr(bh - 1), low_first, low_last))
    # Highest first unit: from the start of the low-surrogate span to ``bl``.
    buf.append(u'%s[%s-%s]' % (unichr(bh), low_first, unichr(bl)))

    return u'(?:' + u'|'.join(buf) + u')'