def unirange(a, b):
"""Returns a regular expression string to match the given non-BMP range."""
if b < a:
raise ValueError("Bad character range")
if a < 0x10000 or b < 0x10000:
raise ValueError("unirange is only defined for non-BMP ranges")
if sys.maxunicode > 0xffff:
# wide build
return u'[%s-%s]' % (unichr(a), unichr(b))
else:
# narrow build stores surrogates, and the 're' module handles them
# (incorrectly) as characters. Since there is still ordering among
# these characters, expand the range to one that it understands. Some
# background in http://bugs.python.org/issue3665 and
# http://bugs.python.org/issue12749
#
# Additionally, the lower constants are using unichr rather than
# literals because jython [which uses the wide path] can't load this
# file if they are literals.
ah, al = _surrogatepair(a)
bh, bl = _surrogatepair(b)
if ah == bh:
return u'(?:%s[%s-%s])' % (unichr(ah), unichr(al), unichr(bl))
else:
buf = []
buf.append(u'%s[%s-%s]' %
(unichr(ah), unichr(al),
ah == bh and unichr(bl) or unichr(0xdfff)))
if ah - bh > 1:
buf.append(u'[%s-%s][%s-%s]' %
unichr(ah+1), unichr(bh-1), unichr(0xdc00), unichr(0xdfff))
if ah != bh:
buf.append(u'%s[%s-%s]' %
(unichr(bh), unichr(0xdc00), unichr(bl)))
return u'(?:' + u'|'.join(buf) + u')'
评论列表
文章目录