def cleanup_command_line():
    """Replace typographic dashes and quotes in ``sys.argv`` with ASCII.

    Every element of ``sys.argv`` is rewritten in place:

    * EN DASH becomes ``-`` and EM DASH becomes ``--``;
    * left/right double quotation marks become ``"``;
    * left/right single quotation marks become ``'``.

    Nothing is done when ``sys.stdin.encoding`` is unset or is ``'ascii'``.

    Byte-string arguments (Python 2) are decoded with ``sys.stdin.encoding``,
    cleaned, and re-encoded with the ``'replace'`` error handler.  Text
    arguments (Python 3) are cleaned directly and stay text.
    """
    encoding = sys.stdin.encoding
    if not encoding or encoding == 'ascii':
        return
    conversion_pairs = {
        'EN DASH': '-',
        'EM DASH': '--',
        'LEFT DOUBLE QUOTATION MARK': '"',
        'RIGHT DOUBLE QUOTATION MARK': '"',
        'LEFT SINGLE QUOTATION MARK': "'",
        'RIGHT SINGLE QUOTATION MARK': "'",
    }
    # Resolve the character names once instead of once per argument.
    replacements = [(unicodedata.lookup(name), plain)
                    for name, plain in conversion_pairs.items()]
    for i, arg in enumerate(sys.argv):
        # On Python 2 ``bytes`` is ``str``, so arguments take the decode path;
        # on Python 3 they are already text and must not be decoded again.
        was_bytes = isinstance(arg, bytes)
        decoded = arg.decode(encoding) if was_bytes else arg
        for fancy, plain in replacements:
            decoded = decoded.replace(fancy, plain)
        # Should we be doing 'strict' here instead of 'replace'?
        sys.argv[i] = decoded.encode(encoding, 'replace') if was_bytes else decoded
# Example source code for Python's lookup()
def parse_repl_named_char(source):
    """Parse a ``{NAME}`` named-character reference in a replacement string.

    Returns the code point (an int) of the character called NAME.  If the
    text at the current position is not a complete ``{...}`` group, the
    source position is restored and None is returned.

    Raises ``error`` when the group is complete but NAME is not a known
    Unicode character name.
    """
    start = source.pos
    if source.match("{"):
        char_name = source.get_while(ALPHA | set(" "))
        if source.match("}"):
            try:
                return ord(unicodedata.lookup(char_name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed group: rewind so the caller can reparse the text.
    source.pos = start
    return None
def unicode_name_matches(self, text):
    u"""Complete a Latex-like ``\\CHARACTER NAME`` into the named character.

    Everything after the last backslash in ``text`` is treated as a Unicode
    character name, e.g. ``\\GREEK SMALL LETTER ETA`` yields the Greek
    small letter eta.

    Returns a pair ``(matched_text, [character])``.  The pair is
    ``('', [])`` when there is no backslash, the name is unknown, or the
    character could not be part of an identifier.

    Used on Python 3 only.
    """
    backslash_at = text.rfind('\\')
    if backslash_at < 0:
        return u'', []
    name = text[backslash_at + 1:]
    try:
        char = unicodedata.lookup(name)
    except KeyError:
        return u'', []
    # Prefix with 'a' so combining characters, which cannot start an
    # identifier but may continue one, are accepted as well.
    if ('a' + char).isidentifier():
        return '\\' + name, [char]
    return u'', []
def _token_splittable(token):
    """
    Tell whether a token name may be split into several tokens.

    A token is splittable when it contains no underscore, is not the name
    of a Greek letter, and is longer than one character. This is used to
    implicitly convert expressions like 'xyz' into 'x*y*z'.
    """
    if '_' in token:
        return False
    try:
        # lookup() returns a non-empty string on success, so this is always
        # False for a Greek letter name.
        return not unicodedata.lookup('GREEK SMALL LETTER ' + token)
    except KeyError:
        return len(token) > 1
def __init__(self, msg='', maxspin=0, minspin=10, speed=5):
    """Set up the spinner state and initialise the underlying thread.

    msg     -- message to print first, if any
    maxspin -- stored as ``self.max``
    minspin -- stored as ``self.min``
    speed   -- number of spins a second; must be non-zero because the wait
               time is computed as ``1 / (speed * 4)``
    """
    # Spin counter
    self.count = 0
    self.out = sys.stdout
    self.flag = False
    self.max, self.min = maxspin, minspin
    self.msg = msg
    # Complete printed string
    self.string = ''
    # Derive the wait time between frames from the requested speed.
    self.waittime = 1.0 / float(speed * 4)
    # The unicode dash character does not show up properly in the Windows
    # console, so a plain hyphen is used everywhere except on posix.
    first_frame = unicodedata.lookup('FIGURE DASH') if os.name == 'posix' else u'-'
    self.spinchars = (first_frame, u'\\ ', u'| ', u'/ ')
    threading.Thread.__init__(self, None, None, "Spin Thread")
def escape(m):
    """Return the character denoted by the backslash escape matched by ``m``.

    Group 0 of the match is the whole escape (starting with a backslash) and
    group 1 is the part after the backslash.  Simple escapes are taken from
    ``simple_escapes``; otherwise the first letter of the tail selects a hex
    (x), 4-digit (u), 8-digit (U) or named (N) escape, and anything else is
    read as octal.

    Raises ValueError for an unknown character name or an invalid octal
    escape.
    """
    whole, tail = m.group(0, 1)
    assert whole.startswith("\\")
    simple = simple_escapes.get(tail)
    if simple is not None:
        return simple
    if tail.startswith("x"):
        return chr(convert_hex(tail, 2))
    if tail.startswith('u'):
        return unichr(convert_hex(tail, 4))
    if tail.startswith('U'):
        return unichr(convert_hex(tail, 8))
    if tail.startswith('N'):
        import unicodedata
        # NOTE(review): assumes tail[1:-1] is the bare character name --
        # confirm against the pattern that produced ``m``.
        char_name = tail[1:-1]
        try:
            return unicodedata.lookup(char_name)
        except KeyError:
            raise ValueError("undefined character name %r" % char_name)
    try:
        return chr(int(tail, 8))
    except ValueError:
        raise ValueError("invalid octal string escape ('\\%s')" % tail)
def test_aliases(self):
    """Check that name aliases resolve to the right characters.

    For each (alias, code point) pair: the alias must pass ``checkletter``,
    must differ from the character's primary name, must look up to the same
    character as the primary name, and must be unknown to the 3.2.0
    database.
    """
    # Sample of the aliases defined in the NameAliases.txt file.  This
    # should be updated when new aliases are added, or the file should be
    # downloaded and parsed instead.  See #12753.
    aliases = [
        ('LATIN CAPITAL LETTER GHA', 0x01A2),
        ('LATIN SMALL LETTER GHA', 0x01A3),
        ('KANNADA LETTER LLLA', 0x0CDE),
        ('LAO LETTER FO FON', 0x0E9D),
        ('LAO LETTER FO FAY', 0x0E9F),
        ('LAO LETTER RO', 0x0EA3),
        ('LAO LETTER LO', 0x0EA5),
        ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
        ('YI SYLLABLE ITERATION MARK', 0xA015),
        ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
        ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5),
    ]
    for alias, codepoint in aliases:
        char = chr(codepoint)
        self.checkletter(alias, char)
        primary_name = unicodedata.name(char)
        self.assertNotEqual(primary_name, alias)
        self.assertEqual(unicodedata.lookup(alias),
                         unicodedata.lookup(primary_name))
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(alias)
def test_named_sequences_full(self):
    """Check every named sequence listed in NamedSequences.txt.

    The file matching ``unicodedata.unidata_version`` is fetched over the
    network; the test is skipped when it cannot be retrieved.  Each
    sequence must look up to its code points, must be rejected by
    ``checkletter`` with a SyntaxError, and must be unknown to the 3.2.0
    database.
    """
    url = ("http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" %
           unicodedata.unidata_version)
    try:
        testdata = support.open_urlresource(url, encoding="utf-8",
                                            check=check_version)
    except (IOError, HTTPException):
        self.skipTest("Could not retrieve " + url)
    self.addCleanup(testdata.close)
    for raw_line in testdata:
        entry = raw_line.strip()
        # Skip blank lines and comments.
        if not entry or entry.startswith('#'):
            continue
        seqname, hex_codes = entry.split(';')
        expected = ''.join(chr(int(cp, 16)) for cp in hex_codes.split())
        self.assertEqual(unicodedata.lookup(seqname), expected)
        with self.assertRaises(SyntaxError):
            self.checkletter(seqname, None)
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(seqname)
def start_unichar(self, attr):
    """Handle a ``<unichar/>`` tag given its attribute mapping ``attr``.

    The character comes from the ``name`` attribute (a Unicode character
    name) or from the ``code`` attribute (an expression giving the code
    point).  Problems are reported through ``self._syntax_error``; when
    that returns, a NUL character stands in for the invalid one.  With
    neither attribute, no character is emitted and any other attribute is
    reported as invalid.  The tag is always pushed as self-closing.
    """
    if 'name' in attr:
        if 'code' in attr:
            self._syntax_error('<unichar/> invalid with both name and code attributes')
        try:
            char = unicodedata.lookup(attr['name'])
        except KeyError:
            self._syntax_error('<unichar/> invalid name attribute\n"%s"' % ascii(attr['name']))
            char = '\0'
    elif 'code' in attr:
        try:
            # NOTE(review): eval() runs arbitrary code from the markup; this
            # is unsafe if the markup can come from an untrusted source.
            codepoint = int(eval(attr['code']))
            char = chr(codepoint) if isPy3 else unichr(codepoint)
        except:
            # NOTE(review): the bare except also catches SystemExit and
            # KeyboardInterrupt raised while evaluating the attribute.
            self._syntax_error('<unichar/> invalid code attribute %s' % ascii(attr['code']))
            char = '\0'
    else:
        char = None
        if attr:
            self._syntax_error('<unichar/> invalid attribute %s' % list(attr.keys())[0])

    if char is not None:
        self.handle_data(char)
    self._push('unichar', _selfClosingTag='unichar')
def unicode_name_matches(self, text):
    u"""Complete a Latex-like \\CHARACTER NAME into the named character.

    Everything after the last backslash in text is treated as a Unicode
    character name, e.g. \\GREEK SMALL LETTER ETA yields the Greek small
    letter eta.

    Returns a pair (matched_text, [character]).  The pair is ('', []) when
    there is no backslash, the name is unknown, or the character could not
    be part of an identifier.

    Used on Python 3 only.
    """
    backslash_at = text.rfind('\\')
    if backslash_at < 0:
        return u'', []
    name = text[backslash_at + 1:]
    try:
        char = unicodedata.lookup(name)
    except KeyError:
        return u'', []
    # Prefix with 'a' so combining characters, which cannot start an
    # identifier but may continue one, are accepted as well.
    if ('a' + char).isidentifier():
        return '\\' + name, [char]
    return u'', []
def test_aliases(self):
    """Check that name aliases resolve to the right characters.

    For each (alias, code point) pair: the alias must pass ``checkletter``,
    must differ from the character's primary name, must look up to the same
    character as the primary name, and must be unknown to the 3.2.0
    database.
    """
    # Sample of the aliases defined in the NameAliases.txt file.  This
    # should be updated when new aliases are added, or the file should be
    # downloaded and parsed instead.  See #12753.
    aliases = [
        ('LATIN CAPITAL LETTER GHA', 0x01A2),
        ('LATIN SMALL LETTER GHA', 0x01A3),
        ('KANNADA LETTER LLLA', 0x0CDE),
        ('LAO LETTER FO FON', 0x0E9D),
        ('LAO LETTER FO FAY', 0x0E9F),
        ('LAO LETTER RO', 0x0EA3),
        ('LAO LETTER LO', 0x0EA5),
        ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
        ('YI SYLLABLE ITERATION MARK', 0xA015),
        ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
        ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5),
    ]
    for alias, codepoint in aliases:
        char = chr(codepoint)
        self.checkletter(alias, char)
        primary_name = unicodedata.name(char)
        self.assertNotEqual(primary_name, alias)
        self.assertEqual(unicodedata.lookup(alias),
                         unicodedata.lookup(primary_name))
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(alias)
def test_named_sequences_full(self):
    """Check every named sequence listed in NamedSequences.txt.

    The file matching ``unicodedata.unidata_version`` is fetched over the
    network; the test is skipped when it cannot be retrieved.  Each
    sequence must look up to its code points, must be rejected by
    ``checkletter`` with a SyntaxError, and must be unknown to the 3.2.0
    database.
    """
    url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
           unicodedata.unidata_version)
    try:
        testdata = support.open_urlresource(url, encoding="utf-8",
                                            check=check_version)
    except (OSError, HTTPException):
        self.skipTest("Could not retrieve " + url)
    self.addCleanup(testdata.close)
    for raw_line in testdata:
        entry = raw_line.strip()
        # Skip blank lines and comments.
        if not entry or entry.startswith('#'):
            continue
        seqname, hex_codes = entry.split(';')
        expected = ''.join(chr(int(cp, 16)) for cp in hex_codes.split())
        self.assertEqual(unicodedata.lookup(seqname), expected)
        with self.assertRaises(SyntaxError):
            self.checkletter(seqname, None)
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(seqname)
def _greekletters(letterlist):
    """Register small and capital Greek-letter macros in ``_default_macro_list``.

    For each name in ``letterlist`` two ``(macro_name, character)`` pairs
    are appended: the name as given with the small letter, and the name
    with its first letter upper-cased with the capital letter.  The
    spelling 'lambda' is mapped to the Unicode name LAMDA; the small forms
    of epsilon and phi use the LUNATE EPSILON and PHI symbol characters.
    """
    small_name_overrides = {
        'EPSILON': "GREEK LUNATE EPSILON SYMBOL",
        'PHI': "GREEK PHI SYMBOL",
    }
    for letter in letterlist:
        unicode_letter = letter.upper()
        if unicode_letter == 'LAMBDA':
            unicode_letter = 'LAMDA'
        small_name = small_name_overrides.get(
            unicode_letter, "GREEK SMALL LETTER " + unicode_letter)
        _default_macro_list.append((letter, unicodedata.lookup(small_name)))
        capital_name = "GREEK CAPITAL LETTER " + unicode_letter
        _default_macro_list.append(
            (letter[0].upper() + letter[1:], unicodedata.lookup(capital_name))
        )
def _check_files():
    """Create the triggers file if needed, otherwise upgrade its entries.

    When ``TRIGGERS_PATH`` does not hold valid JSON it is written with
    ``DEFAULT_SETTINGS``.  Otherwise, as a backwards compatibility step,
    every entry of every list under 'text_triggers' and 'user_triggers' is
    passed to ``lookup``; entries it resolves are replaced by the result,
    entries raising KeyError are kept, and the file is saved again.
    """
    if not dataIO.is_valid_json(TRIGGERS_PATH):
        _LOGGER.info("Creating json: " + TRIGGERS_PATH)
        dataIO.save_json(TRIGGERS_PATH, DEFAULT_SETTINGS)
        return

    # Backwards compatibility check
    triggers = dataIO.load_json(TRIGGERS_PATH)
    for section in ('text_triggers', 'user_triggers'):
        for trigger, emoji_list in triggers[section].items():
            for position, entry in enumerate(emoji_list):
                try:
                    resolved = lookup(entry)
                except KeyError:
                    continue
                else:
                    emoji_list[position] = resolved
            triggers[section][trigger] = emoji_list
    dataIO.save_json(TRIGGERS_PATH, triggers)
def _token_splittable(token):
    """
    Tell whether a token name may be split into several tokens.

    A token is splittable when it contains no underscore, is not the name
    of a Greek letter, and is longer than one character. This is used to
    implicitly convert expressions like 'xyz' into 'x*y*z'.
    """
    if '_' in token:
        return False
    try:
        # lookup() returns a non-empty string on success, so this is always
        # False for a Greek letter name.
        return not unicodedata.lookup('GREEK SMALL LETTER ' + token)
    except KeyError:
        return len(token) > 1
def test_aliases(self):
    """Check that name aliases resolve to the right characters.

    For each (alias, code point) pair: the alias must pass ``checkletter``,
    must differ from the character's primary name, must look up to the same
    character as the primary name, and must be unknown to the 3.2.0
    database.
    """
    # Sample of the aliases defined in the NameAliases.txt file.  This
    # should be updated when new aliases are added, or the file should be
    # downloaded and parsed instead.  See #12753.
    aliases = [
        ('LATIN CAPITAL LETTER GHA', 0x01A2),
        ('LATIN SMALL LETTER GHA', 0x01A3),
        ('KANNADA LETTER LLLA', 0x0CDE),
        ('LAO LETTER FO FON', 0x0E9D),
        ('LAO LETTER FO FAY', 0x0E9F),
        ('LAO LETTER RO', 0x0EA3),
        ('LAO LETTER LO', 0x0EA5),
        ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
        ('YI SYLLABLE ITERATION MARK', 0xA015),
        ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
        ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5),
    ]
    for alias, codepoint in aliases:
        char = chr(codepoint)
        self.checkletter(alias, char)
        primary_name = unicodedata.name(char)
        self.assertNotEqual(primary_name, alias)
        self.assertEqual(unicodedata.lookup(alias),
                         unicodedata.lookup(primary_name))
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(alias)
def test_named_sequences_full(self):
    """Check every named sequence listed in NamedSequences.txt.

    The file matching ``unicodedata.unidata_version`` is fetched over the
    network; the test is skipped when it cannot be retrieved.  Each
    sequence must look up to its code points, must be rejected by
    ``checkletter`` with a SyntaxError, and must be unknown to the 3.2.0
    database.
    """
    url = ("http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" %
           unicodedata.unidata_version)
    try:
        testdata = support.open_urlresource(url, encoding="utf-8",
                                            check=check_version)
    except (OSError, HTTPException):
        self.skipTest("Could not retrieve " + url)
    self.addCleanup(testdata.close)
    for raw_line in testdata:
        entry = raw_line.strip()
        # Skip blank lines and comments.
        if not entry or entry.startswith('#'):
            continue
        seqname, hex_codes = entry.split(';')
        expected = ''.join(chr(int(cp, 16)) for cp in hex_codes.split())
        self.assertEqual(unicodedata.lookup(seqname), expected)
        with self.assertRaises(SyntaxError):
            self.checkletter(seqname, None)
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(seqname)
def unicode_name_matches(self, text):
    u"""Complete a Latex-like \\CHARACTER NAME into the named character.

    Everything after the last backslash in text is treated as a Unicode
    character name, e.g. \\GREEK SMALL LETTER ETA yields the Greek small
    letter eta.

    Returns a pair (matched_text, [character]).  The pair is ('', []) when
    there is no backslash, the name is unknown, or the character could not
    be part of an identifier.

    Used on Python 3 only.
    """
    backslash_at = text.rfind('\\')
    if backslash_at < 0:
        return u'', []
    name = text[backslash_at + 1:]
    try:
        char = unicodedata.lookup(name)
    except KeyError:
        return u'', []
    # Prefix with 'a' so combining characters, which cannot start an
    # identifier but may continue one, are accepted as well.
    if ('a' + char).isidentifier():
        return '\\' + name, [char]
    return u'', []
def unicode_name_matches(self, text):
    u"""Complete a Latex-like ``\\CHARACTER NAME`` into the named character.

    Everything after the last backslash in ``text`` is treated as a Unicode
    character name, e.g. ``\\GREEK SMALL LETTER ETA`` yields the Greek
    small letter eta.

    Returns a pair ``(matched_text, [character])``.  The pair is
    ``('', [])`` when there is no backslash, the name is unknown, or the
    character could not be part of an identifier.

    Used on Python 3 only.
    """
    backslash_at = text.rfind('\\')
    if backslash_at < 0:
        return u'', []
    name = text[backslash_at + 1:]
    try:
        char = unicodedata.lookup(name)
    except KeyError:
        return u'', []
    # Prefix with 'a' so combining characters, which cannot start an
    # identifier but may continue one, are accepted as well.
    if ('a' + char).isidentifier():
        return '\\' + name, [char]
    return u'', []
def insert_accented(self, c, accent):
    """Insert the accented form of letter ``c`` at the insertion point.

    The character is found by name, as
    ``latin <capital|small> letter <c> with <accent>``.

    Returns "break" when an accented character was inserted, which tells Tk
    to stop handling the event so the plain letter is not inserted too.
    Returns None when ``c`` is not alphabetic or no such accented character
    exists.
    """
    if not c.isalpha():
        return None
    case = 'capital' if c.isupper() else 'small'
    try:
        accented = lookup("latin %s letter %c with %s" % (case, c, accent))
    except KeyError:
        # No such accented letter: let normal handling insert ``c``.
        return None
    # Kept outside the try so that an error raised by the widget itself is
    # not mistaken for an unknown character name.
    self.insert(INSERT, accented)
    return "break"
def expand_unicode(s):
    r"""Convert a Unicode reference into the character it denotes.

    Recognised forms:

    * ``\uXXXX`` or ``\UXXXXXXXX`` -- hexadecimal code point after the
      two-character prefix;
    * ``\N{NAME}`` -- Unicode character name.

    Any other string is returned unchanged.

    Raises ValueError if the text after a ``\u``/``\U`` prefix is not a
    valid hexadecimal number, and ConfigError if NAME is not a known
    character name.
    """
    if s.startswith((r'\u', r'\U')):
        # Drop the prefix first: int() cannot parse the backslash and
        # letter, so passing the whole string always failed.
        return chr(int(s[2:], 16))
    if s.startswith(r'\N{'):
        name = s[3:-1]
        try:
            return unicodedata.lookup(name)
        except KeyError:
            raise ConfigError("Failed to find unicode value with name {}\n".format(name))
    return s
def u(s):
    """Build a Unicode string from ``s``, expanding Unicode escapes.

    The input is converted with ``unicode`` and then three substitutions
    are applied in order: ``_U16_RE`` and ``_U32_RE`` matches are replaced
    by the character whose code point is the hexadecimal 'hexval' group,
    and ``_UNAME_RE`` matches by the character named by the 'name' group.

    This is expected to work in the same way as a u'' literal would in
    Python 2.x, although it is not completely robust as it is based on a
    simple set of regexps.
    """
    def hex_to_char(match):
        return unichr(int(match.group('hexval'), 16))

    def name_to_char(match):
        return unicodedata.lookup(match.group('name'))

    expanded = re.sub(_U16_RE, hex_to_char, unicode(s))
    expanded = re.sub(_U32_RE, hex_to_char, expanded)
    expanded = re.sub(_UNAME_RE, name_to_char, expanded)
    return expanded
def u(s):
    """Build a Unicode string from ``s``, expanding Unicode escapes.

    The input is converted with ``unicode`` and then three substitutions
    are applied in order: ``_U16_RE`` and ``_U32_RE`` matches are replaced
    by the character whose code point is the hexadecimal 'hexval' group,
    and ``_UNAME_RE`` matches by the character named by the 'name' group.

    This is expected to work in the same way as a u'' literal would in
    Python 2.x, although it is not completely robust as it is based on a
    simple set of regexps.
    """
    def hex_to_char(match):
        return unichr(int(match.group('hexval'), 16))

    def name_to_char(match):
        return unicodedata.lookup(match.group('name'))

    expanded = re.sub(_U16_RE, hex_to_char, unicode(s))
    expanded = re.sub(_U32_RE, hex_to_char, expanded)
    expanded = re.sub(_UNAME_RE, name_to_char, expanded)
    return expanded
def dia_to_unicode(s):
    """
    Convert CELEX diacritic encodings in a string to Unicode.

    A diacritic marker applies to the character that follows it: the
    matching combining character is placed after that character and the
    result is NFC-normalized.  If several markers precede a character only
    the last one is used, and a marker with no following character is
    dropped.

    Parameters
    ----------
    s : string
        A string containing CELEX diacritics (see CELEX/english/eol/README
        for details)

    Returns
    -------
    s : string
        The corresponding unicode string
    """
    marker_names = {
        "#": "COMBINING ACUTE ACCENT",
        "`": "COMBINING GRAVE ACCENT",
        '"': "COMBINING DIAERESIS",
        "^": "COMBINING CIRCUMFLEX ACCENT",
        ",": "COMBINING CEDILLA",
        "~": "COMBINING TILDE",
        "@": "COMBINING RING ABOVE"}

    pieces = []
    pending = None
    for symbol in s:
        combining_name = marker_names.get(symbol)
        if combining_name is not None:
            # Remember the diacritic until its base character arrives.
            pending = unicodedata.lookup(combining_name)
            continue
        pieces.append(symbol)
        if pending:
            pieces.append(pending)
            pending = None
    # Compose base characters with their combining marks where possible.
    return unicodedata.normalize("NFC", "".join(pieces))
def parse_named_char(source, info, in_set):
    """Parse a ``{NAME}`` named-character reference.

    Returns ``make_character`` applied to the code point of the character
    called NAME.  If the text at the current position is not a complete
    ``{...}`` group, the source position is restored and the result is
    built from the letter "N" instead.

    Raises ``error`` when the group is complete but NAME is not a known
    Unicode character name.
    """
    start = source.pos
    if source.match("{"):
        char_name = source.get_while(NAMED_CHAR_PART)
        if source.match("}"):
            try:
                char = unicodedata.lookup(char_name)
                return make_character(info, ord(char), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed group: rewind and fall back to a plain "N".
    source.pos = start
    return make_character(info, ord("N"), in_set)
def unicode_name(self, name):
    """Return the text to insert for the Unicode character called ``name``.

    Code points up to 0xFF are returned as a backslash followed by three
    octal digits; higher code points are returned via ``compat.uchr``.
    Raises KeyError if ``name`` is not a known character name.
    """
    codepoint = ord(unicodedata.lookup(name))
    if codepoint > 0xFF:
        return compat.uchr(codepoint)
    return '\\%03o' % codepoint
def U(name):
    """Return the unicode character called ``name``, or None if not found.

    When the name is unknown, a line describing the miss is appended to the
    module-level ``unicode_warnings`` string.
    """
    global unicode_warnings
    try:
        return unicodedata.lookup(name)
    except KeyError:
        unicode_warnings += 'No \'%s\' in unicodedata\n' % name
        return None
def test_ascii_letters(self):
    """Check that lookup() and name() round-trip for every letter a-z."""
    import unicodedata

    # range() excludes its stop value, so add 1 to include "z" itself.
    for codepoint in range(ord("a"), ord("z") + 1):
        char = chr(codepoint)
        name = "LATIN SMALL LETTER %s" % char.upper()
        code = unicodedata.lookup(name)
        self.assertEqual(unicodedata.name(code), name)
def test_bmp_characters(self):
    """Check that name() and lookup() round-trip across the BMP.

    Every code point below 0x10000 that has a name must be returned by
    lookup() of that name; unnamed code points are skipped.
    """
    import unicodedata

    for code in range(0x10000):
        char = chr(code)
        # A default of None avoids the ValueError for unnamed code points.
        name = unicodedata.name(char, None)
        if name is not None:
            self.assertEqual(unicodedata.lookup(name), char)
def test_errors(self):
    """Check the exceptions raised by name() and lookup() on bad input.

    Calling either function without arguments, or name() with a
    two-character string, must raise TypeError; lookup() of an unknown
    name must raise KeyError.
    """
    import unicodedata

    failing_calls = (
        (TypeError, unicodedata.name, ()),
        (TypeError, unicodedata.name, ('xx',)),
        (TypeError, unicodedata.lookup, ()),
        (KeyError, unicodedata.lookup, ('unknown',)),
    )
    for expected, func, args in failing_calls:
        self.assertRaises(expected, func, *args)