def dia_to_unicode(s):
    """
    Translate a string containing CELEX diacritic encodings to Unicode.

    Each marker character listed in the table below is removed from the
    text and the corresponding combining mark is attached to the next
    non-marker character. The result is then NFC-normalized so that
    base-plus-mark pairs collapse into precomposed characters where such
    characters exist.

    Parameters
    ----------
    s : string
        A string containing CELEX diacritics (see CELEX/english/eol/README
        for details)

    Returns
    -------
    s : string
        The corresponding unicode string

    Notes
    -----
    If several markers occur in a row, only the last one is applied to the
    following character. A marker that is not followed by any ordinary
    character (e.g. at the very end of the string) is dropped.
    """
    # Marker character -> combining mark. The named escapes are resolved
    # when the function is compiled, so no name lookup happens per call.
    combining_marks = {
        "#": "\N{COMBINING ACUTE ACCENT}",
        "`": "\N{COMBINING GRAVE ACCENT}",
        '"': "\N{COMBINING DIAERESIS}",
        "^": "\N{COMBINING CIRCUMFLEX ACCENT}",
        ",": "\N{COMBINING CEDILLA}",
        "~": "\N{COMBINING TILDE}",
        "@": "\N{COMBINING RING ABOVE}",
    }

    pending_mark = None
    pieces = []
    for ch in s:
        mark = combining_marks.get(ch)
        if mark is not None:
            # Remember the mark; it belongs to the next ordinary character.
            pending_mark = mark
            continue

        pieces.append(ch)
        if pending_mark is not None:
            # A combining mark must follow its base character.
            pieces.append(pending_mark)
            pending_mark = None

    # Join and compose base + combining mark sequences.
    return unicodedata.normalize("NFC", "".join(pieces))
# [web-page residue] Comment list
# [web-page residue] Table of contents