coq_install_celex.py 文件源码

python
阅读 18 收藏 0 点赞 0 评论 0

项目:coquery 作者: gkunter 项目源码 文件源码
def dia_to_unicode(s):
    """
    Translate a string that contains CELEX encodings of diacritics to a
    Unicode string.

    Each diacritic marker is attached, as a Unicode combining character,
    to the next character in the string that is not itself a marker. The
    result is normalized to NFC, so that precomposed characters are used
    wherever Unicode provides them.

    Several markers in a row are all applied to the following character,
    in the order in which they occur. Markers at the end of the string
    that have no following character to attach to are kept verbatim
    instead of being discarded.

    Parameters
    ----------
    s : string
        A string containing CELEX diacritics (see CELEX/english/eol/README
        for details)

    Returns
    -------
    s : string
        The corresponding unicode string
    """

    # CELEX marker character -> name of the Unicode combining character
    encoded_diacritics = {
        "#": "COMBINING ACUTE ACCENT",
        "`": "COMBINING GRAVE ACCENT",
        '"': "COMBINING DIAERESIS",
        "^": "COMBINING CIRCUMFLEX ACCENT",
        ",": "COMBINING CEDILLA",
        "~": "COMBINING TILDE",
        "@": "COMBINING RING ABOVE"}

    # Markers seen since the last base character. A list is used (rather
    # than a single slot) so that stacked markers are not overwritten.
    pending = []
    char_list = []
    for ch in s:
        if ch in encoded_diacritics:
            pending.append(ch)
            continue

        # Unicode combining characters follow the base character they
        # modify, so emit the base first and then the queued diacritics.
        char_list.append(ch)
        if pending:
            char_list.extend(
                unicodedata.lookup(encoded_diacritics[marker])
                for marker in pending)
            pending = []

    # Markers without a following base character cannot be attached to
    # anything; keep them as literal characters so no input is lost.
    char_list.extend(pending)

    # join and normalize characters:
    return unicodedata.normalize("NFC", "".join(char_list))
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号