python类normalize()的实例源码

bottle.py 文件源码 项目:Mmrz-Sync 作者: zhanglintc 项目源码 文件源码 阅读 15 收藏 0 点赞 0 评论 0
def filename(self):
        ''' Sanitized version of the client-side file name.

            Starting from ``self.raw_filename`` the name is decoded as UTF-8
            when it is not already unicode (undecodable bytes are dropped),
            decomposed with NFKD and squeezed through ASCII so accents and
            other non-ASCII characters disappear, and reduced to its last
            path component (backslashes count as separators). Anything
            other than ASCII letters, digits, dashes, underscores, dots and
            whitespace is then removed, runs of whitespace/dashes collapse
            into one dash, and leading or trailing dots and dashes are
            stripped.

            Returns at most 255 characters, or ``'empty'`` when nothing is
            left.
        '''
        raw = self.raw_filename
        if not isinstance(raw, unicode):
            raw = raw.decode('utf8', 'ignore')
        # NFKD separates base letters from combining marks; the ASCII
        # round-trip then discards every non-ASCII code point.
        ascii_bytes = normalize('NFKD', raw).encode('ASCII', 'ignore')
        name = ascii_bytes.decode('ASCII')
        # Keep only the final path component, whichever separator was used.
        name = name.replace('\\', os.path.sep)
        name = os.path.basename(name)
        name = re.sub(r'[^a-zA-Z0-9-_.\s]', '', name).strip()
        name = re.sub(r'[-\s]+', '-', name).strip('.-')
        return name[:255] or 'empty'
normalize.py 文件源码 项目:ChemDataExtractor 作者: mcs07 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def __init__(self, form='NFKC', strip=True, collapse=True, hyphens=False, quotes=False, ellipsis=False,
                 slashes=False, tildes=False):
        """Store the normalization options on the instance.

        Every argument is saved unchanged under an attribute of the same name.

        :param string form: Normal form for unicode normalization.
        :param bool strip: Whether to strip whitespace from start and end.
        :param bool collapse: Whether to collapse all whitespace (tabs, newlines) down to single spaces.
        :param bool hyphens: Whether to normalize all hyphens, minuses and dashes to the ASCII hyphen-minus character.
        :param bool quotes: Whether to normalize all apostrophes, quotes and primes to the ASCII quote character.
        :param bool ellipsis: Whether to normalize ellipses to three full stops.
        :param bool slashes: Whether to normalize slash characters to the ASCII slash character.
        :param bool tildes: Whether to normalize tilde characters to the ASCII tilde character.
        """
        # General unicode / whitespace handling.
        self.form, self.strip, self.collapse = form, strip, collapse
        # Per-character-family switches.
        self.hyphens, self.quotes, self.ellipsis = hyphens, quotes, ellipsis
        self.slashes, self.tildes = slashes, tildes
utils.py 文件源码 项目:plugin.audio.spotify 作者: marcelveldt 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def normalize_string(text):
    """Strip a fixed set of punctuation from ``text`` and NFKD-normalize it.

    ``/`` and ``\\`` become ``-``; ``: < > * ? | ( ) "`` are deleted.
    Surrounding whitespace and then trailing dots are removed before the
    result of ``try_decode(text)`` is passed to ``unicodedata.normalize``.

    NOTE(review): ``try_decode`` is defined outside this block; presumably it
    returns unicode text - confirm against its definition.
    """
    import unicodedata
    # (character, replacement) pairs, applied one after another.
    substitutions = (
        (":", ""),
        ("/", "-"),
        ("\\", "-"),
        ("<", ""),
        (">", ""),
        ("*", ""),
        ("?", ""),
        ('|', ""),
        ('(', ""),
        (')', ""),
        ("\"", ""),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    text = text.strip().rstrip('.')
    return unicodedata.normalize('NFKD', try_decode(text))
bottle.py 文件源码 项目:ynm3k 作者: socrateslee 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def filename(self):
        ''' File name as sent by the client, cleaned up for safe use on a
            file system.

            ``self.raw_filename`` is decoded from UTF-8 if needed (ignoring
            bad bytes), NFKD-decomposed and stripped of non-ASCII characters,
            and cut down to the part after the last path separator
            (backslashes are treated as separators too). Only ASCII letters,
            digits, dashes, underscores and dots survive; whitespace/dash
            runs become a single dash; leading and trailing dots or dashes
            are dropped.

            The result is truncated to 255 characters; an empty result is
            reported as ``'empty'``.
        '''
        text = self.raw_filename
        if not isinstance(text, unicode):
            text = text.decode('utf8', 'ignore')
        decomposed = normalize('NFKD', text)
        # Encoding with 'ignore' drops combining marks and any other
        # non-ASCII character left after decomposition.
        text = decomposed.encode('ASCII', 'ignore').decode('ASCII')
        with_native_sep = text.replace('\\', os.path.sep)
        base = os.path.basename(with_native_sep)
        base = re.sub(r'[^a-zA-Z0-9-_.\s]', '', base).strip()
        base = re.sub(r'[-\s]+', '-', base).strip('.-')
        return base[:255] or 'empty'
workflow.py 文件源码 项目:alfred-workflows 作者: arthurhammer 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def fold_to_ascii(self, text):
        """Return ``text`` with non-ASCII characters folded to ASCII.

        Text that ``isascii`` already accepts is returned untouched.
        Otherwise each character is first swapped for its entry in
        ``ASCII_REPLACEMENTS`` (when it has one), the result is
        NFKD-normalized, and whatever still is not ASCII is dropped.

        .. versionadded:: 1.3

        .. note:: This only works for a subset of European languages.

        :param text: text to convert
        :type text: ``unicode``
        :returns: text containing only ASCII characters
        :rtype: ``unicode``

        """
        if not isascii(text):
            # Explicit replacements first, then generic decomposition.
            replaced = ''.join(ASCII_REPLACEMENTS.get(ch, ch) for ch in text)
            decomposed = unicodedata.normalize('NFKD', replaced)
            return unicode(decomposed.encode('ascii', 'ignore'))
        return text
workflow.py 文件源码 项目:alfred-zebra 作者: r0x73 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def fold_to_ascii(self, text):
        """Fold non-ASCII characters in ``text`` to their closest ASCII form.

        If ``isascii(text)`` is true the input is returned as is. Otherwise
        characters listed in ``ASCII_REPLACEMENTS`` are substituted, the
        string is NFKD-normalized, and any remaining non-ASCII characters
        are discarded during ASCII encoding.

        .. versionadded:: 1.3

        .. note:: This only works for a subset of European languages.

        :param text: text to convert
        :type text: ``unicode``
        :returns: text containing only ASCII characters
        :rtype: ``unicode``

        """
        if isascii(text):
            return text
        # Map known characters explicitly before the generic decomposition.
        mapped_chars = (ASCII_REPLACEMENTS.get(char, char) for char in text)
        normalized = unicodedata.normalize('NFKD', ''.join(mapped_chars))
        ascii_bytes = normalized.encode('ascii', 'ignore')
        return unicode(ascii_bytes)
bottle.py 文件源码 项目:warriorframework 作者: warriorframework 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def filename(self):
        """ Client-side file name, normalized for file system compatibility.

            The value of ``self.raw_filename`` is decoded as UTF-8 when it is
            not unicode yet (invalid bytes are ignored), NFKD-decomposed and
            restricted to ASCII so accents are dropped, then reduced to its
            base name with backslashes treated as path separators.
            Characters other than ASCII letters, digits, dashes, underscores,
            dots and whitespace are deleted; whitespace/dash runs turn into a
            single dash; leading and trailing dots or dashes are stripped.

            At most 255 characters are returned, and ``'empty'`` is returned
            when the cleaned name is empty.
        """
        candidate = self.raw_filename
        if not isinstance(candidate, unicode):
            candidate = candidate.decode('utf8', 'ignore')
        # Decompose, then drop everything that cannot be encoded as ASCII.
        ascii_form = normalize('NFKD', candidate).encode('ASCII', 'ignore')
        candidate = ascii_form.decode('ASCII')
        candidate = os.path.basename(candidate.replace('\\', os.path.sep))
        stripped = re.sub(r'[^a-zA-Z0-9-_.\s]', '', candidate).strip()
        dashed = re.sub(r'[-\s]+', '-', stripped).strip('.-')
        return dashed[:255] or 'empty'
bottle.py 文件源码 项目:warriorframework 作者: warriorframework 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def filename(self):
        """ Normalized, file-system-friendly form of the uploaded file's name.

            Reads ``self.raw_filename``, decodes it from UTF-8 if it is not
            unicode (ignoring undecodable bytes), removes accents and other
            non-ASCII characters via NFKD plus an ASCII round-trip, and keeps
            only the last path component (backslashes count as separators).
            Only ASCII letters, digits, dashes, underscores and dots remain;
            whitespace and dash runs are replaced by one dash; leading and
            trailing dots or dashes are removed.

            The name is cut to 255 characters. ``'empty'`` is returned when
            no characters are left.
        """
        value = self.raw_filename
        if not isinstance(value, unicode):
            value = value.decode('utf8', 'ignore')
        # The 'ignore' handler silently drops combining marks and any other
        # non-ASCII code points produced by the decomposition.
        value = normalize('NFKD', value).encode('ASCII', 'ignore').decode('ASCII')
        value = value.replace('\\', os.path.sep)
        value = os.path.basename(value)
        allowed_only = re.sub(r'[^a-zA-Z0-9-_.\s]', '', value).strip()
        collapsed = re.sub(r'[-\s]+', '-', allowed_only)
        trimmed = collapsed.strip('.-')
        return trimmed[:255] or 'empty'
validators.py 文件源码 项目:touch-pay-client 作者: HackPucBemobi 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def urlify(s, maxlen=80, keep_underscores=False):
    """
    Converts incoming string to a simplified ASCII subset.

    The input is lowercased, NFKD-normalized and passed through
    ``to_native(..., charset='ascii', errors='ignore')``; HTML entities
    (``&name;``) are removed, whitespace becomes hyphens, and runs of
    hyphens/underscores collapse to a single hyphen.

    if (keep_underscores): underscores are retained in the string
    else: underscores are translated to hyphens (default)

    :param s: value to convert; handed to ``to_unicode`` first.
    :param maxlen: maximum length of the returned string (plain slice, applied
        after leading/trailing hyphens are stripped).
    :param keep_underscores: keep single underscores instead of turning them
        into hyphens.
    :returns: the simplified string, at most ``maxlen`` characters long.

    NOTE(review): ``to_unicode`` / ``to_native`` are defined outside this
    block; presumably they convert to unicode text and to the native ``str``
    type respectively - confirm against their definitions.
    """
    # All patterns are raw strings: sequences such as "\w" or "\-" in a
    # normal literal are invalid escapes and trigger warnings on Python 3.
    s = to_unicode(s)                     # to unicode
    s = s.lower()                         # to lowercase
    s = unicodedata.normalize('NFKD', s)  # replace special characters
    s = to_native(s, charset='ascii', errors='ignore')       # encode as ASCII
    s = re.sub(r'&\w+?;', '', s)          # strip html entities
    if keep_underscores:
        s = re.sub(r'\s+', '-', s)        # whitespace to hyphens
        s = re.sub(r'[^\w\-]', '', s)
        # strip all but alphanumeric/underscore/hyphen
    else:
        s = re.sub(r'[\s_]+', '-', s)     # whitespace & underscores to hyphens
        s = re.sub(r'[^a-z0-9\-]', '', s)  # strip all but alphanumeric/hyphen
    s = re.sub(r'[-_][-_]+', '-', s)      # collapse strings of hyphens
    s = s.strip('-')                      # remove leading and trailing hyphens
    return s[:maxlen]                     # enforce maximum length
thesaurus.py 文件源码 项目:ln2sql 作者: FerreroJeremy 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def remove_accents(self, string):
        """Return ``str(string)`` with combining marks (accents) removed.

        The text is NFKD-decomposed so accented letters split into a base
        character plus combining marks, and the combining marks are dropped.
        """
        decomposed = unicodedata.normalize('NFKD', str(string))
        base_chars = (ch for ch in decomposed if unicodedata.combining(ch) == 0)
        return "".join(base_chars)
stopwordFilter.py 文件源码 项目:ln2sql 作者: FerreroJeremy 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def remove_accents(self, string):
        """Strip accents from the string form of ``string``.

        Works by NFKD-decomposing the text and keeping only the characters
        whose Unicode combining class is zero.
        """
        kept = []
        for char in unicodedata.normalize('NFKD', str(string)):
            # Combining marks report a non-zero combining class.
            if not unicodedata.combining(char):
                kept.append(char)
        return "".join(kept)
parser.py 文件源码 项目:ln2sql 作者: FerreroJeremy 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def remove_accents(self, string):
        """Return the text of ``string`` without accent marks.

        ``string`` is converted with ``str``, decomposed to NFKD, and every
        combining character is filtered out of the result.
        """
        text = unicodedata.normalize('NFKD', str(string))
        return "".join(filter(lambda ch: not unicodedata.combining(ch), text))
langConfig.py 文件源码 项目:ln2sql 作者: FerreroJeremy 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def remove_accents(self, string):
        """Drop accents from ``str(string)``.

        After NFKD decomposition, characters with a non-zero combining class
        (the accent marks) are left out of the returned string.
        """
        nfkd_text = unicodedata.normalize('NFKD', str(string))
        without_marks = [ch for ch in nfkd_text if unicodedata.combining(ch) == 0]
        return "".join(without_marks)
__init__.py 文件源码 项目:alelo_ofx 作者: dantas 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def _convert_transaction(transaction):
    """Convert one transaction mapping into a ``STMTTRN`` element.

    Reads the ``date``, ``signal``, ``amount`` and ``description`` keys of
    ``transaction`` and returns whatever ``dict2xml.convert`` produces for
    them. The formatted date is used for both ``DTPOSTED`` and ``FITID``.
    The description is NFD-decomposed and encoded as ASCII with non-ASCII
    characters ignored.
    """
    timestamp = transaction['date'].strftime("%Y%m%d%H%M%S")
    signed_amount = transaction['signal'] + transaction['amount']
    # Decompose first so accented letters keep their ASCII base character.
    memo = unicodedata.normalize('NFD', transaction['description'])
    fields = {
        "DTPOSTED": timestamp,
        "FITID": timestamp,
        "TRNAMT": signed_amount,
        "MEMO": memo.encode('ascii', 'ignore'),
    }
    return dict2xml.convert("STMTTRN", fields)
workflow.py 文件源码 项目:alfred-mpd 作者: deanishe 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def decode(self, text, encoding=None, normalization=None):
        """Return ``text`` as normalised unicode.

        A falsy ``encoding`` falls back to ``self._input_encoding`` and a
        falsy ``normalization`` falls back to ``self._normalizsation``.

        :param text: string
        :type text: encoded or Unicode string. If ``text`` is already a
            Unicode string, it will only be normalised.
        :param encoding: The text encoding to use to decode ``text`` to
            Unicode.
        :type encoding: ``unicode`` or ``None``
        :param normalization: The normalisation form to apply to ``text``.
        :type normalization: ``unicode`` or ``None``
        :returns: decoded and normalised ``unicode``

        :class:`Workflow` uses "NFC" normalisation by default. This is the
        standard for Python and will work well with data from the web (via
        :mod:`~workflow.web` or :mod:`json`).

        OS X, on the other hand, uses "NFD" normalisation (nearly), so data
        coming from the system (e.g. via :mod:`subprocess` or
        :func:`os.listdir`/:mod:`os.path`) may not match. You should either
        normalise this data, too, or change the default normalisation used by
        :class:`Workflow`.

        """
        if not encoding:
            encoding = self._input_encoding
        if not normalization:
            # Attribute name is spelled exactly as this code reads it.
            normalization = self._normalizsation
        if isinstance(text, unicode):
            decoded = text
        else:
            decoded = unicode(text, encoding)
        return unicodedata.normalize(normalization, decoded)
notify.py 文件源码 项目:alfred-mpd 作者: deanishe 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def uni(s):
        """Decode the UTF-8 byte string `s` and return it in NFD form."""
        return normalize('NFD', s.decode('utf-8'))
web.py 文件源码 项目:alfred-mpd 作者: deanishe 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def text(self):
        """Response body decoded to NFC-normalised Unicode.

        When ``self.encoding`` is falsy the body cannot be decoded and
        ``self.content`` is returned unchanged. Otherwise the content is
        decoded with that encoding and normalised to NFC.

        :returns: Body of HTTP response
        :rtype: :class:`unicode` or :class:`str`

        """
        if not self.encoding:
            # No known encoding: hand back the raw content.
            return self.content
        decoded = unicode(self.content, self.encoding)
        return unicodedata.normalize('NFC', decoded)
unicode_utils.py 文件源码 项目:python- 作者: secondtonone1 项目源码 文件源码 阅读 95 收藏 0 点赞 0 评论 0
def decompose(path):
    """Return ``path`` in NFD form, keeping its original string type.

    Text (``six.text_type``) is normalized directly. Anything else is decoded
    as UTF-8, normalized, and re-encoded to UTF-8; if a ``UnicodeError`` is
    raised along the way, the value reached so far is returned (the input
    itself when decoding fails).
    """
    if not isinstance(path, six.text_type):
        result = path
        try:
            result = result.decode('utf-8')
            result = unicodedata.normalize('NFD', result)
            result = result.encode('utf-8')
        except UnicodeError:
            # Not UTF-8: leave the value as it is.
            pass
        return result
    return unicodedata.normalize('NFD', path)
core.py 文件源码 项目:my-first-blog 作者: AnkurBegining 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def check_nfc(label):
    """Raise ``IDNAError`` unless ``label`` is already in NFC form.

    The label is compared with its own NFC normalization; nothing is returned
    when they are equal.
    """
    already_nfc = unicodedata.normalize('NFC', label) == label
    if not already_nfc:
        raise IDNAError('Label must be in Normalization Form C')
core.py 文件源码 项目:my-first-blog 作者: AnkurBegining 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def uts46_remap(domain, std3_rules=True, transitional=False):
    """Re-map the characters in the string according to UTS46 processing.

    Each character of ``domain`` is looked up in ``uts46data``. Depending on
    the row's status and the two flags the character is kept, replaced by the
    row's replacement, dropped (status "I"), or rejected. The remapped text
    is returned in NFC form.

    :param domain: text to remap; processed one character at a time.
    :param std3_rules: apply the STD3 ASCII rules. Status "3" rows (the
        STD3-dependent entries of the UTS #46 table) are rejected when this
        is true and kept/mapped when it is false, as UTS #46 specifies for
        UseSTD3ASCIIRules.
    :param transitional: use transitional processing: status "D" rows are
        replaced instead of kept.
    :returns: the remapped, NFC-normalized string.
    :raises InvalidCodepoint: when a character is not allowed; the message
        reports the code point, its 1-based position and ``domain``.

    NOTE(review): the row layout ``(code_point, status[, replacement])``
    sorted by code point is inferred from the indexing and bisect call below;
    confirm against ``uts46data``.
    """
    from .uts46data import uts46data
    output = u""
    try:
        for pos, char in enumerate(domain):
            code_point = ord(char)
            # Code points below 256 index the table directly; larger ones
            # use the last row that sorts before (code_point, "Z").
            uts46row = uts46data[code_point if code_point < 256 else
                bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
            status = uts46row[1]
            replacement = uts46row[2] if len(uts46row) == 3 else None
            # Status "3" is only acceptable when STD3 rules are NOT enforced.
            if (status == "V" or
                    (status == "D" and not transitional) or
                    (status == "3" and not std3_rules and replacement is None)):
                output += char
            elif replacement is not None and (status == "M" or
                    (status == "3" and not std3_rules) or
                    (status == "D" and transitional)):
                output += replacement
            elif status != "I":
                # IndexError doubles as the "disallowed character" signal so
                # that one handler below also covers failed table lookups.
                raise IndexError()
        return unicodedata.normalize("NFC", output)
    except IndexError:
        raise InvalidCodepoint(
            "Codepoint {0} not allowed at position {1} in {2}".format(
            _unot(code_point), pos + 1, repr(domain)))


问题


面经


文章

微信
公众号

扫码关注公众号