def filename(self):
''' Name of the file on the client file system, but normalized to ensure
file system compatibility. An empty filename is returned as 'empty'.
Only ASCII letters, digits, dashes, underscores and dots are
allowed in the final filename. Accents are removed, if possible.
Whitespace is replaced by a single dash. Leading or tailing dots
or dashes are removed. The filename is limited to 255 characters.
'''
fname = self.raw_filename
if not isinstance(fname, unicode):
fname = fname.decode('utf8', 'ignore')
fname = normalize('NFKD', fname).encode('ASCII', 'ignore').decode('ASCII')
fname = os.path.basename(fname.replace('\\', os.path.sep))
fname = re.sub(r'[^a-zA-Z0-9-_.\s]', '', fname).strip()
fname = re.sub(r'[-\s]+', '-', fname).strip('.-')
return fname[:255] or 'empty'
python类normalize()的实例源码
def __init__(self, form='NFKC', strip=True, collapse=True, hyphens=False, quotes=False, ellipsis=False,
slashes=False, tildes=False):
"""
:param string form: Normal form for unicode normalization.
:param bool strip: Whether to strip whitespace from start and end.
:param bool collapse: Whether to collapse all whitespace (tabs, newlines) down to single spaces.
:param bool hyphens: Whether to normalize all hyphens, minuses and dashes to the ASCII hyphen-minus character.
:param bool quotes: Whether to normalize all apostrophes, quotes and primes to the ASCII quote character.
:param bool ellipsis: Whether to normalize ellipses to three full stops.
:param bool slashes: Whether to normalize slash characters to the ASCII slash character.
:param bool tildes: Whether to normalize tilde characters to the ASCII tilde character.
"""
self.form = form
self.strip = strip
self.collapse = collapse
self.hyphens = hyphens
self.quotes = quotes
self.ellipsis = ellipsis
self.slashes = slashes
self.tildes = tildes
def normalize_string(text):
import unicodedata
text = text.replace(":", "")
text = text.replace("/", "-")
text = text.replace("\\", "-")
text = text.replace("<", "")
text = text.replace(">", "")
text = text.replace("*", "")
text = text.replace("?", "")
text = text.replace('|', "")
text = text.replace('(', "")
text = text.replace(')', "")
text = text.replace("\"", "")
text = text.strip()
text = text.rstrip('.')
text = unicodedata.normalize('NFKD', try_decode(text))
return text
def filename(self):
''' Name of the file on the client file system, but normalized to ensure
file system compatibility. An empty filename is returned as 'empty'.
Only ASCII letters, digits, dashes, underscores and dots are
allowed in the final filename. Accents are removed, if possible.
Whitespace is replaced by a single dash. Leading or tailing dots
or dashes are removed. The filename is limited to 255 characters.
'''
fname = self.raw_filename
if not isinstance(fname, unicode):
fname = fname.decode('utf8', 'ignore')
fname = normalize('NFKD', fname).encode('ASCII', 'ignore').decode('ASCII')
fname = os.path.basename(fname.replace('\\', os.path.sep))
fname = re.sub(r'[^a-zA-Z0-9-_.\s]', '', fname).strip()
fname = re.sub(r'[-\s]+', '-', fname).strip('.-')
return fname[:255] or 'empty'
def fold_to_ascii(self, text):
"""Convert non-ASCII characters to closest ASCII equivalent.
.. versionadded:: 1.3
.. note:: This only works for a subset of European languages.
:param text: text to convert
:type text: ``unicode``
:returns: text containing only ASCII characters
:rtype: ``unicode``
"""
if isascii(text):
return text
text = ''.join([ASCII_REPLACEMENTS.get(c, c) for c in text])
return unicode(unicodedata.normalize('NFKD',
text).encode('ascii', 'ignore'))
def fold_to_ascii(self, text):
"""Convert non-ASCII characters to closest ASCII equivalent.
.. versionadded:: 1.3
.. note:: This only works for a subset of European languages.
:param text: text to convert
:type text: ``unicode``
:returns: text containing only ASCII characters
:rtype: ``unicode``
"""
if isascii(text):
return text
text = ''.join([ASCII_REPLACEMENTS.get(c, c) for c in text])
return unicode(unicodedata.normalize('NFKD',
text).encode('ascii', 'ignore'))
def filename(self):
""" Name of the file on the client file system, but normalized to ensure
file system compatibility. An empty filename is returned as 'empty'.
Only ASCII letters, digits, dashes, underscores and dots are
allowed in the final filename. Accents are removed, if possible.
Whitespace is replaced by a single dash. Leading or tailing dots
or dashes are removed. The filename is limited to 255 characters.
"""
fname = self.raw_filename
if not isinstance(fname, unicode):
fname = fname.decode('utf8', 'ignore')
fname = normalize('NFKD', fname).encode('ASCII', 'ignore').decode('ASCII')
fname = os.path.basename(fname.replace('\\', os.path.sep))
fname = re.sub(r'[^a-zA-Z0-9-_.\s]', '', fname).strip()
fname = re.sub(r'[-\s]+', '-', fname).strip('.-')
return fname[:255] or 'empty'
def filename(self):
""" Name of the file on the client file system, but normalized to ensure
file system compatibility. An empty filename is returned as 'empty'.
Only ASCII letters, digits, dashes, underscores and dots are
allowed in the final filename. Accents are removed, if possible.
Whitespace is replaced by a single dash. Leading or tailing dots
or dashes are removed. The filename is limited to 255 characters.
"""
fname = self.raw_filename
if not isinstance(fname, unicode):
fname = fname.decode('utf8', 'ignore')
fname = normalize('NFKD', fname)
fname = fname.encode('ASCII', 'ignore').decode('ASCII')
fname = os.path.basename(fname.replace('\\', os.path.sep))
fname = re.sub(r'[^a-zA-Z0-9-_.\s]', '', fname).strip()
fname = re.sub(r'[-\s]+', '-', fname).strip('.-')
return fname[:255] or 'empty'
def urlify(s, maxlen=80, keep_underscores=False):
"""
Converts incoming string to a simplified ASCII subset.
if (keep_underscores): underscores are retained in the string
else: underscores are translated to hyphens (default)
"""
s = to_unicode(s) # to unicode
s = s.lower() # to lowercase
s = unicodedata.normalize('NFKD', s) # replace special characters
s = to_native(s, charset='ascii', errors='ignore') # encode as ASCII
s = re.sub('&\w+?;', '', s) # strip html entities
if keep_underscores:
s = re.sub('\s+', '-', s) # whitespace to hyphens
s = re.sub('[^\w\-]', '', s)
# strip all but alphanumeric/underscore/hyphen
else:
s = re.sub('[\s_]+', '-', s) # whitespace & underscores to hyphens
s = re.sub('[^a-z0-9\-]', '', s) # strip all but alphanumeric/hyphen
s = re.sub('[-_][-_]+', '-', s) # collapse strings of hyphens
s = s.strip('-') # remove leading and trailing hyphens
return s[:maxlen] # enforce maximum length
def remove_accents(self, string):
nkfd_form = unicodedata.normalize('NFKD', str(string))
return "".join([c for c in nkfd_form if not unicodedata.combining(c)])
def remove_accents(self, string):
nkfd_form = unicodedata.normalize('NFKD', str(string))
return "".join([c for c in nkfd_form if not unicodedata.combining(c)])
def remove_accents(self, string):
nkfd_form = unicodedata.normalize('NFKD', str(string))
return "".join([c for c in nkfd_form if not unicodedata.combining(c)])
def remove_accents(self, string):
nkfd_form = unicodedata.normalize('NFKD', str(string))
return "".join([c for c in nkfd_form if not unicodedata.combining(c)])
def _convert_transaction(transaction):
date = transaction['date'].strftime("%Y%m%d%H%M%S")
return dict2xml.convert("STMTTRN", {
"DTPOSTED": date,
"FITID": date,
"TRNAMT": transaction['signal'] + transaction['amount'],
"MEMO": unicodedata.normalize('NFD',
transaction['description']).encode('ascii', 'ignore'),
})
def decode(self, text, encoding=None, normalization=None):
"""Return ``text`` as normalised unicode.
If ``encoding`` and/or ``normalization`` is ``None``, the
``input_encoding``and ``normalization`` parameters passed to
:class:`Workflow` are used.
:param text: string
:type text: encoded or Unicode string. If ``text`` is already a
Unicode string, it will only be normalised.
:param encoding: The text encoding to use to decode ``text`` to
Unicode.
:type encoding: ``unicode`` or ``None``
:param normalization: The nomalisation form to apply to ``text``.
:type normalization: ``unicode`` or ``None``
:returns: decoded and normalised ``unicode``
:class:`Workflow` uses "NFC" normalisation by default. This is the
standard for Python and will work well with data from the web (via
:mod:`~workflow.web` or :mod:`json`).
OS X, on the other hand, uses "NFD" normalisation (nearly), so data
coming from the system (e.g. via :mod:`subprocess` or
:func:`os.listdir`/:mod:`os.path`) may not match. You should either
normalise this data, too, or change the default normalisation used by
:class:`Workflow`.
"""
encoding = encoding or self._input_encoding
normalization = normalization or self._normalizsation
if not isinstance(text, unicode):
text = unicode(text, encoding)
return unicodedata.normalize(normalization, text)
def uni(s):
"""Coerce `s` to normalised Unicode."""
ustr = s.decode('utf-8')
return normalize('NFD', ustr)
def text(self):
"""Unicode-decoded content of response body.
If no encoding can be determined from HTTP headers or the content
itself, the encoded response body will be returned instead.
:returns: Body of HTTP response
:rtype: :class:`unicode` or :class:`str`
"""
if self.encoding:
return unicodedata.normalize('NFC', unicode(self.content,
self.encoding))
return self.content
def decompose(path):
if isinstance(path, six.text_type):
return unicodedata.normalize('NFD', path)
try:
path = path.decode('utf-8')
path = unicodedata.normalize('NFD', path)
path = path.encode('utf-8')
except UnicodeError:
pass # Not UTF-8
return path
def check_nfc(label):
if unicodedata.normalize('NFC', label) != label:
raise IDNAError('Label must be in Normalization Form C')
def uts46_remap(domain, std3_rules=True, transitional=False):
"""Re-map the characters in the string according to UTS46 processing."""
from .uts46data import uts46data
output = u""
try:
for pos, char in enumerate(domain):
code_point = ord(char)
uts46row = uts46data[code_point if code_point < 256 else
bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
status = uts46row[1]
replacement = uts46row[2] if len(uts46row) == 3 else None
if (status == "V" or
(status == "D" and not transitional) or
(status == "3" and std3_rules and replacement is None)):
output += char
elif replacement is not None and (status == "M" or
(status == "3" and std3_rules) or
(status == "D" and transitional)):
output += replacement
elif status != "I":
raise IndexError()
return unicodedata.normalize("NFC", output)
except IndexError:
raise InvalidCodepoint(
"Codepoint {0} not allowed at position {1} in {2}".format(
_unot(code_point), pos + 1, repr(domain)))