def fold_to_ascii(self, text):
    """Convert non-ASCII characters to closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``
    """
    if isascii(text):
        # Fast path: nothing to fold.
        return text
    # Substitute characters that have a table-driven ASCII replacement.
    replaced = ''.join(ASCII_REPLACEMENTS.get(ch, ch) for ch in text)
    # Decompose what is left and drop any byte outside the ASCII range.
    return unicode(unicodedata.normalize('NFKD', replaced).encode('ascii', 'ignore'))
# Python unicodedata.normalize() usage examples
def clean_filename(filename):
    """Return a sanitized filename (replace / strip out illegal characters)

    :param filename: string used for a filename
    :type filename: str
    :return: sanitized filename
    :rtype: str
    """
    # Whitelist of characters allowed in the result.
    allowed = '-_.() {0}{1}'.format(string.ascii_letters, string.digits)
    # Apply the replacement table first, then decompose accents.
    substituted = ''.join(REPLACEMENT_CHAR.get(ch, ch) for ch in filename)
    decomposed = unicodedata.normalize('NFKD', substituted)
    # Drop combining marks and anything outside the whitelist.
    return ''.join(ch for ch in decomposed
                   if not unicodedata.combining(ch) and ch in allowed)
def filename(self):
    """ Name of the file on the client file system, but normalized to ensure
        file system compatibility. An empty filename is returned as 'empty'.
        Only ASCII letters, digits, dashes, underscores and dots are
        allowed in the final filename. Accents are removed, if possible.
        Whitespace is replaced by a single dash. Leading or tailing dots
        or dashes are removed. The filename is limited to 255 characters.
    """
    raw = self.raw_filename
    if not isinstance(raw, unicode):
        raw = raw.decode('utf8', 'ignore')
    # Decompose accents, then keep only the ASCII part.
    ascii_name = normalize('NFKD', raw).encode('ASCII', 'ignore').decode('ASCII')
    # Defend against Windows-style paths before taking the basename.
    base = os.path.basename(ascii_name.replace('\\', os.path.sep))
    # Strip illegal characters, collapse whitespace/dashes, trim edges.
    base = re.sub(r'[-\s]+', '-',
                  re.sub(r'[^a-zA-Z0-9-_.\s]', '', base).strip()).strip('.-')
    return base[:255] or 'empty'
def fold_to_ascii(self, text):
    """Convert non-ASCII characters to closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``
    """
    if isascii(text):
        # Fast path: nothing to fold.
        return text
    # Substitute characters that have a table-driven ASCII replacement.
    replaced = ''.join(ASCII_REPLACEMENTS.get(ch, ch) for ch in text)
    # Decompose what is left and drop any byte outside the ASCII range.
    return unicode(unicodedata.normalize('NFKD', replaced).encode('ascii', 'ignore'))
def decode_as_string(text, encoding=None):
    """
    Decode the console or file output explicitly using getpreferredencoding.

    The text parameter should be an encoded string; if not, no decode occurs.
    If no encoding is given, getpreferredencoding is used. If encoding is
    specified, that is used instead. This would be needed for SVN --xml
    output. Unicode is explicitly put in composed NFC form.

    --xml should be UTF-8 (SVN Issue 2938); the discussion on the Subversion
    DEV List from 2007 seems to indicate the same.
    """
    # text should be a byte string
    chosen = encoding if encoding is not None else _console_encoding
    if not isinstance(text, unicode):
        text = text.decode(chosen)
    return unicodedata.normalize('NFC', text)
def delete_friends(request):
    """Remove a friend from a user's stored friend list.

    Reads ``username`` and ``friendUsername`` from the POST data, filters
    the matching name out of the user's friend list, saves the list back,
    and returns the updated list as a JSON HTTP response. On any failure
    (missing user, bad JSON, ...) an empty list is returned instead.
    """
    current_username = request.POST.get('username')
    current_friendName = request.POST.get('friendUsername')
    ol = []
    try:
        existingUser = FriendList.objects.get(user__username=current_username)
        user_friends = existingUser.getfoo()
        for c in user_friends:
            # Fold each stored name to plain ASCII before comparing.
            c = unicodedata.normalize('NFKD', c).encode('ascii', 'ignore')
            if c == current_friendName:
                continue  # skip (i.e. delete) the requested friend
            ol.append(c)
        existingUser.friendList = json.dumps(ol)
        existingUser.save()
    except Exception:
        # Was a bare `except:`; Exception keeps the best-effort behavior
        # without also swallowing SystemExit/KeyboardInterrupt.
        ol = []
    return HttpResponse(json.dumps(ol))
def GetLineWidth(line):
    """Determines the width of the line in column positions.

    Args:
      line: A string, which may be a Unicode string.

    Returns:
      The width of the line in column positions, accounting for Unicode
      combining characters and wide characters.
    """
    if not isinstance(line, unicode):
        # Byte strings: one column per byte.
        return len(line)
    width = 0
    for ch in unicodedata.normalize('NFC', line):
        if unicodedata.east_asian_width(ch) in ('W', 'F'):
            width += 2  # wide / fullwidth characters occupy two columns
        elif not unicodedata.combining(ch):
            width += 1  # combining marks take no column of their own
    return width
def append_utf8(self, text):
    """Append NFKD-normalized unicode text to an existing UTF-8 file."""
    try:
        from Naked.toolshed.system import file_exists
        if not file_exists(self.filepath):
            raise IOError("The file specified for the text append does not exist (Naked.toolshed.file.py:append_utf8).")
        import codecs
        import unicodedata
        # NFKD-normalize the unicode data before writing it out.
        with codecs.open(self.filepath, mode='a', encoding="utf_8") as appender:
            appender.write(unicodedata.normalize('NFKD', text))
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: Unable to append text to the file with the append_utf8 method (Naked.toolshed.file.py).")
        raise e
#------------------------------------------------------------------------------
# [ gzip method (writer) ]
# writes data to gzip compressed file
# Note: adds .gz extension to filename if user did not specify it in the FileWriter class constructor
# Note: uses compresslevel = 6 as default to balance speed and compression level (which in general is not significantly less than 9)
# Tests: test_IO.py :: test_file_gzip_ascii_readwrite, test_file_gzip_utf8_readwrite,
# test_file_gzip_utf8_readwrite_explicit_decode
#------------------------------------------------------------------------------
def gzip(self, text, compression_level=6):
    """Write ``text`` to a gzip-compressed file at ``self.filepath``.

    Adds a ``.gz`` extension to the filepath if the caller did not include
    one. ``compression_level`` defaults to 6 to balance speed against
    compression ratio. If the data cannot be written as-is
    (UnicodeEncodeError), it is NFKD-normalized and UTF-8 encoded before a
    second write attempt.
    """
    try:
        import gzip
        if not self.filepath.endswith(".gz"):
            self.filepath = self.filepath + ".gz"
        with gzip.open(self.filepath, 'wb', compresslevel=compression_level) as gzip_writer:
            gzip_writer.write(text)
    except UnicodeEncodeError:  # exception value was bound as `ue` but never used
        import unicodedata
        # NFKD normalization of the unicode data before write
        norm_text = unicodedata.normalize('NFKD', text)
        import codecs
        binary_data = codecs.encode(norm_text, "utf_8")
        with gzip.open(self.filepath, 'wb', compresslevel=compression_level) as gzip_writer:
            gzip_writer.write(binary_data)
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: unable to gzip compress the file with the gzip method (Naked.toolshed.file.py).")
        raise e
#------------------------------------------------------------------------------
# [ write method ]
# Universal text file writer that writes by system default or utf-8 encoded unicode if it throws UnicodeEncodeError
# Tests: test_IO.py :: test_file_ascii_readwrite, test_file_ascii_readwrite_missing_file,
# test_file_utf8_write_raises_unicodeerror
#------------------------------------------------------------------------------
def readlines_utf8(self):
    """Read ``self.filepath`` as UTF-8 and return its lines, each
    NFKD-normalized (line endings preserved)."""
    try:
        import codecs
        import unicodedata
        with codecs.open(self.filepath, encoding='utf-8', mode='r') as uni_reader:
            # NFKD-normalize every line before handing it back.
            return [unicodedata.normalize('NFKD', line) for line in uni_reader]
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: unable to read lines in the unicode file with the readlines_utf8 method (Naked.toolshed.file.py)")
        raise e
#------------------------------------------------------------------------------
# [ read_gzip ] (byte string)
# reads data from a gzip compressed file
# returns the decompressed binary data from the file
# Note: if decompressing unicode file, set encoding="utf-8"
# Tests: test_IO.py :: test_file_gzip_ascii_readwrite, test_file_gzip_utf8_readwrite,
# test_file_read_gzip_missing_file
#------------------------------------------------------------------------------
def read_utf8(self):
    """Read ``self.filepath`` as UTF-8 and return NFKD-normalized text."""
    try:
        import codecs
        reader = codecs.open(self.filepath, encoding='utf_8', mode='r')
    except IOError as ioe:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: Unable to open file for read with read_utf8() method (Naked.toolshed.file.py).")
        raise ioe
    try:
        import unicodedata
        # NFKD normalization of the unicode data before returning it.
        return unicodedata.normalize('NFKD', reader.read())
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: Unable to read the file with UTF-8 encoding using the read_utf8() method (Naked.toolshed.file.py).")
        raise e
    finally:
        reader.close()
def append_utf8(self, text):
    """Append NFKD-normalized unicode text to an existing UTF-8 file."""
    try:
        from Naked.toolshed.system import file_exists
        if not file_exists(self.filepath):
            raise IOError("The file specified for the text append does not exist (Naked.toolshed.file.py:append_utf8).")
        import codecs
        import unicodedata
        # NFKD-normalize the unicode data before writing it out.
        with codecs.open(self.filepath, mode='a', encoding="utf_8") as appender:
            appender.write(unicodedata.normalize('NFKD', text))
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: Unable to append text to the file with the append_utf8 method (Naked.toolshed.file.py).")
        raise e
#------------------------------------------------------------------------------
# [ gzip method (writer) ]
# writes data to gzip compressed file
# Note: adds .gz extension to filename if user did not specify it in the FileWriter class constructor
# Note: uses compresslevel = 6 as default to balance speed and compression level (which in general is not significantly less than 9)
# Tests: test_IO.py :: test_file_gzip_ascii_readwrite, test_file_gzip_utf8_readwrite,
# test_file_gzip_utf8_readwrite_explicit_decode
#------------------------------------------------------------------------------
def gzip(self, text, compression_level=6):
    """Write ``text`` to a gzip-compressed file at ``self.filepath``.

    Adds a ``.gz`` extension to the filepath if the caller did not include
    one. ``compression_level`` defaults to 6 to balance speed against
    compression ratio. If the data cannot be written as-is
    (UnicodeEncodeError), it is NFKD-normalized and UTF-8 encoded before a
    second write attempt.
    """
    try:
        import gzip
        if not self.filepath.endswith(".gz"):
            self.filepath = self.filepath + ".gz"
        with gzip.open(self.filepath, 'wb', compresslevel=compression_level) as gzip_writer:
            gzip_writer.write(text)
    except UnicodeEncodeError:  # exception value was bound as `ue` but never used
        import unicodedata
        # NFKD normalization of the unicode data before write
        norm_text = unicodedata.normalize('NFKD', text)
        import codecs
        binary_data = codecs.encode(norm_text, "utf_8")
        with gzip.open(self.filepath, 'wb', compresslevel=compression_level) as gzip_writer:
            gzip_writer.write(binary_data)
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: unable to gzip compress the file with the gzip method (Naked.toolshed.file.py).")
        raise e
#------------------------------------------------------------------------------
# [ write method ]
# Universal text file writer that writes by system default or utf-8 encoded unicode if it throws UnicodeEncodeError
# Tests: test_IO.py :: test_file_ascii_readwrite, test_file_ascii_readwrite_missing_file,
# test_file_utf8_write_raises_unicodeerror
#------------------------------------------------------------------------------
def write_utf8(self, text):
    """Write ``text`` to ``self.filepath`` as UTF-8, NFKD-normalized."""
    try:
        import codecs
        writer = codecs.open(self.filepath, encoding='utf_8', mode='w')
    except IOError as ioe:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: Unable to open file for write with the write_utf8() method (Naked.toolshed.file.py).")
        raise ioe
    try:
        import unicodedata
        # NFKD normalization of the unicode data before write.
        writer.write(unicodedata.normalize('NFKD', text))
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: Unable to write UTF-8 encoded text to file with the write_utf8() method (Naked.toolshed.file.py).")
        raise e
    finally:
        writer.close()
#------------------------------------------------------------------------------
# [ FileReader class ]
# reads data from local files
# filename assigned in constructor (inherited from IO class interface)
#------------------------------------------------------------------------------
def readlines_utf8(self):
    """Read ``self.filepath`` as UTF-8 and return its lines, each
    NFKD-normalized (line endings preserved)."""
    try:
        import codecs
        import unicodedata
        with codecs.open(self.filepath, encoding='utf-8', mode='r') as uni_reader:
            # NFKD-normalize every line before handing it back.
            return [unicodedata.normalize('NFKD', line) for line in uni_reader]
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: unable to read lines in the unicode file with the readlines_utf8 method (Naked.toolshed.file.py)")
        raise e
#------------------------------------------------------------------------------
# [ read_gzip ] (byte string)
# reads data from a gzip compressed file
# returns the decompressed binary data from the file
# Note: if decompressing unicode file, set encoding="utf-8"
# Tests: test_IO.py :: test_file_gzip_ascii_readwrite, test_file_gzip_utf8_readwrite,
# test_file_read_gzip_missing_file
#------------------------------------------------------------------------------
def read_utf8(self):
    """Read ``self.filepath`` as UTF-8 and return NFKD-normalized text."""
    try:
        import codecs
        reader = codecs.open(self.filepath, encoding='utf_8', mode='r')
    except IOError as ioe:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: Unable to open file for read with read_utf8() method (Naked.toolshed.file.py).")
        raise ioe
    try:
        import unicodedata
        # NFKD normalization of the unicode data before returning it.
        return unicodedata.normalize('NFKD', reader.read())
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: Unable to read the file with UTF-8 encoding using the read_utf8() method (Naked.toolshed.file.py).")
        raise e
    finally:
        reader.close()
def fold_to_ascii(self, text):
    """Convert non-ASCII characters to closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``
    """
    if isascii(text):
        # Fast path: nothing to fold.
        return text
    # Substitute characters that have a table-driven ASCII replacement.
    replaced = ''.join(ASCII_REPLACEMENTS.get(ch, ch) for ch in text)
    # Decompose what is left and drop any byte outside the ASCII range.
    return unicode(unicodedata.normalize('NFKD', replaced).encode('ascii', 'ignore'))
def fold_to_ascii(self, text):
    """Convert non-ASCII characters to closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``
    """
    if isascii(text):
        # Fast path: nothing to fold.
        return text
    # Substitute characters that have a table-driven ASCII replacement.
    replaced = ''.join(ASCII_REPLACEMENTS.get(ch, ch) for ch in text)
    # Decompose what is left and drop any byte outside the ASCII range.
    return unicode(unicodedata.normalize('NFKD', replaced).encode('ascii', 'ignore'))
def sanitize_separators(value):
    """
    Sanitizes a value according to the current decimal and
    thousand separator setting. Used with form field input.
    """
    if not (settings.USE_L10N and isinstance(value, six.string_types)):
        return value
    parts = []
    decimal_separator = get_format('DECIMAL_SEPARATOR')
    if decimal_separator in value:
        # Split off the decimals; they are re-joined with '.' at the end.
        value, decimals = value.split(decimal_separator, 1)
        parts.append(decimals)
    if settings.USE_THOUSAND_SEPARATOR:
        thousand_sep = get_format('THOUSAND_SEPARATOR')
        suspect_decimal_dot = (
            thousand_sep == '.' and value.count('.') == 1 and
            len(value.split('.')[-1]) != 3
        )
        # Special case where we suspect a dot meant decimal separator (see #22171)
        if not suspect_decimal_dot:
            # Strip both the raw separator and its NFKD-normalized form
            # (e.g. non-breaking space vs. plain space).
            for replacement in {thousand_sep,
                                unicodedata.normalize('NFKD', thousand_sep)}:
                value = value.replace(replacement, '')
    parts.append(value)
    return '.'.join(reversed(parts))
def chars(self, num, truncate=None, html=False):
    """
    Returns the text truncated to be no longer than the specified number
    of characters.

    Takes an optional argument of what should be used to notify that the
    string has been truncated, defaulting to a translatable string of an
    ellipsis (...).
    """
    length = int(num)
    text = unicodedata.normalize('NFC', self._wrapped)

    # Budget for the text itself: max length minus the visible
    # (non-combining) length of the truncation suffix.
    truncate_len = length
    for ch in self.add_truncation_text('', truncate):
        if unicodedata.combining(ch):
            continue
        truncate_len -= 1
        if not truncate_len:
            break

    if html:
        return self._truncate_html(length, truncate, text, truncate_len, False)
    return self._text_chars(length, truncate, text, truncate_len)
def getLcdPiconName(serviceName):
    """Locate the LCD picon PNG for a service reference, trying several fallbacks."""
    # Remove the path and name fields, and replace ':' by '_'.
    sname = '_'.join(GetWithAlternative(serviceName).split(':', 10)[:10])
    pngname = findLcdPicon(sname)
    if pngname:
        return pngname
    fields = sname.split('_', 3)
    if len(fields) > 2 and fields[2] != '1':
        fields[2] = '1'  # fallback to 1 for services with different service types
    if len(fields) > 0 and fields[0] != '1':
        fields[0] = '1'  # fallback to 1 for other reftypes
    pngname = findLcdPicon('_'.join(fields))
    if pngname:
        return pngname
    # Last resort: look the picon up by sanitized channel name.
    channel = ServiceReference(serviceName).getServiceName()
    channel = unicodedata.normalize('NFKD', unicode(channel, 'utf_8', errors='ignore')).encode('ASCII', 'ignore')
    channel = channel.replace('&', 'and').replace('+', 'plus').replace('*', 'star').lower()
    channel = re.sub('[^a-z0-9]', '', channel)
    if len(channel) > 0:
        pngname = findLcdPicon(channel)
        if not pngname and len(channel) > 2 and channel.endswith('hd'):
            # Try again without a trailing 'hd' suffix.
            pngname = findLcdPicon(channel[:-2])
    return pngname
def getPiconLName(serviceName):
    """Locate the wide ("L") picon PNG for a service reference.

    Mirrors getLcdPiconName: try the full service reference, then a
    normalized reference with a fallback service type, and finally a
    lookup by sanitized channel name.
    """
    # remove the path and name fields, and replace ':' by '_'
    sname = '_'.join(GetWithAlternative(serviceName).split(':', 10)[:10])
    pngname = findPiconL(sname)
    if not pngname:
        fields = sname.split('_', 3)
        if len(fields) > 2 and fields[2] != '2':
            # fallback to 1 for tv services with nonstandard servicetypes
            fields[2] = '1'
        pngname = findPiconL('_'.join(fields))
    if not pngname:  # picon by channel name
        name = ServiceReference(serviceName).getServiceName()
        name = unicodedata.normalize('NFKD', unicode(name, 'utf_8', errors='ignore')).encode('ASCII', 'ignore')
        excludeChars = ['/', '\\', '\'', '"', '`', '?', ' ', '(', ')', ':', '<', '>', '|', '.', '\n']
        name = re.sub('[%s]' % ''.join(excludeChars), '', name)
        name = name.replace('&', 'and')
        name = name.replace('+', 'plus')
        name = name.replace('*', 'star')
        name = name.lower()
        if len(name) > 0:
            # Fix: these lookups previously called findPicon(), which searches
            # the regular picon set; use findPiconL() so the channel-name
            # fallback stays within the L-picon set like the lookups above
            # (consistent with getLcdPiconName's use of findLcdPicon).
            pngname = findPiconL(name)
            if not pngname and len(name) > 2 and name.endswith('hd'):
                pngname = findPiconL(name[:-2])
    return pngname
def weekday_portuguese_to_english(string):
    """Translate a Portuguese weekday name or abbreviation to the English
    weekday name. Accents, case, dashes and trailing text after the first
    word are ignored. Returns None for unrecognized input."""
    token = string.lower().strip().replace("-", " ")
    # Strip diacritics: NFD decomposition, then drop combining marks.
    token = ''.join(c for c in unicodedata.normalize('NFD', token)
                    if unicodedata.category(c) != 'Mn')
    token = token.replace(",", " ").split(" ")[0]
    mapping = {
        u"dom": "Sunday", u"domingo": "Sunday",
        u"seg": "Monday", u"segunda": "Monday", u"segunda-feira": "Monday",
        u"ter": "Tuesday", u"terca": "Tuesday", u"terça": "Tuesday",
        u"terca-feira": "Tuesday", u"terça-feira": "Tuesday",
        u"qua": "Wednesday", u"quarta": "Wednesday", u"quarta-feira": "Wednesday",
        u"qui": "Thursday", u"quinta": "Thursday", u"quinta-feira": "Thursday",
        u"sex": "Friday", u"sexta": "Friday", u"sexta-feira": "Friday",
        u"sab": "Saturday", u"sáb": "Saturday",
        u"sabado": "Saturday", u"sábado": "Saturday",
    }
    return mapping.get(token)
def normalize_string(text):
    '''normalize string, strip all special chars'''
    # Characters that are dropped outright.
    for ch in (":", "<", ">", "*", "?", "|", "(", ")", "\""):
        text = text.replace(ch, "")
    # Path separators become dashes.
    text = text.replace("/", "-").replace("\\", "-")
    text = text.strip().rstrip('.')
    if not isinstance(text, unicode):
        text = text.decode("utf-8")
    return unicodedata.normalize('NFKD', text)
def to_unicode(source, encoding="utf-8", param="value"):
    """Helper to normalize input to unicode.

    :arg source:
        source bytes/unicode to process.
    :arg encoding:
        encoding to use when decoding bytes instances.
    :param param:
        optional name of variable/noun to reference when raising errors.
    :raises TypeError: if source is not unicode or bytes.
    :returns:
        * returns unicode strings unchanged.
        * returns bytes strings decoded using *encoding*
    """
    assert encoding
    if isinstance(source, unicode):
        return source
    if isinstance(source, bytes):
        return source.decode(encoding)
    raise ExpectedStringError(source, param)
def slugify(value, allow_unicode=False):
    """Slugify string to make it a valid filename.

    Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
    Remove characters that aren't alphanumerics, underscores, or hyphens.
    Also strip leading and trailing whitespace.
    """
    import unicodedata
    value = str(value)
    if allow_unicode:
        # Keep non-ASCII characters, but put them in composed form.
        value = unicodedata.normalize('NFKC', value)
    else:
        # Decompose accents, then keep only the ASCII part.
        decomposed = unicodedata.normalize('NFKD', value)
        value = decomposed.encode('ascii', 'ignore').decode('ascii')
    cleaned = re.sub(r'[^\w\s-]', '', value).strip()
    # Collapse whitespace/dash runs into single hyphens.
    return re.sub(r'[-\s]+', '-', cleaned)
# Below from
# http://stackoverflow.com/questions/2333872/atomic-writing-to-file-with-python
def fold_to_ascii(self, text):
    """Convert non-ASCII characters to closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``
    """
    if isascii(text):
        # Fast path: nothing to fold.
        return text
    # Substitute characters that have a table-driven ASCII replacement.
    replaced = ''.join(ASCII_REPLACEMENTS.get(ch, ch) for ch in text)
    # Decompose what is left and drop any byte outside the ASCII range.
    return unicode(unicodedata.normalize('NFKD', replaced).encode('ascii', 'ignore'))
async def charinfo(self, ctx, *, chars):
    """Get unicode character info.

    Fix: the body uses ``await``, which is a SyntaxError inside a plain
    ``def`` — the coroutine must be declared ``async def`` (discord.py
    commands are coroutines).
    """
    if not chars:
        return
    chars = unicodedata.normalize('NFC', chars)
    if len(chars) > 25:
        await ctx.send('Too many emoji.')
        return
    embed = discord.Embed()
    for char in chars:
        uc = hex(ord(char))[2:]
        name = unicodedata.name(char, 'unknown')
        # Render space-like characters visibly instead of as blank output.
        if name in {'SPACE', 'EM QUAD', 'EN QUAD'} or ' SPACE' in name:
            char = '" "'
        # \uXXXX for BMP codepoints, \UXXXXXXXX for astral ones.
        short = len(uc) <= 4
        code = f'`\\{"u" if short else "U"}{uc.lower().zfill(4 if short else 8)}`'
        embed.add_field(name=name,
                        value=f'{char} [{code}](http://www.fileformat.info/info/unicode/char/{uc}/index.htm)')
    await ctx.send(embed=embed)
def fold_to_ascii(self, text):
    """Convert non-ASCII characters to closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``
    """
    if isascii(text):
        # Fast path: nothing to fold.
        return text
    # Substitute characters that have a table-driven ASCII replacement.
    replaced = ''.join(ASCII_REPLACEMENTS.get(ch, ch) for ch in text)
    # Decompose what is left and drop any byte outside the ASCII range.
    return unicode(unicodedata.normalize('NFKD', replaced).encode('ascii', 'ignore'))
def fold_to_ascii(self, text):
    """Convert non-ASCII characters to closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``
    """
    if isascii(text):
        # Fast path: nothing to fold.
        return text
    # Substitute characters that have a table-driven ASCII replacement.
    replaced = ''.join(ASCII_REPLACEMENTS.get(ch, ch) for ch in text)
    # Decompose what is left and drop any byte outside the ASCII range.
    return unicode(unicodedata.normalize('NFKD', replaced).encode('ascii', 'ignore'))