def deaccent(text):
    """
    Remove accentuation from the given string.

    Input text is either a unicode string or utf8 encoded bytestring.
    Return input string with accents removed, as unicode.

    Raises UnicodeDecodeError when a byte string is not valid utf8 (strict
    decoding is used).

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    # type(u'') is the text type on both Python 2 (unicode) and Python 3
    # (str), so no externally defined `unicode`/`u()` shim is required.
    text_type = type(u'')
    if not isinstance(text, text_type):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    # NFD splits accented characters into a base character followed by
    # combining marks; dropping category 'Mn' removes those marks.
    norm = unicodedata.normalize("NFD", text)
    result = u''.join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    # Recompose whatever is left into canonical composed form.
    return unicodedata.normalize("NFC", result)
python类category()的实例源码
def check_initial_combiner(label):
    """Ensure *label* does not start with a combining mark.

    Returns True when the first character's Unicode general category is
    not a mark category (one starting with 'M'); raises IDNAError otherwise.
    The first character is read unconditionally, so an empty label raises
    IndexError.
    """
    first_category = unicodedata.category(label[0])
    if first_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
def normalize(title):
    """Best-effort conversion of *title* to an accent-free byte string.

    A title that decodes as pure ASCII is returned re-encoded as UTF-8.
    Otherwise the title is decoded as UTF-8, decomposed with NFKD, stripped
    of nonspacing marks (category 'Mn') and converted with ``str()``.

    Any ordinary failure along the way (undecodable input, characters that
    ``str()`` cannot convert, an object without ``decode``) results in
    *title* being returned unchanged. Only ``Exception`` subclasses are
    swallowed, so KeyboardInterrupt and SystemExit still propagate.

    NOTE(review): relies on the ``unicode`` name, i.e. Python 2 or a
    compatibility alias defined elsewhere -- confirm.
    """
    try:
        try:
            # Fast path: plain ASCII needs no accent stripping.
            return title.decode('ascii').encode("utf-8")
        except Exception:
            pass
        decomposed = unicodedata.normalize('NFKD', unicode(title.decode('utf-8')))
        stripped = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
        return str(stripped)
    except Exception:
        # Deliberate best-effort: fall back to the untouched input.
        return title
def check_initial_combiner(label):
    """Validate that the leading character of *label* is not a combining mark.

    Raises IDNAError when the general category of ``label[0]`` begins with
    'M'; otherwise returns True. Indexing an empty label raises IndexError.
    """
    leading = label[0]
    is_mark = unicodedata.category(leading)[:1] == 'M'
    if is_mark:
        raise IDNAError('Label begins with an illegal combining character')
    return True
def check_initial_combiner(label):
    """Return True unless *label* begins with a combining character.

    The Unicode general category of the first character is inspected; a
    category starting with 'M' (mark) raises IDNAError. An empty label
    raises IndexError because the first character is always accessed.
    """
    if not unicodedata.category(label[0]).startswith('M'):
        return True
    raise IDNAError('Label begins with an illegal combining character')
def pinyinify(string):
    """Turn *string* into a lowercase, hyphen-separated pinyin form.

    Steps, in order:
    1. Every character whose Unicode general category starts with P
       (punctuation), S (symbol), Z (separator) or C (other/control) is
       replaced by '-'.
    2. Every remaining character whose Unicode name starts with 'CJK' is
       replaced by ``pinyin.get(char, format='strip')`` followed by '-'.
    3. Runs of '-' are collapsed to a single '-'.
    4. The result is passed through ``pinyin.get(..., delimiter='',
       format='strip')`` and lower-cased.

    NOTE(review): the exact output of ``pinyin.get`` comes from the external
    ``pinyin`` package -- confirm against its documentation.
    """
    # Classify only the characters actually present instead of building a
    # translation table for the whole Unicode range on every call. This also
    # covers U+10FFFF, which range(sys.maxunicode) used to skip.
    string = ''.join(
        '-' if unicodedata.category(ch).startswith(('P', 'S', 'Z', 'C')) else ch
        for ch in string
    )
    # Snapshot the CJK characters first: `string` is rebound inside the loop.
    for char in [x for x in string if unicodedata.name(x).startswith('CJK')]:
        string = string.replace(char, pinyin.get(char, format='strip') + '-')
    # Raw string avoids the invalid-escape warning of '\-+'.
    string = re.sub(r'\-+', '-', string)
    return pinyin.get(string, delimiter='', format='strip').lower()
def check_initial_combiner(label):
    """Check the first character of *label* for an illegal combining mark.

    Returns True if the first character is acceptable. Raises IDNAError if
    its Unicode general category starts with 'M'. Raises IndexError for an
    empty label, since ``label[0]`` is evaluated without a length check.
    """
    category = unicodedata.category(label[0])
    starts_with_mark = category[0] == 'M'
    if starts_with_mark:
        raise IDNAError('Label begins with an illegal combining character')
    return True
def check_initial_combiner(label):
    """Ensure *label* does not start with a combining mark.

    Returns True when the first character's Unicode general category is
    not a mark category (one starting with 'M'); raises IDNAError otherwise.
    The first character is read unconditionally, so an empty label raises
    IndexError.
    """
    head_category = unicodedata.category(label[0])
    if head_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
def check_initial_combiner(label):
    """Validate that the leading character of *label* is not a combining mark.

    Raises IDNAError when the general category of ``label[0]`` begins with
    'M'; otherwise returns True. Indexing an empty label raises IndexError.
    """
    opener = label[0]
    if unicodedata.category(opener)[:1] != 'M':
        return True
    raise IDNAError('Label begins with an illegal combining character')
def replace_punctuation(text, sub):
    """Return *text* with each punctuation character replaced by *sub*.

    A character counts as punctuation when its Unicode general category is
    one of Pc, Pd, Ps, Pe, Pi, Pf or Po. All other characters are kept
    as-is. The pieces are joined into a single unicode string.
    """
    punctuation_categories = {'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po'}
    return u"".join(
        sub if unicodedata.category(symbol) in punctuation_categories else symbol
        for symbol in text
    )
# from http://stackoverflow.com/a/22238613/596939
def sanitize(name):
    """Strip non-printable characters and '@' from *name*.

    *name* is first converted with ``unicode()``. A character is kept only
    if its Unicode general category is in the allowed set below (letters,
    numbers, punctuation, symbols and space separators) and it is not '@'.
    The filtered text is passed through ``utf8()`` and returned.

    NOTE(review): ``utf8`` is defined outside this block; presumably it
    encodes to UTF-8 bytes -- confirm.
    """
    allowed_categories = {
        'Lu', 'Ll', 'Lm', 'Lo',
        'Nd', 'Nl', 'No',
        'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po',
        'Sm', 'Sc', 'Sk', 'So',
        'Zs',
    }
    text = unicode(name)
    kept = [
        ch for ch in text
        if ch != '@' and unicodedata.category(ch) in allowed_categories
    ]
    return utf8(''.join(kept))
def Plaintext(self):
    """Render the font's name records as aligned plain-text lines.

    Iterates FontFile.NAME_CODES ordered by code and, for every code
    present in ``self.font._names``, emits one line of the form
    ``<category right-aligned to 15 columns>: <name>``. Returns the
    concatenated lines, or an empty string if no code is present.
    """
    ordered_codes = sorted(FontFile.NAME_CODES.items(),
                           key=lambda entry: entry[1])
    lines = [
        '%15s: %s\n' % (category, self.font._names[code])
        for category, code in ordered_codes
        if code in self.font._names
    ]
    return ''.join(lines)
def XetexBody(self):
    """Render the font's name records as table rows for XeTeX.

    Iterates FontFile.NAME_CODES ordered by code and, for every code
    present in ``self.font._names``, emits a row ``<category> & <name> \\\\``
    where the name has been passed through TexEscape. Returns the
    concatenated rows, or an empty string if no code is present.
    """
    ordered_codes = sorted(FontFile.NAME_CODES.items(),
                           key=lambda entry: entry[1])
    rows = [
        '%s & %s \\\\\n' % (category, TexEscape(self.font._names[code]))
        for category, code in ordered_codes
        if code in self.font._names
    ]
    return ''.join(rows)
def _double_width_char_count(word):
dw_count = 0
for char in word:
if _unicode_data.category(char) in _double_width_type:
dw_count += 1
return dw_count
def check_initial_combiner(label):
    """Return True unless *label* begins with a combining character.

    The Unicode general category of the first character is inspected; a
    category starting with 'M' (mark) raises IDNAError. An empty label
    raises IndexError because the first character is always accessed.
    """
    initial_category = unicodedata.category(label[0])
    if initial_category[0] != 'M':
        return True
    raise IDNAError('Label begins with an illegal combining character')
def check_initial_combiner(label):
    """Check the first character of *label* for an illegal combining mark.

    Returns True if the first character is acceptable. Raises IDNAError if
    its Unicode general category starts with 'M'. Raises IndexError for an
    empty label, since ``label[0]`` is evaluated without a length check.
    """
    first_char = label[0]
    if unicodedata.category(first_char).startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
def check_initial_combiner(label):
    """Ensure *label* does not start with a combining mark.

    Returns True when the first character's Unicode general category is
    not a mark category (one starting with 'M'); raises IDNAError otherwise.
    The first character is read unconditionally, so an empty label raises
    IndexError.
    """
    major_class = unicodedata.category(label[0])[:1]
    if major_class == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True
def _make_unctrl_map():
    """Build the table mapping characters 0-255 to a displayable form.

    The returned dict is filled in this order (later steps overwrite
    earlier ones):
    1. characters whose general category is not 'C*' map to themselves;
    2. code points 0-31 map to caret notation ('^@', '^A', ...);
    3. TAB maps to four spaces and DEL (octal 177) maps to '^?';
    4. any code point below 256 that is still unmapped maps to a
       backslash followed by its three-digit octal value.

    NOTE(review): uses the Python 2 builtins ``unichr``/``unicode`` and
    byte-string keys for TAB and DEL; under Python 2 those keys compare
    equal to the corresponding unicode characters -- confirm the target
    interpreter.
    """
    uc_map = {}
    # Printable (non-control) characters are shown unchanged.
    for c in map(unichr, range(256)):
        if unicodedata.category(c)[0] != 'C':
            uc_map[c] = c
    # Control characters 0-31: '^' plus the letter 64 positions higher
    # (ord('A') - 1 == ord('@'), so 0 -> '^@', 1 -> '^A', ...).
    for i in range(32):
        c = unichr(i)
        uc_map[c] = '^' + unichr(ord('A') + i - 1)
    # The value was a single space, contradicting the stated intent.
    uc_map[b'\t'] = '    '  # display TABs as 4 characters
    uc_map[b'\177'] = unicode('^?')
    # Everything left over is rendered as an octal escape.
    for i in range(256):
        c = unichr(i)
        if c not in uc_map:
            uc_map[c] = unicode('\\%03o') % i
    return uc_map
def _my_unctrl(c, u=_make_unctrl_map()):
    """Return a displayable representation of the character *c*.

    *u* is the lookup table; its default is built once, when this function
    is defined, by calling _make_unctrl_map(). Characters found in the table
    return the mapped value. Characters outside the table are returned as a
    backslash-u escape with at least four hex digits if their general
    category starts with 'C', and unchanged otherwise.
    """
    if c not in u:
        # Not covered by the table: escape control-like characters only.
        if unicodedata.category(c).startswith('C'):
            return b'\u%04x' % ord(c)
        return c
    return u[c]
def is_punct(text):
    """Return True when every character of *text* is punctuation.

    A character is punctuation when its Unicode general category starts
    with 'P'. The first non-punctuation character ends the scan with False;
    an empty *text* yields True.
    """
    return all(
        unicodedata.category(symbol).startswith('P')
        for symbol in text
    )