def __call__(self, unicode_text):
    '''
    Runs the parser.
    Args:
        unicode_text: a unicode document
    Returns:
        text: An ascii equivalent of unicode_text
    '''
    ascii_text = unidecode.unidecode(unicode_text)
    return ascii_text
# if __name__ == "__main__":
# text = u"?-Helix ?-sheet ?? ?? ?? ?? ?? ??"
# parser = unidecoder()
# print(parser(text))
# Example source code using python unidecode() (translated scraper artifact)
def normalize(self, s):
    '''
    Normalize text: lowercase, then apply the configured steps
    (ASCII folding, line rejoining, hyphen removal, special-character
    removal, custom substitutions, stemming) and collapse whitespace.
    '''
    text = s.strip().lower()
    if self.to_ascii:
        # Fold non-ASCII characters to their closest ASCII equivalents.
        text = unidecode(text)
    if self.rejoin_lines:
        # Re-join words hyphenated across a line break.
        text = re.sub(r'(\w-)\s*\n\s*', r'\1', text, flags=_RE_FLAGS)
    if self.remove_hyphens:
        # Drop hyphens that sit between letters.
        text = re.sub(r'([^\W\d_])-+(?=[^\W\d_])', r'\1', text, flags=_RE_FLAGS)
    if self.remove_specials:
        # Strip punctuation runs not embedded inside a word.
        text = re.sub(r'(\D|^)([^\w\s]|_)+(?=\D|$)', r'\1 ', text,
                      flags=_RE_FLAGS)
        text = re.sub(r'(\w)([^\w\s]|_)+\s+', r'\1 ', text, flags=_RE_FLAGS)
        text = re.sub(r'\s+([^\w\s]|_)+(?=\w)', r'\1 ', text, flags=_RE_FLAGS)
    for pat, repl in self.subs:
        text = re.sub(pat, repl, text, flags=_RE_FLAGS)
    if self._stemmer:
        def _stem(match):
            return self._stemmer.stem(match.group())
        text = re.sub(r'([^\W\d_]|-)+', _stem, text, flags=_RE_FLAGS)
    # Collapse all runs of whitespace to single spaces.
    text = re.sub(r'\s+', ' ', text, flags=_RE_FLAGS)
    return text.strip()
def get_type(self, text):
    """Classify a Romanian legal document title.

    Args:
        text: the document title (may contain diacritics).

    Returns:
        One of 'OM', 'LEGE', 'HG', 'OUG', 'OG', or None when no known
        document type is recognized.
    """
    text = unidecode(text).lower().strip()
    doc_type = None
    # Only the part before 'pentru'/'privind' names the document type.
    # The original called .start() on a possibly-None match (AttributeError
    # when absent) and mistook a match at position 0 for "no match".
    stop_match = re.search(r'(pentru|privind)', text)
    if stop_match:
        text = text[:stop_match.start()]
    if re.search(r'ordin', text):
        doc_type = 'OM'
    if re.search(r'lege', text):
        doc_type = 'LEGE'
    if re.search(r'hotarare', text):
        doc_type = 'HG'
    if re.search(r'ordonanta', text):
        doc_type = 'OUG' if re.search(r'urgenta', text) else 'OG'
    return doc_type
def get_feedback_date(self, text):
    """Extract the feedback deadline ('data limita ...') from *text*.

    Args:
        text: document text, possibly containing diacritics.

    Returns:
        A ``datetime.datetime`` for the first parseable deadline, or
        ``None`` when no deadline is found or no format matches.
    """
    # NOTE(review): '%d %B %Y' parses locale-dependent month names; Romanian
    # month names only match under a Romanian locale — confirm.
    formats = ['%d %B %Y', '%d.%m.%Y']
    text = unidecode(text.strip().lower())
    date_pattern = r'(\d\d?\.\d\d?\.20\d\d)|(\d\d?\s[a-z]+\s20\d\d)'
    phrase = re.search(r'data limita.*(' + date_pattern + r')', text)
    if not phrase:
        return None
    date_match = re.search(date_pattern, phrase.group(0))
    if not date_match:
        return None
    date_str = date_match.group(0)
    for fmt in formats:  # 'fmt', not 'format': avoid shadowing the builtin
        try:
            # strptime never returns a falsy value, so return directly.
            return datetime.datetime.strptime(date_str, fmt)
        except ValueError:
            pass
    return None
def get_type(self, text):
    """Identify the type of a Romanian legal document from its title.

    Returns one of 'OM', 'LEGE', 'HG', 'OUG', 'OG', or None.
    """
    text = unidecode(text).lower().strip()
    # Everything after 'pentru'/'privind' is a description, not the type.
    marker = re.search(r'(pentru|privind)', text)
    if marker:
        text = text[:marker.start()]
    kind = None
    if re.search(r'ordin', text):
        kind = 'OM'
    if re.search(r'lege', text):
        kind = 'LEGE'
    if re.search(r'hotarare', text):
        kind = 'HG'
    if re.search(r'ordonanta', text):
        kind = 'OUG' if re.search(r'urgenta', text) else 'OG'
    return kind
def __init__(self, audio_dir=os.curdir, audio_rate=11025, mod_path=os.curdir,
             name=None, play_key='F8', relay_key='=', use_aliases=True):
    """
    Args:
        audio_dir (str): Path for finding audio.
        audio_rate (int): The sample rate the game accepts.
        mod_path (str): Path to the mod folder (e.g. "Steam/SteamApps/common/Team Fortress 2/tf2")
        name (str): The name of the game.
        play_key (str): The key used to start/stop music in-game.
        relay_key (str): The key used to interact with the game.
        use_aliases (bool): Whether or not to use aliases to select songs in-game.
    """
    self.audio_dir = audio_dir
    self.audio_rate = audio_rate
    self.mod_path = mod_path
    # Fold the game name to ASCII. The original crashed on the default
    # name=None (unidecode requires a string); pass None through instead.
    self.name = unidecode.unidecode(name) if name is not None else None
    # Fall back to safe defaults when a configured key cannot be bound.
    self.play_key = play_key if bindable(play_key) else "F8"
    self.relay_key = relay_key if bindable(relay_key) else "="
    self.use_aliases = use_aliases
def output_preprocessed_data(self, json_input, file_name):
    '''
    Append preprocessed, POS-tagged sentences to a file, one per line.
    :param json_input: json formatted data generated from function str_process
        (expects a 'sentences' list whose items carry 'tokens' with
        'originalText' and 'pos' fields)
    :param file_name: output file name, created under self.output_folder
    :return: None
    '''
    rows = [
        " ".join(token['originalText'] + "/" + token['pos']
                 for token in sent['tokens'])
        for sent in json_input['sentences']
    ]
    # os.path.join instead of manual '/' concatenation.
    output_file_path = os.path.join(self.output_folder, file_name)
    with open(output_file_path, 'a') as preprocessed_out:
        for row in rows:
            # Fold to ASCII so the output file is encoding-safe.
            preprocessed_out.write(unidecode.unidecode(row) + "\n")
def parseToJsonStr(self, metadata: dict) -> Optional[str]:
    """Map raw *metadata* onto the configured fields and serialize them.

    :return: json string or None if no matching non-empty metadata found
    """
    result = {}
    for field, candidate_keys in self.__rulesDict.items():
        for candidate in candidate_keys:
            if candidate not in metadata:
                continue
            value = metadata.get(candidate)
            if len(value) > 0:
                result[field.value] = unidecode(value)
                # First non-empty candidate wins; skip the rest.
                break
    return json.dumps(result) if len(result) > 0 else None
def similar_users(user):
    """Return users similar to *user*, ranked by an SVD over shared files.

    On the first call for a user, appends their file list to
    ./dc_recom.dat and marks them as recommended in the DB; then (re)trains
    the SVD model on the accumulated data file.
    """
    if not isinstance(user, str):
        # Fold non-str (e.g. unicode) user names to ASCII.
        user = unidecode.unidecode(user)
    # NOTE(review): find_one() returns None for unknown users, which would
    # raise TypeError below — confirm callers guarantee the user exists.
    if db.done_users.find_one({'user': user})['recommended'] == False:
        user_files = db.user_list.find({'user': user})
        # Context manager guarantees the handle is closed even on error
        # (the original leaked it if a write raised).
        with open('./dc_recom.dat', 'a') as f:
            for u in user_files:
                f.write(u['user'] + '::' + u['tth'])
                f.write('\n')
        db.done_users.update({'user': user}, {'user': user, 'recommended': True})
    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col': 1, 'row': 0})
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000, min_values=0, pre_normalize=None, mean_center=False,
                post_normalize=True)
    return [i[0] for i in svd.similar(user)]
def initialCheckName(self, name):
    """Guess gender from gender-specific words in a (transliterated) name.

    Checks the first two words of *name* against the instance's
    male/female word lists.

    Returns:
        ('male', 1) or ('female', 1) on a word-list hit, else (None, 0).
    """
    # If the name is written in Cyrillic or Greek script, transliterate it.
    # (These were bare string statements in the original — no-op
    # expressions, not comments.)
    if only_cyrillic_chars(name) or only_greek_chars(name):
        name = unidecode(name)
    # Check the first, then the second word; the slice also guards against
    # the empty-name IndexError the original had on name.split()[0].
    for word in name.split()[:2]:
        if word in self.maleWords:
            return ('male', 1)
        if word in self.femaleWords:
            return ('female', 1)
    return (None, 0)
def get_statements_by_person(self, first_name, last_name, limit=0):
    """
    Get statements and ratings by name.
    @param first_name: of MoC
    @param last_name: of MoC
    @param limit: optional limit
    @return: statements
    """
    # A non-positive limit falls back to 10.
    if limit <= 0:
        limit = 10
    # Names are lowercased and ASCII-folded to build the URL slug.
    path = ("statements/truth-o-meter/people/{first_name}-{last_name}/"
            "json/?n={limit}").format(first_name=unidecode(first_name.lower()),
                                      last_name=unidecode(last_name.lower()),
                                      limit=limit)
    results = self._get(path)
    return results if results else []
def descarga(full_name):
    """Upload *full_name* to file.io and return the direct-download link.

    Retries until file.io returns a JSON payload; a response containing
    '<html>' indicates a failed upload.
    """
    url = 'https://file.io/?expires=1w'
    print("\n\tSubiendo archivo a 'file.io'")
    link = None
    attempt = 0
    while link is None:  # For ensuring that the file is uploaded correctly
        # Re-open the file on every attempt: the original reused one handle,
        # so after a failed POST the stream was already consumed and every
        # retry uploaded an empty file (and the handle was never closed).
        with open(full_name, 'rb') as upload:
            response = requests.post(url, files={'file': upload})
        test = response.text
        print("JSON recibido: ", test)
        # Decode text to avoid 'bytes' problems (b'<meta...).
        decoded = unidecode(test)
        print("JSON decodificado: ", decoded)
        if '<html>' in decoded:
            # When upload fails, 'file.io' sends a message with <html> header.
            print("\n\tFallo al subir el archivo. Reintentando... #", attempt)
            link = None
            attempt += 1
        else:
            json_data = json.loads(decoded)
            link = json_data['link']
    print("\n\nEnlace de descarga directa: ", link)
    return link
def fuzzy_match_strings(ref, val):
    """
    Returns the matching score of two values.

    Empty inputs score 0; if either value resolves to a Q-id, only an
    exact Q-id match scores (100); otherwise the score is the symmetric
    average of fuzzy token-sort ratios on ASCII-folded, lowercased text.
    """
    if not ref or not val:
        return 0
    ref_q = to_q(ref)
    val_q = to_q(val)
    if ref_q or val_q:
        return 100 if ref_q == val_q else 0
    simplified_val = unidecode(val).lower()
    simplified_ref = unidecode(ref).lower()
    # Return symmetric score: average both directions. The original
    # overwrote r2 with r1, discarding the reverse-direction ratio.
    r1 = fuzz.token_sort_ratio(simplified_val, simplified_ref)
    r2 = fuzz.token_sort_ratio(simplified_ref, simplified_val)
    return int(0.5 * (r1 + r2))
async def emoji(self, context):
    ''' Sends a text and replace letters with regional indicators.

    'ri'/'bi' invocations prepend the author's name; 'riz' is owner-only;
    'bi' maps b/B/p/P to the :b: emoji; the invoking message is deleted
    for 'ri'/'riz'/'bi' when permissions allow.
    '''
    # NOTE(review): declared `async def` here — the original plain `def`
    # is a SyntaxError given the `await` expressions below; presumably the
    # `async` keyword was lost in extraction. Confirm against the repo.
    from unidecode import unidecode
    content = self.bot.get_text(context)
    if content in [None, '', ' '] or context.invoked_with == 'riz' and not self.bot.is_owner(context.message.author):
        return
    msg = ''
    if context.invoked_with in ['ri', 'bi']:
        msg += '`{}`: '.format(context.message.author)
    for c in content:
        if c.isalpha():
            b = context.invoked_with == 'bi' and c in ['b', 'B', 'p', 'P']
            if b:
                msg += ':b:'
            else:
                # ASCII-fold accented letters so the indicator name is valid.
                msg += ':regional_indicator_{}:'.format(unidecode(c.lower()))
        else:
            msg += c
    await self.bot.say(msg)
    await self.bot.replied(context)
    if context.invoked_with in ['ri', 'riz', 'bi']:
        try:
            await self.bot.delete_message(context.message)
        except discord.errors.Forbidden:
            # Best effort: lacking permission to delete is not an error.
            pass
def validate_folder(self):
    """Validates whether a folder can be created.

    Performs two types of validation:
    1. Checks if a DB entry is present.
    2. Checks if a physical folder exists in the system.

    Raises:
        ValidationError: code='db' when a DB entry exists, code='os' when
            a directory with the same name already exists on disk.
    """
    # Replace non-ASCII characters with '_' so the path is filesystem-safe.
    unicoded_title = "".join((i if ord(i) < 128 else '_') for i in unidecode(self.title))
    parent_folder = self.folder
    # The original duplicated the two checks in each branch; only the DB
    # filter and the base path differ between root and nested folders.
    if parent_folder:
        db_query = ImageFolder.objects.filter(folder=parent_folder, title=self.title)
        base_path = parent_folder.path
    else:
        db_query = ImageFolder.objects.filter(folder__isnull=True, title=self.title)
        base_path = IMAGES_FOLDER_NAME
    if db_query.count() > 0:
        raise ValidationError("Folder exists in the DB!", code='db')
    folder_path = os.path.join(settings.MEDIA_ROOT, base_path, unicoded_title)
    if os.path.isdir(folder_path):
        raise ValidationError("Folder exists in the OS!", code='os')
def get_upload_to(self, filename):
    """Build the storage path for *filename*, ASCII-safe and length-limited.

    Non-ASCII characters are replaced with '_' to sidestep issues with
    filesystem encoding, and the filename is truncated so the full path
    fits the 100-character field limit
    (https://code.djangoproject.com/ticket/9893).
    """
    filename = self.file.field.storage.get_valid_name(filename)
    # Transliterate, then replace remaining non-ASCII characters with '_'.
    filename = "".join((i if ord(i) < 128 else '_') for i in unidecode(filename))
    # The original duplicated this join in both branches, before and after
    # trimming; compute the base directory once instead.
    base_dir = self.folder.path if self.folder else IMAGES_FOLDER_NAME
    full_path = os.path.join(base_dir, filename)
    if len(full_path) >= 95:
        # Trim the stem (keeping the extension) so the path drops below 95.
        chars_to_trim = len(full_path) - 94
        prefix, extension = os.path.splitext(filename)
        filename = prefix[:-chars_to_trim] + extension
        full_path = os.path.join(base_dir, filename)
    return full_path
def get_members_missing(members_current, members_current_check):
    """Return display strings for checked members absent from members_current.

    Members are compared on ASCII-folded (surname incl. prefix, forename)
    pairs.
    """
    # Decode the current members once up front; the original re-ran
    # unidecode on every member for every checked member (O(n*m) decodes).
    current_names = set()
    for member in members_current:
        current_names.add((
            unidecode(member.person.surname_including_prefix()),
            unidecode(member.person.forename),
        ))
    members_missing = []
    for member_check in members_current_check:
        key = (unidecode(member_check['name']), unidecode(member_check['forename']))
        if key not in current_names:
            members_missing.append(
                member_check['initials'] + ' ' + member_check['name'] + ' (' + member_check['forename'] + ')')
    return members_missing
def get_members_incorrect(members_current, members_current_check):
    """Return members of members_current with no match in members_current_check.

    Members are compared on ASCII-folded (surname incl. prefix, forename)
    pairs; the inverse of get_members_missing.
    """
    # Decode the reference list once up front; the original re-ran
    # unidecode on every checked entry for every member (O(n*m) decodes).
    check_names = set()
    for member_check in members_current_check:
        check_names.add((
            unidecode(member_check['name']),
            unidecode(member_check['forename']),
        ))
    members_incorrect = []
    for member in members_current:
        key = (unidecode(member.person.surname_including_prefix()),
               unidecode(member.person.forename))
        if key not in check_names:
            members_incorrect.append(member)
    return members_incorrect
def find_party(name):
    """Look up a PoliticalParty by full name, then by short name.

    Each field is tried against the raw name plus three variants:
    ASCII-folded, 'Lid-'-prefixed, and with dashes replaced by spaces.
    Returns the first match, or None (with a warning) when nothing matches.
    """
    variants = [name, unidecode(name), 'Lid-' + name, name.replace('-', ' ')]
    for field in ('name__iexact', 'name_short__iexact'):
        matches = PoliticalParty.objects.filter(**{field: variants[0]})
        for variant in variants[1:]:
            matches = matches | PoliticalParty.objects.filter(**{field: variant})
        if matches.exists():
            return matches[0]
    logger.warning('party not found: ' + name)
    return None