def createData():
    # Spanish and English vocabularies come from the NLTK corpora; the
    # Japanese list keeps only JEITA words whose transliteration is non-empty
    # and starts with a lowercase letter
    spwords = [unidecode(a.lower()) for a in set(nltk.corpus.cess_esp.words()) if len(a) > 3]
    enwords = [a.lower() for a in set(nltk.corpus.brown.words()) if len(a) > 3]
    jpwords = [unidecode(a) for a in jeita.words() if (len(unidecode(a)) and unidecode(a)[0].islower())]
    jpwords = [a for a in set(jpwords) if len(a) > 3]
# minLen = min(len(enwords), len(spwords), len(jpwords))
featuresets = \
[(createTupleDict(w,numChars),'English') for w in enwords] + \
[(createTupleDict(w,numChars),'Spanish') for w in spwords] + \
[(createTupleDict(w,numChars),'Japanese') for w in jpwords]
random.shuffle(featuresets)
l=int(len(featuresets)*0.8)
training_set = featuresets[:l]
testing_set = featuresets[l:]
return (training_set, testing_set)
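# A minimal usage sketch (not part of the original snippet): train an NLTK
# Naive Bayes classifier on the split returned above. It assumes the same
# module-level createTupleDict() helper and numChars setting that
# createData() itself relies on.
def demoLanguageClassifier():
    training_set, testing_set = createData()
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    print(nltk.classify.accuracy(classifier, testing_set))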
def normalize_title(title):
if not title:
return ""
    # keep just the first 500 characters
response = title[0:500]
# lowercase
response = response.lower()
# deal with unicode
response = unidecode(unicode(response))
# has to be before remove_punctuation
# the kind in titles are simple <i> etc, so this is simple
response = clean_html(response)
# remove articles and common prepositions
response = re.sub(ur"\b(the|a|an|of|to|in|for|on|by|with|at|from)\b", u"", response)
# remove everything except alphas
response = remove_everything_but_alphas(response)
return response
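# Worked example (hypothetical title, assuming the clean_html and
# remove_everything_but_alphas helpers behave as their names suggest):
# accents fold to ASCII, tags and stop words drop out, and only letters
# survive, so near-duplicate titles normalize to the same key.
#
#   normalize_title(u"The Théorie of <i>Everything</i>")
#   # -> u"theorieeverything"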
def make_bib_key(self, db=None):
"""
Generate the BibTeX key for this entry from BibTeX data
"""
first_author = self.persons["author"][0]
last_name = "".join(first_author.last_names)
last_name = unidecode(last_name)
last_name = re.sub(r"[ {}`'\"\\]", "", last_name)
year = self.fields["year"]
journal = self.ads_record.get_bibstem()
bibkey = "".join([last_name, year, journal])
if db and db.exists_key(bibkey):
num = 2
while db.exists_key(bibkey+str(num)):
num += 1
bibkey += str(num)
logger.info("Generated BibTeX key: {0}".format(bibkey))
self.bibkey = bibkey
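# The collision handling above as a standalone sketch, with a plain set
# standing in for the database's exists_key() lookup (hypothetical helper,
# for illustration only):
def unique_bibkey(base, existing_keys):
    key, num = base, 2
    while key in existing_keys:
        key = base + str(num)
        num += 1
    return key
# unique_bibkey("Smith2020ApJ", {"Smith2020ApJ", "Smith2020ApJ2"}) -> "Smith2020ApJ3"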
def scrape_thread_list(self, threads, count):
for t in threads['data']:
extra_params = (('&since=' + self.since) if self.since else '') + (('&until=' + self.until) if self.until else '')
url = self.build_url('{}/messages?fields=from,created_time,message,shares,attachments&limit=400' + extra_params, t['id'])
print("GET", unidecode.unidecode(t['participants']['data'][0]['name']), t['id'])
thread = self.scrape_thread(url, [])
if thread:
self.writer.writerow({
# 'page_id': t['participants']['data'][1]['id'],
# 'page_name': t['participants']['data'][1]['name'],
# 'user_id': t['participants']['data'][0]['id'],
# 'user_name': t['participants']['data'][0]['name'],
'url': t['link'],
})
id_map = {p['id']: p['name'] for p in t['participants']['data']}
for message in reversed(thread):
message['from'] = id_map[message['from_id']]
self.writer.writerow(message)
    next_page = threads.get('paging', {}).get('next', '')
    if next_page and count > 1:
        self.scrape_thread_list(requests.get(next_page).json(), count - 1)
def test_objects(id, al=3, name=u"Default"):
logger.info("Preparing to test the results for %s (%s/%s)", clean(name), al, id)
if forceTrue:
if id == forceTrueID:
logger.error("Overriding test for %s", forceTrueID)
return True
    testOB = build_object(id, al, name)
if track.within(testOB):
logger.info(u"Track is within %s (%s/%s) place.BBOX(%s)/track.BBOX(%s)", clean(name), al, id, testOB.bounds, track.bounds )
print u"Within {0} ({3}) ({2}/{1})".format(name, id, al, unidecode(unicode(clean(name))))
return True
elif track.intersects(testOB):
logger.info(u"Track intersects with %s (%s/%s) place.BBOX(%s)/track.BBOX(%s)", clean(name), al, id, testOB.bounds, track.bounds )
print u"Intersects {0} ({3}) ({2}/{1})".format(name, id, al, unidecode(unicode(clean(name))))
return True
logger.info("Rejecting %s (%s/%s) place.BBOX(%s)/track.BBOX(%s)!!!", clean(name), al, id, testOB.bounds, track.bounds )
return False
def get_students(csv_file):
"""
:param csv_file: csv file with list of students.\
Each row contains: first_name, last_name, email
:type csv_file: str
:rtype: 2 lists existing_students and new_students [[username, email], ..]
"""
with open(csv_file) as ff:
reader = csv.reader(ff, delimiter=',')
existing_students = []
new_students = []
for i, row in enumerate(reader):
row = [unidecode(x.strip()) for x in row[:3]]
username = "_".join(row[:2])
username = username.replace(" ", "_")
email = row[2]
try:
u = User.objects.get(username=username)
Student.objects.get(user=u)
existing_students.append([u.username, u.email])
except ObjectDoesNotExist:
new_students.append([username, email])
return existing_students, new_students
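# The username normalization step in isolation (example values): unidecode
# folds accents, so "José García" and "Jose Garcia" yield the same username.
row = [unidecode(x.strip()) for x in [u' José ', u'García', u'jose@example.com']]
username = "_".join(row[:2]).replace(" ", "_")  # -> 'Jose_Garcia'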
def http_quote(string):
"""
Given a unicode string, will do its dandiest to give you back a
valid ascii charset string you can use in, say, http headers and the
like.
"""
if isinstance(string, six.text_type):
try:
import unidecode
except ImportError:
pass
else:
string = unidecode.unidecode(string)
string = string.encode('ascii', 'replace')
# Wrap in double-quotes for ; , and the like
string = string.replace(b'\\', b'\\\\').replace(b'"', b'\\"')
return '"{0!s}"'.format(string.decode())
def close_words(W, X, labels, top_n=6):
    '''
    Find the words closest to each cluster label.

    W is a gensim Word2Vec model,
    X is the matrix of document vectors, and
    labels are the predetermined cluster labels.
    '''
L = []
for label in np.unique(labels):
label_idx = labels == label
mu = X[label_idx].mean(axis=0)
dist = W.wv.syn0.dot(mu)
idx = np.argsort(dist)[::-1][:top_n]
words = [W.wv.index2word[i] for i in idx]
L.append(' '.join(words))
# Map unicode to simple ASCII
L = map(unidecode, L)
    # Remove the PHRASE_ markers
L = map(lambda x: x.replace('PHRASE_', ''), L)
return L
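# A hedged usage sketch: W is a trained gensim Word2Vec model and X the
# matching matrix of document vectors (both hypothetical here); KMeans is
# one way to produce the predetermined labels the function expects.
#
#   from sklearn.cluster import KMeans
#   labels = KMeans(n_clusters=10).fit_predict(X)
#   for line in close_words(W, X, labels, top_n=6):
#       print(line)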
def make_filename(string):
"""
Turn a string into something that can be safely used as a file or directory
name.
:param string: The string to convert.
:return: The sanitised string.
:raises ValueError: If string is None.
"""
if string is None:
raise ValueError('String cannot be None')
safe = [' ', '.', '_', '-', '\'']
joined = ''.join([c for c in unidecode.unidecode(string)
if c.isalnum() or c in safe]).strip()
if not joined:
raise ValueError('Filename would be empty')
return joined
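# Quick check of the rules above (accents transliterate, and anything
# outside alphanumerics plus space, dot, underscore, hyphen and apostrophe
# is dropped):
assert make_filename(u"Résumé: 2021/final?") == "Resume 2021final"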
def reset_groups_conf(self, group_name=None):
if group_name and isinstance(group_name, str) and group_name != '':
if not isinstance(group_name, unicode):
group_name = group_name.decode(sg.DEFAULT_CHARSET)
flat_name = filter(str.isalnum, unidecode.unidecode(group_name.lower()))
        sg.logger.info('Resetting conf for group %s...' % flat_name)
try:
group = sg.db.session.query(GROUP).filter(GROUP.flat_name == flat_name).one()
self.__push_group_conf(group, True)
except NoResultFound as e:
            sg.logger.warning('No group %s, aborting conf reset...' % flat_name)
else:
        sg.logger.info('Resetting conf for all groups...')
groups = sg.db.session.query(GROUP).all()
for group in groups:
self.__push_group_conf(group, True)
# Routine for pushing conf to a group
def _create_field(self, record_node, field, data):
    if data is None:
        return
l_field = field.lower()
if l_field in adif_field:
if adif_field[l_field] == 'D':
tmp_data = data.strftime('%Y%m%d')
elif adif_field[l_field] == 'T':
tmp_data = data.strftime('%H%M%S')
elif adif_field[l_field] == 'B':
tmp_data = 'Y' if data else 'N'
else:
tmp_data = str(data)
if l_field in adif_rev_utf_field:
record_node.appendChild(self._create_node(adif_rev_utf_field[l_field], tmp_data))
record_node.appendChild(self._create_node(l_field, unidecode(tmp_data)))
elif l_field.startswith('app_'):
tmp_data = str(data)
record_node.appendChild(self._create_node(l_field, tmp_data))
else:
raise WriteError('unknown field: \'%s\'' % l_field)
def series_to_ascii(series):
"""Change columns to lowercase strings inplace.
Arguments:
series (pandas.Series): series to be modified.
Returns:
pandas.Series: series with lowercase and no symbols.
"""
warnings.warn("Function will be deprecated because it is not used.",
category=DeprecationWarning)
series = series.copy(True)
series = series.apply(unidecode)
series = series.str.lower()
series = series.str.replace('[^a-zA-Z0-9_]', '_')
return series
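# Example: accents fold to ASCII and every non-alphanumeric character
# becomes an underscore (the pandas import is added here for the demo).
import pandas as pd
print(series_to_ascii(pd.Series([u'Café au lait!'])))  # 0    cafe_au_lait_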
def scan(self):
log.info('Cronos extract: %s', self.path_name)
target_dir = os.environ.get('CRONOS_OUTDIR')
if target_dir is None:
log.warning('No CRONOS_OUTDIR is set.')
return
sub_dir = slugify(unidecode(self.path_name), '_')
target_dir = os.path.join(target_dir, sub_dir)
    try:
        os.makedirs(target_dir)
    except OSError:
        # the target directory may already exist
        pass
try:
parse(self.real_path, target_dir)
except Exception as ex:
log.exception(ex)
def professors_handler(bot, update):
msg = update.message.text
msg = msg.split(' ')
if len(msg)>=2:
professor_name = unidecode(" ".join(msg[1:]))
if len(professor_name)>3:
search_result = [professor for professor in professors if professor_name.upper() in professor['Nome'].upper()]
if len(search_result)>0:
bot.sendMessage(update.message.chat_id, text='Sono stati trovati %d professori '\
'con la tua ricerca' % len(search_result))
descr=""
for p in search_result:
descr += "Nome: %s\nQualifica: %s\nDipartimento: %s\n" % (p['Nome'], p['Qualifica'], p['Dipartimento'])
descr+= "Indirizzo: %s\nEmail: %s\nTelefono: %s\n" % (p['Indirizzo'], p['Email'], p['Telefono'])
descr+= "Sito: %s\nSSD: %s\n\n" % (p['Sito'], p['SSD'])
bot.sendMessage(update.message.chat_id,text= descr)
else:
bot.sendMessage(update.message.chat_id, text='Professore non trovato')
else:
bot.sendMessage(update.message.chat_id, text='Inserisci almeno 4 caratteri per la ricerca')
else:
bot.sendMessage(update.message.chat_id, text="Devi inserire il professore su cui ottenere informazioni!\n/prof <nome cognome>")
def classroom_handler(bot, update):
msg = update.message.text
msg = msg.split(' ')
if len(msg)==2:
insegnamento_name=unidecode(" ".join(msg[1:]))
if len(insegnamento_name)>3:
search_result=[insegnamento for insegnamento in classrooms if insegnamento_name.upper() in insegnamento['Nome'].upper()]
if len(search_result)>0:
bot.sendMessage(update.message.chat_id, text='Sono stati trovati %d insegnamenti con la tua ricerca' % len(search_result))
descr=""
for m in search_result:
doc=''.join([docente+'\n' for docente in m['Docenti']])
descr += "Nome: %s\nSemestre: %s\nCorso di Laurea: %s\n" % (m['Nome'], m['Semestre'], m['Corso di Laurea'])
descr+= "Anno: %s\nDocenti: %s\nSSD: %s\n" % (m['Anno'], doc, m['SSD'])
descr+= "CFU: %s\n\n" % (m['CFU'])
bot.sendMessage(update.message.chat_id, text=descr)
else:
bot.sendMessage(update.message.chat_id, text='Insegnamento non trovato')
else:
bot.sendMessage(update.message.chat_id, text='Inserisci almeno 4 caratteri per la ricerca')
else:
bot.sendMessage(update.message.chat_id, text="Devi inserire l'insegnamento su cui ottenere informazioni!\n/insegnamento <nome>")
def courses_handler(bot,update):
msg = update.message.text
msg = msg.split(' ')
if len(msg)==2:
nome_corso = unidecode(msg[1])
if len(nome_corso)>3:
search_result = [corso for corso in courses if nome_corso.upper() in corso['Denominazione'].upper()]
if len(search_result)>0:
bot.sendMessage(update.message.chat_id, text='Sono stati trovati %d corsi con la tua ricerca' % len(search_result))
descr=""
for corso in search_result:
descr+="Nome: %s\nID: %s\n" % (corso['Denominazione'], corso['ID'])
descr+="Codice: %s\nOrdinamento: %s\n Tipo: %s\n\n" % (corso['Codice'], corso['Ordinamento'], corso['Tipo'])
bot.sendMessage(update.message.chat_id, text=descr)
else:
bot.sendMessage(update.message.chat_id, text='Corso non trovato')
else:
bot.sendMessage(update.message.chat_id, text='Inserisci almeno 4 caratteri per la ricerca')
else:
bot.sendMessage(update.message.chat_id, text="Devi inserire il corso su cui ottenere informazioni!\n/corso <nome>")
def exams_handler(bot,update):
msg = update.message.text
msg = msg.split(' ')
if len(msg)==2:
cds_id = unidecode(msg[1])
search_result=[esame for esame in exams if cds_id==str(esame['CDS_ID'])]
if len(search_result)>0:
bot.sendMessage(update.message.chat_id, text='Sono stati trovati %d esami con la tua ricerca' % len(search_result))
for esame in search_result:
descr="Materia: %s\nData: %s\nOra: %s\n" % (esame['Insegnamento'], esame['Data'], esame['Ora'])
            descr += 'Aula: %s\nScaglione: %s\nTipo: %s\nTipo Appello: %s\n\n' % (esame['Aula'], esame['Scaglione'], esame['Tipo Esame'], esame['Appello'])
bot.sendMessage(update.message.chat_id, text=descr)
else:
bot.sendMessage(update.message.chat_id, text="Corso non trovato verifica di aver inserito l'id corretto")
else:
bot.sendMessage(update.message.chat_id, text="Inserisci l'id del corso, lo puoi conoscere usando il comando corsi")
def cleanUnicode(string):
try:
try:
            # Byte strings are decoded to unicode first; unicode input is
            # used as-is (calling .decode() on a unicode object triggers an
            # implicit ASCII encode, which fails on exactly the non-ASCII
            # input this function exists to handle)
            if isinstance(string, str):
                unicode_replaced_str = string.decode('utf-8')
            else:
                unicode_replaced_str = string
            import unidecode
            unicode_replaced_str = unidecode.unidecode(unicode_replaced_str)
            string = unicode_replaced_str
        except Exception:
            pass
fixed_string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore' )
return fixed_string
    except Exception:
        return string
#interface:
def parse(self, text, company_name):
soup = BeautifulSoup(text, 'lxml')
lis = soup.findAll('li', {'class':'search-result'})
for item in lis:
name = item.find('span', {'class':'actor-name'})
name = name.text if name else "??"
occupation = item.find('p', {'class':'search-result__snippets'})
occupation = occupation.text.replace('\n', ' ') if occupation else "??"
try:
print('[+] :: {} :: {}'.format(unidecode(name), unidecode(occupation)))
self.filewrite('[+] :: {} :: {}\n'.format(unidecode(name), unidecode(occupation)))
except Exception as e:
print('[+] :: {} :: {}\n'.format(unidecode(name.encode('utf-8', 'replace')),
unidecode(occupation.encode('utf-8', 'replace'))))
self.filewrite('[+] :: {} :: {}\n'.format(unidecode(name.encode('utf-8', 'replace')),
unidecode(occupation.encode('utf-8', 'replace'))))
def _normalize_coerce_zpl(self, value):
"""Sanitze input for ZPL.
Remove ZPL ctrl caraters
Remove accents
"""
if not isinstance(value, basestring):
return value
    ctrl_chars = [
        0x7E,  # Tilde ~
        0x5E,  # Caret ^
        0x1E,  # RS (^ substitution)
        0x10,  # DLE (~ substitution)
    ]
    val = unidecode(value)
    for ctrl in ctrl_chars:
        val = val.replace("%c" % ctrl, "")
return val
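# Example (illustrative value): unidecode strips the accents and the loop
# strips the ZPL control characters, so user data cannot open a ZPL command.
#
#   _normalize_coerce_zpl(u'Crème ^XA brûlée')
#   # -> 'Creme XA brulee'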
def ConvertToPlainText_Chunks(self, p_output_dir, p_file_number, p_chunk=True, p_chunk_size=5000):
file_name = self.GetFilename()
file_ext = self.GetFileExtension()
output_lines = self.GetPreparedLines()
# Optional line chunking
chunks = []
if p_chunk:
chunks = Utils_MalletInterpret.GetChunkedLines(output_lines, p_chunk_size)
else:
chunks.append(output_lines)
# Write out files
for index in range(len(chunks)):
with open("{0}{1}_{2}_{3}{4}".format(p_output_dir, p_file_number, file_name, index, file_ext), 'w') as plaintext_output_file:
for line in chunks[index]:
plaintext_output_file.write(unidecode(line) + u"\n")
p_file_number += 1
return len(chunks)
def output_preprocessed_data(self, json_input, file_name):
'''
Output preprocessed data into a file.
:param json_input: json formatted data generated from function str_process
:param file_name: output file name
:return: None
'''
rows = []
for sent in json_input['sentences']:
parsed_sent = " ".join([t['originalText'] + "/" + t['pos'] for t in sent['tokens']])
rows.append(parsed_sent)
output_file_path = self.output_folder + '/' + file_name
    # mode 'w' truncates any existing file, so no separate truncate pass is
    # needed before writing
    with open(output_file_path, 'w') as preprocessed_out:
        for r in rows:
            preprocessed_out.write(unidecode.unidecode(r) + "\n")
def preprocess(post):
# example
# {(romeo and juliet 2013),(romeo and juliet),(douglas booth),(hailee steinfeld)}"
# -> romeo and juliet 2013 romeo and juliet douglas booth hailee steinfeld
print post
# remove all punctuations
post = PUNCTUATION.sub(' ', utils.to_unicode(post))
    # replace each emoji character with a space-padded '_emoticon_' token
post = EMOTICON.sub(' _emoticon_ ', post)
# convert all special characters to ascii characters
post = unidecode(post).decode('ascii', 'ignore')
    # collapse runs of whitespace into a single space
post = WHITESPACE.sub(' ', post).strip()
return utils.to_unicode(post)
def _sanitize(self, text):
# removing duplicated spaces
text = ' '.join(text.split())
# removing digits
text = ''.join([c for c in text if not c.isdigit()])
# removing accents
text = unidecode(text)
    # replacing hyphens and apostrophes with spaces, then stripping the
    # remaining punctuation (string.maketrans requires equal-length
    # arguments, hence the two-space replacement string)
    text = text.translate(
        string.maketrans("-'", '  ')).translate(None,
                                                string.punctuation)
    # lowercasing
    text = text.lower()
    return text
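# Example (illustrative value): apostrophes become spaces before the
# remaining punctuation is stripped.
#
#   _sanitize(u"Rock'n'Roll!")
#   # -> 'rock n roll'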
def form_valid(self, form):
form = PartialNewPostForm(self.request.POST)
post = form.save(commit=False)
post.author = self.request.user
post.slug = unidecode(post.title)
post.slug = slugify(post.slug)
post.save()
if self.request.POST['tags_field']:
tags = self.request.POST['tags_field'].replace(', ', ',').split(',')
for tag_name in tags:
tag = Tag()
tag.post = post
tag.name = tag_name
tag.save()
self.success_url = "/post/" + post.slug
return super(NewPostView, self).form_valid(form)
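# The slug pipeline above in isolation: unidecode transliterates to ASCII
# first, so Django's slugify does not silently drop non-Latin characters
# (illustrative value).
#
#   slugify(unidecode(u'Café Über Alles'))  # -> 'cafe-uber-alles'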
def search(keywords, lang):
formated_keywords = [ unidecode.unidecode(keyword).lower() for keyword in keywords ]
with model.laima_db.transaction():
query = (model.CardText
.select(model.CardText, model.CardData)
.join(model.CardData)
.switch(model.CardText)
.where(model.CardText.lang == lang)
.join(model.CardTextTag)
.join(model.Tag)
.where(model.Tag.name << formated_keywords)
.group_by(model.CardText)
.having(fn.Count(model.Tag.id) == len(keywords))
.order_by(model.CardText.name))
if query.exists():
count = query.count()
cards = [ card for card in query ]
return cards, count
else:
return [], 0
def log(self, txt = '', level=xbmc.LOGDEBUG):
    ''' Log a message to the Kodi logfile '''
try:
if self.detailLevel > 0 or level == xbmc.LOGERROR:
if self.detailLevel == 2 and level == xbmc.LOGDEBUG:
# More Logging
level = xbmc.LOGNOTICE
elif self.detailLevel == 3 and (level == xbmc.LOGDEBUG or level == xbmc.LOGSEVERE):
# Complex Logging
level = xbmc.LOGNOTICE
if level != xbmc.LOGSEVERE:
if isinstance(txt, unicode):
txt = unidecode(txt)
xbmc.log(b"[%s] %s" % (self.pluginName, txt), level)
except:
xbmc.log(b"[%s] Unicode Error in message text" % self.pluginName, xbmc.LOGERROR)
def create_file(path, list_to_save):
f = open(path, 'w')
headers = []
for entry in list_to_save:
for key, val in entry.items():
if not key in headers:
headers.append(key)
headline = ";".join(headers) + '\n'
f.write(headline)
for entry in list_to_save:
line = ''
for header in headers:
if header in entry:
line += entry[header]
line += ';'
        try:
            line = unidecode(line)
            f.write(line + "\n")
        except Exception as e:
            print(e)
    f.close()
def __call__(self, unicode_text):
'''
Runs the parser.
Args:
unicode_text: a unicode document
Returns:
text: An ascii equivalent of unicode_text
'''
return unidecode.unidecode(unicode_text)
# if __name__ == "__main__":
# text = u"?-Helix ?-sheet ?? ?? ?? ?? ?? ??"
# parser = unidecoder()
# print(parser(text))