def wer(self, s1, s2):
"""
Computes the Word Error Rate, defined as the edit distance between the
two provided sentences after tokenizing to words.
Arguments:
s1 (string): space-separated sentence
s2 (string): space-separated sentence
"""
# build mapping of words to integers
b = set(s1.split() + s2.split())
word2char = {ss: ii for ii, ss in enumerate(b)}
# map the words to a char array (the Levenshtein package only accepts
# strings)
w1 = [chr(word2char[w]) for w in s1.split()]
w2 = [chr(word2char[w]) for w in s2.split()]
return Lev.distance(''.join(w1), ''.join(w2))
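The method above never touches self, so it can be sanity-checked directly by passing None for it. A minimal sketch, assuming the python-Levenshtein package is importable (the listing refers to it as Lev); the normalisation by reference length in the last line is the usual WER convention rather than something the method does itself:

import Levenshtein as Lev

ref = "the cat sat on the mat"
hyp = "the cat sit on mat"
edits = wer(None, ref, hyp)          # word-level edit distance: 2 (one substitution, one deletion)
print(edits / len(ref.split()))      # -> 0.333..., the conventional Word Error Rate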
def levenshtein_distance(a, b):
"""Return the Levenshtein edit distance between two strings *a* and *b*."""
if a == b:
return 0
if len(a) < len(b):
a, b = b, a
if not a:
return len(b)
previous_row = range(len(b) + 1)
for i, c1 in enumerate(a):
current_row = [i + 1]
for j, c2 in enumerate(b):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
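A couple of spot checks for the pure-Python implementation above; it has no third-party dependencies:

assert levenshtein_distance("kitten", "sitting") == 3   # substitution, substitution, insertion
assert levenshtein_distance("", "abc") == 3             # three insertions
assert levenshtein_distance("same", "same") == 0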
def getStopFromString(self, candidate):
normalizedCandidate = Stop.normalizeStopName(candidate)
if not Tpg.getTodaysStops():
return None
for stop in Tpg.getTodaysStops():
if candidate.upper() == stop.code:
return stop
if normalizedCandidate == stop.normalizedName:
return stop
for stop in Tpg.getTodaysStops():
if normalizedCandidate in stop.normalizedName:
return stop
# calculate the Levenshtein distance to all stop names
codeToLevenshtein = {stop: Levenshtein.distance(
normalizedCandidate, stop.normalizedName) for stop in Tpg.getTodaysStops()}
# return the stop with the smallest Levenshtein distance
closest_stop = min(codeToLevenshtein, key=codeToLevenshtein.get)
return closest_stop
def test_parse(self):
for file in os.listdir(SAMPLE_DIR):
if not file.endswith('.rst'):
continue
filename = os.path.join(SAMPLE_DIR, file)
article = parse_article(filename)
rendered = article.render().strip()
with open(filename) as f:
source = f.read().strip()
source = source.expandtabs(4).decode('utf8')
if source != rendered:
lev_ = distance(source, rendered)
jaro_ = jaro(source, rendered)
if lev_ > 10 and jaro_ < 0.8 and file not in MUTATED_FILES:
print('%d %f %s' % (lev_, jaro_, filename))
raise AssertionError(filename)
edit_distance.py, from the project tensorflow_end2end_speech_recognition (author: hirofumi0810)
def compute_edit_distance(session, labels_true_st, labels_pred_st):
"""Compute edit distance per mini-batch.
Args:
session:
labels_true_st: A `SparseTensor` of ground truth
labels_pred_st: A `SparseTensor` of prediction
Returns:
edit_distances: list of edit distance of each utterance
"""
indices, values, dense_shape = labels_true_st
labels_true_pl = tf.SparseTensor(indices, values, dense_shape)
indices, values, dense_shape = labels_pred_st
labels_pred_pl = tf.SparseTensor(indices, values, dense_shape)
edit_op = tf.edit_distance(labels_pred_pl, labels_true_pl, normalize=True)
edit_distances = session.run(edit_op)
return edit_distances
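A minimal sketch of driving the helper above with the TensorFlow 1.x session API; the two label tuples are made-up (indices, values, dense_shape) triples for a batch containing a single three-label utterance:

import numpy as np
import tensorflow as tf  # assumes a TF 1.x-style API with tf.Session

labels_true_st = (np.array([[0, 0], [0, 1], [0, 2]], dtype=np.int64),
                  np.array([1, 2, 3], dtype=np.int64),
                  np.array([1, 3], dtype=np.int64))
labels_pred_st = (np.array([[0, 0], [0, 1], [0, 2]], dtype=np.int64),
                  np.array([1, 2, 4], dtype=np.int64),
                  np.array([1, 3], dtype=np.int64))
with tf.Session() as sess:
    # one of the three true labels differs, so the normalized distance is about 0.33
    print(compute_edit_distance(sess, labels_true_st, labels_pred_st))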
edit_distance.py, from the project tensorflow_end2end_speech_recognition (author: hirofumi0810)
def compute_per(ref, hyp, normalize=True):
"""Compute Phone Error Rate.
Args:
ref (list): phones in the reference transcript
hyp (list): phones in the predicted transcript
normalize (bool, optional): if True, divide by the length of ref
Returns:
per (float): Phone Error Rate between ref and hyp
"""
# Build mapping of phone to index
phone_set = set(ref + hyp)
phone2char = dict(zip(phone_set, range(len(phone_set))))
# Map phones to a single char array
# NOTE: the Levenshtein package only accepts strings
phones_ref = [chr(phone2char[p]) for p in ref]
phones_hyp = [chr(phone2char[p]) for p in hyp]
per = lev.distance(''.join(phones_ref), ''.join(phones_hyp))
if normalize:
per /= float(len(ref))  # float() avoids integer division under Python 2
return per
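A short usage sketch for compute_per, again assuming python-Levenshtein is importable under the name lev used above:

import Levenshtein as lev

ref = ['sh', 'ix', 'hh', 'eh', 'd']   # toy phone sequences
hyp = ['sh', 'ix', 'hh', 'ah', 'dx']
print(compute_per(ref, hyp))          # 2 edits over 5 reference phones -> 0.4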
def inference(predictions_op, true_labels_op, display, sess):
""" Perform inference per batch on pre-trained model.
This function performs inference and computes the CER per utterance.
Args:
predictions_op: Prediction op
true_labels_op: True Labels op
display: print sample predictions if True
sess: default session to evaluate the ops.
Returns:
char_err_rate: list of CER per utterance.
"""
char_err_rate = []
# Perform inference on one batch worth of data at a time.
[predictions, true_labels] = sess.run([predictions_op,
true_labels_op])
pred_label = sparse_to_labels(predictions[0][0])
actual_label = sparse_to_labels(true_labels)
for (label, pred) in zip(actual_label, pred_label):
char_err_rate.append(distance(label, pred)/len(label))
if display:
# Print sample responses
for i in range(ARGS.batch_size):
print(actual_label[i] + ' vs ' + pred_label[i])
return char_err_rate
def get_strings_for_search(value):
"""
Returns all statements which have a substring of the given value
:param value: String
:return: dict() with Statements.uid as key and 'text', 'distance' as well as 'arguments' as values
"""
tmp_dict = OrderedDict()
db_statements = get_not_disabled_statement_as_query().join(TextVersion, Statement.textversion_uid == TextVersion.uid).all()
for stat in db_statements:
if value.lower() in stat.textversions.content.lower():
# get distance between input value and saved value
rd = __get_fuzzy_string_dict(current_text=value, return_text=stat.textversions.content, uid=stat.uid)
tmp_dict[str(stat.uid)] = rd
tmp_dict = __sort_dict(tmp_dict)
return_index = list(islice(tmp_dict, list_length))
return_dict = OrderedDict()
for index in return_index:
return_dict[index] = tmp_dict[index]
return return_dict
def get_strings_for_public_nickname(value, nickname):
"""
Returns dictionaries with public nicknames of users, where the nickname contains the value
:param value: String
:param nickname: the current user's nickname
:return: dict()
"""
db_user = DBDiscussionSession.query(User).filter(func.lower(User.public_nickname).contains(func.lower(value)),
~User.public_nickname.in_([nickname, 'admin', nick_of_anonymous_user])).all()
return_array = []
for index, user in enumerate(db_user):
dist = get_distance(value, user.public_nickname)
return_array.append({'index': index,
'distance': dist,
'text': user.public_nickname,
'avatar': get_public_profile_picture(user)})
return_array = __sort_array(return_array)
return return_array[:list_length]
def __sort_array(array):
"""
Returns sorted array, based on the distance
:param array: Array
:return: Array
"""
return_list = []
newlist = sorted(array, key=lambda k: k['distance'])
if mechanism == 'SequenceMatcher':  # sort descending
newlist = reversed(newlist)
# add index
for index, entry in enumerate(newlist):
entry['index'] = index
return_list.append(entry)
return return_list
def __sort_dict(dictionary):
"""
Returns sorted dictionary, based on the distance
:param dictionary: dict()
:return: dict()
"""
dictionary = OrderedDict(sorted(dictionary.items()))
if mechanism == 'SequenceMatcher':  # sort descending
return_dict = OrderedDict(sorted(dictionary.items(), key=lambda kv: kv[0], reverse=True))
else:  # sort ascending
return_dict = OrderedDict()
for i in list(dictionary.keys())[0:return_count]:
return_dict[i] = dictionary[i]
return return_dict
def getSignificantItems(item_list):
tokenised_list = []
logging.info('Tokenising input data.')
for item in item_list:
tokenised_list.append(tokeniseUrl(item))
items = np.asarray(item_list)
tokenised_items = np.asarray(tokenised_list)
logging.info('Calculating Levenshtein distances between items.')
lev_similarity = -1*np.array([[Levenshtein.distance(i1,i2) for i1 in tokenised_items] for i2 in tokenised_items])
logging.info('Applying affinity propagation to data.')
aff_prop = sklearn.cluster.AffinityPropagation(affinity='precomputed', damping=0.7)
aff_prop.fit(lev_similarity)
logging.info('Completed! Assembling list.')
output_list = []
for cluster_id in np.unique(aff_prop.labels_):
exemplar = items[aff_prop.cluster_centers_indices_[cluster_id]]
output_list.append(exemplar)
return output_list
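getSignificantItems depends on a tokeniseUrl helper that is not part of this listing; the core idea, feeding negated Levenshtein distances to AffinityPropagation as a precomputed affinity matrix and keeping one exemplar per cluster, can be sketched on raw strings like this (the example URLs are invented):

import numpy as np
import sklearn.cluster
import Levenshtein

urls = ['/api/user/1', '/api/user/2', '/api/user/3', '/static/app.js', '/static/app.css']
lev_similarity = -1 * np.array([[Levenshtein.distance(a, b) for a in urls] for b in urls])
aff_prop = sklearn.cluster.AffinityPropagation(affinity='precomputed', damping=0.7)
aff_prop.fit(lev_similarity)
for cluster_id in np.unique(aff_prop.labels_):
    print(urls[aff_prop.cluster_centers_indices_[cluster_id]])   # one exemplar URL per cluster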
def wer(self, s1, s2):
"""
Computes the Word Error Rate, defined as the edit distance between the
two provided sentences after tokenizing to words.
Arguments:
s1 (string): space-separated sentence
s2 (string): space-separated sentence
"""
# build mapping of words to integers
b = set(s1.split() + s2.split())
word2char = dict(zip(b, range(len(b))))
# map the words to a char array (the Levenshtein package only accepts
# strings)
w1 = [chr(word2char[w]) for w in s1.split()]
w2 = [chr(word2char[w]) for w in s2.split()]
return Lev.distance(''.join(w1), ''.join(w2))
merge_ocr_subtitle.py, from the project video_subtitle_extract (author: thewintersun)
def maybe_same(str1, str2):
'''Heuristically decide whether two strings are probably the same,
based on their lengths and Levenshtein distance.'''
# make str1 the shorter of the two strings
if len(str1) > len(str2):
str1, str2 = str2, str1
# if the longer string is more than twice the length of the shorter one, treat them as different
if float(len(str2)) / len(str1) > 2 and len(str1) >= 4:
return False
# allow a larger edit distance for longer strings, a smaller one for short strings
distance = Levenshtein.distance(str1, str2)
if distance <= 3 and len(str1) >= 10:
return True
if distance <= 4 and len(str1) >= 13:
return True
if distance <= 1 and len(str1) >= 5:
return True
if distance > 2 and len(str1) <= 6:
return False
if distance > 3:
return False
return True
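Two quick illustrations of the heuristic above; the thresholds were tuned for merging OCR'd subtitle lines, so short strings need to be near-identical to count as the same:

import Levenshtein

print(maybe_same('今天天气不错', '今天天气不错哦'))   # one edit on a six-character string -> True
print(maybe_same('abcdef', 'uvwxyz'))                 # six edits on a six-character string -> False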
def find_knn(self, train_strings, train_labels, test_strings):
"""Find 3 nearest neighbors of each item in test_strings in
train_strings and report their labels as the prediction.
Args:
train_strings (ndarray): Numpy array with strings in training set
train_labels (ndarray): Numpy array with labels of train_strings
test_strings (ndarray): Numpy array with strings to predict labels for
Returns:
prediction (ndarray): array of shape (len(test_strings), num_classes) with 3-NN vote fractions
"""
prediction = np.zeros((len(test_strings), self.num_classes))
for i in range(len(test_strings)):
a_str = test_strings[i]
dists = np.array([0] * len(train_strings))
for j in range(len(train_strings)):
b_str = train_strings[j]
dists[j] = lev.distance(a_str, b_str)
# finding the top 3
top3 = dists.argsort()[:3]
for ind in top3:
prediction[i][self.column_index[train_labels[ind]]] += 1.0 / 3
return prediction
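find_knn relies on self.num_classes and self.column_index from its surrounding class; the underlying idea, voting over the labels of the three nearest neighbours by Levenshtein distance, can be shown standalone (the helper name knn_label is made up for this sketch):

import numpy as np
import Levenshtein as lev

def knn_label(query, train_strings, train_labels, k=3):
    # label the query by majority vote over its k closest training strings
    dists = np.array([lev.distance(query, s) for s in train_strings])
    top_k = dists.argsort()[:k]
    labels, counts = np.unique(np.asarray(train_labels)[top_k], return_counts=True)
    return labels[counts.argmax()]

print(knn_label('appl', ['apple', 'apply', 'orange', 'banana'], ['fruit', 'verb', 'fruit', 'fruit']))  # -> fruit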
def gitignores(*args):
to_send = []
gitignore_list = list()
for arg in set(args):
if arg in gitignore_list:
to_send.append(arg)
elif __name__ == '__main__':
possibles = []
for gitignore in gitignore_list:
if Levenshtein.distance(gitignore, arg) == 1:
possibles.append(gitignore)
print('WARNING: {} is not in gitignore list.'.format(arg), file=sys.stderr, end='')
if possibles:
if len(possibles) == 1:
possible_string = possibles[0]
else:
possible_string = ', '.join(possibles[:-1]) + ' or ' + possibles[-1]
print(' Did you mean {}?'.format(possible_string), file=sys.stderr)
else:
print('', file=sys.stderr)
if not to_send:
return '\n'
text = _get_text_from_url('{}/{}'.format(API_URL, ','.join(to_send)))
return '\n'.join(text.split('\n')[2:])
def prune_useless_elements(path_root):
to_remove = []
for c in path_root.children:
for useless in USELESS_KEYWORDS:
if c.id and (distance(c.id, useless) <= MAX_DISTANCE or useless in c.id):
#print('Removing {0} because of id {1}'.format(c, useless))
to_remove.append(c)
if c.cls:
for cl in c.cls:
if distance(cl, useless) <= MAX_DISTANCE or useless in cl:
#print('Removing {0} because of class name {1}'.format(c, cl))
to_remove.append(c)
path_root.children = [c for c in path_root.children if c not in to_remove]
for c in path_root.children:
prune_useless_elements(c)
def distanceDomain(domain, DomainDict, ccTldDict, tldDict):
similarDomain = ""
minDistance = sys.maxint
level = domain.split(".")
if len(level) <=1:
return ("not a domain", sys.maxint)
(domain2LD, domain3LD, domain2LDs, domain3LDs) = extractLevelDomain(domain, ccTldDict, tldDict)
for popularDomain in DomainDict:
distance = Levenshtein.distance(domain2LD.decode('utf-8'), popularDomain.decode('utf-8'))
if distance < minDistance:
minDistance = distance
similarDomain = popularDomain
#debug
#sys.stdout.write("subdomain: %s, similarDomain: %s, minDistance: %d\n" % (subdomain, similarDomain, minDistance))
if len(similarDomain) > 0:
return (similarDomain, minDistance/float(len(similarDomain)))
else:
return (domain2LD, 0)
# check whether a domain contains invalid TLD
def wer(self, s1, s2):
"""
Computes the Word Error Rate, defined as the edit distance between the
two provided sentences after tokenizing to words.
Arguments:
s1 (string): space-separated characters, with the literal token '<space>' marking word boundaries
s2 (string): same format as s1
"""
# build mapping of words to integers
s1 = s1.replace(' ','')
s2 = s2.replace(' ','')
b = set(s1.split('<space>') + s2.split('<space>'))
word2char = dict(zip(b, range(len(b))))
# map the words to a char array (the Levenshtein package only accepts
# strings)
w1 = [chr(word2char[w]) for w in s1.split('<space>')]
w2 = [chr(word2char[w]) for w in s2.split('<space>')]
return Lev.distance(''.join(w1), ''.join(w2))
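This variant is for character-level model output, where every character is space-separated and the literal token '<space>' marks word boundaries. A small example, again passing None for the unused self and assuming python-Levenshtein is imported as Lev as in the earlier examples:

print(wer(None, 'h e l l o <space> w o r l d', 'h e l l o <space> w o r d'))   # -> 1: 'world' vs 'word' is one word substitution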
def distance(self):
if not self._distance:
self._distance = distance(self._str1, self._str2)
return self._distance
def levenshtein(self, msg, args):
"""Calculate levenshtein distance between two words"""
if len(args) == 2:
result = "Levenshtein distance: " + str(pylev.distance(args[0],args[1]))
else:
result = "Two words are needed to calculate Levenshtein distance"
return result
def filter_hits_by_distance(hits, source_text,
min_similarity=DEFAULT_MIN_SIMILARITY):
"""Returns ES `hits` filtered according to their Levenshtein distance
to the `source_text`.
Any hits with a similarity value (0..1) lower than `min_similarity` will be
discarded. It's assumed that `hits` is already sorted from higher to lower
score.
"""
if min_similarity <= 0 or min_similarity >= 1:
min_similarity = DEFAULT_MIN_SIMILARITY
filtered_hits = []
for hit in hits:
hit_source_text = hit['_source']['source']
distance = Levenshtein.distance(source_text, hit_source_text)
similarity = (
1 - distance / float(max(len(source_text), len(hit_source_text)))
)
logger.debug(
'Similarity: %.2f (distance: %d)\nOriginal:\t%s\nComparing with:\t%s',
similarity, distance, source_text, hit_source_text
)
if similarity < min_similarity:
break
filtered_hits.append(hit)
return filtered_hits
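A small usage sketch with hand-built hits in the shape the function reads (only hit['_source']['source'] is accessed); DEFAULT_MIN_SIMILARITY, logger and the Levenshtein import are assumed to be defined elsewhere in the same module:

hits = [
    {'_source': {'source': 'the quick brown fox'}},
    {'_source': {'source': 'the quick brown cat'}},
    {'_source': {'source': 'completely unrelated text'}},
]
kept = filter_hits_by_distance(hits, 'the quick brown fox', min_similarity=0.8)
print(len(kept))   # -> 2: the third hit falls below the similarity threshold and stops the scan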
def fuzzy_match(self, locale, condition_name):
condition_name = self.normalize_input(condition_name)
conditions_candidates = self.get_condition_candidates(locale, condition_name)
sorted_candidates = sorted(conditions_candidates.items(),
cmp=lambda x, y: Levenshtein.distance(condition_name, x[1]) - Levenshtein.distance(
condition_name, y[1]))
return sorted_candidates[0][0]
def get_condition_candidates(self, locale, condition_name):
return {condition: min(mappings[condition][locale], key=lambda s: Levenshtein.distance(condition_name, s)) for
condition in list(SnipsWeatherConditions)}
def match_something(item, candidates):
item = item.replace(" ", "")
item = item.replace(".", "")
item = item.replace(",", "")
lowest = candidates[0]
lowestdelta = Levenshtein.distance(item, candidates[0])
for entry in candidates:
delta = Levenshtein.distance(item, entry)
if delta < lowestdelta:
lowestdelta = delta
lowest = entry
print(delta, item, entry)
return lowest
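match_something strips spaces and punctuation from the query (but not from the candidates) before comparing it against every entry, so a sloppily formatted query still resolves to the closest candidate; note that the loop above also prints its intermediate deltas:

print(match_something('heLlo, wOrld.', ['helloworld', 'goodbye', 'something else']))   # -> helloworld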