def show_edit_distance(self, num):
    num_left = num
    mean_norm_ed = 0.0
    mean_ed = 0.0
    while num_left > 0:
        word_batch = next(self.text_img_gen)[0]
        num_proc = min(word_batch['the_input'].shape[0], num_left)
        decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
        for j in range(0, num_proc):
            edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
            mean_ed += float(edit_dist)
            mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
        num_left -= num_proc
    mean_norm_ed = mean_norm_ed / num
    mean_ed = mean_ed / num
    print('\nOut of %d samples:  Mean edit distance: %.3f  Mean normalized edit distance: %0.3f'
          % (num, mean_ed, mean_norm_ed))
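This callback (from the Keras image_ocr example) accumulates raw and length-normalized edit distances over batches produced by a data generator and a CTC-decoding function. As a point of reference, here is a minimal standalone sketch of the same metric computation over plain prediction/ground-truth string lists; the helper name and sample strings are illustrative, not part of the original snippet:

import editdistance

def mean_edit_distances(predictions, ground_truths):
    # returns (mean edit distance, mean length-normalized edit distance)
    total_ed = 0.0
    total_norm_ed = 0.0
    for pred, truth in zip(predictions, ground_truths):
        ed = editdistance.eval(pred, truth)
        total_ed += ed
        total_norm_ed += ed / len(truth)   # normalize by reference length
    n = len(ground_truths)
    return total_ed / n, total_norm_ed / n

print(mean_edit_distances(['helo', 'world'], ['hello', 'world']))  # (0.5, 0.1)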
def total_distance(observed_sentence, corrected_sentence):
    """Calculates the total distance between the two given sentences.

    Args:
        observed_sentence: Observed sentence.
        corrected_sentence: Corrected sentence.

    Returns:
        Total Levenshtein distance between the two sentences.
    """
    total_distance = 0
    observed_words = list(observed_sentence)
    corrected_words = list(corrected_sentence)
    for i in range(len(observed_words)):
        comparable_words = observed_words[i], corrected_words[i]
        total_distance += editdistance.eval(*comparable_words)
    return total_distance
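A brief usage sketch (the word lists here are made up for illustration): because the loop pairs words by index, both inputs are expected to be equal-length, word-aligned sequences.

observed = ['teh', 'quick', 'brwn', 'fox']
corrected = ['the', 'quick', 'brown', 'fox']
print(total_distance(observed, corrected))  # 2 + 0 + 1 + 0 = 3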
def similarities(self):
    """
    Compute Levenshtein distance matrix between files (implemented in C++ pip package: editdistance)
    Later: https://docs.python.org/2/library/difflib.html
    :return:
    """
    ucos = sorted(self.filedb.keys())
    sims = {}
    for idx, uco in enumerate(ucos):
        logger.info('Comparing %s...' % uco)
        sims[uco] = {}
        for uco2 in ucos[idx + 1:]:
            dist = editdistance.eval(self.file_data[uco], self.file_data[uco2])
            sims[uco][uco2] = dist
            logger.info(' %6d vs %6d : %4d %s %s'
                        % (uco, uco2, dist, self.filedb[uco], self.filedb[uco2]))
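The pairwise structure can be illustrated without the class context; the toy file contents and keys below are made up:

import editdistance

file_data = {101: 'int main() { return 0; }',
             102: 'int main() { return 1; }',
             103: 'print("hi")'}
sims = {}
ids = sorted(file_data)
for i, a in enumerate(ids):
    # only compare each pair once, as in the method above
    sims[a] = {b: editdistance.eval(file_data[a], file_data[b]) for b in ids[i + 1:]}
print(sims[101][102])  # 1 (the two sources differ by a single character)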
def best_match(word, corrected_med_list, corrected_english_list):
    min_dist_med = len(word)
    best_med_word = ''
    min_dist_eng = len(word)
    best_eng_word = ''
    for word_t in corrected_med_list:
        dist = editdistance.eval(word, word_t)
        if dist < min_dist_med:
            min_dist_med = dist
            best_med_word = word_t
    for word_t in corrected_english_list:
        dist = editdistance.eval(word, word_t)
        if dist < min_dist_eng:
            min_dist_eng = dist
            best_eng_word = word_t
    if min_dist_med <= min_dist_eng:
        return best_med_word
    else:
        return best_eng_word
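A small illustration of the selection logic with made-up vocabularies (not from the original project): the medical list wins ties, and a candidate is only accepted if its distance is strictly less than len(word).

med_vocab = ['aspirin', 'ibuprofen', 'codeine']
eng_vocab = ['aspire', 'inspiring']
print(best_match('asprin', med_vocab, eng_vocab))  # 'aspirin' (edit distance 1)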
def compare_strings_by_edit_distance(first=None, second=None):
    """
    Get the edit distance between the two strings passed to this method.
    :param first: The first string to compare.
    :param second: The second string to compare.
    :return: A number representing the edit distance between the two strings passed
    as arguments to this method.
    """
    return editdistance.eval(first, second)
# Class Methods
# Public Methods
# Protected Methods
# Private Methods
# Properties
# Representation and Comparison
def simscore(a1, b1):
    max_len = max([len(a1), len(b1)])
    if max_len == 0:
        return 0
    dist = editdistance.eval(a1, b1)
    if dist > max_len:
        print(dist)
    return 1.0 - (float(dist) / float(max_len))
def similarity(a1, b1):
    max_len = max([len(a1), len(b1)])
    if max_len == 0:
        return 0
    dist = editdistance.eval(a1, b1)
    return 1.0 - (float(dist) / float(max_len))
def letter_error_count(self) -> float:
    return editdistance.eval(self.expected, self.predicted)

def word_error_count(self) -> float:
    return editdistance.eval(self.expected_words, self.predicted.split())
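The two methods above return raw counts; dividing by the reference length gives the usual character and word error rates. A standalone sketch of that normalization (the example strings are illustrative, not taken from the original class):

import editdistance

expected = 'the cat sat'
predicted = 'the cap sat'

# character error rate and word error rate, normalized by reference length
cer = editdistance.eval(expected, predicted) / len(expected)
wer = editdistance.eval(expected.split(), predicted.split()) / len(expected.split())
print(cer, wer)  # 0.0909..., 0.3333...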
def edit_dis(a, b):
    return editdistance.eval(a, b)
def getEditDistanceMat(gtTranscriptions, sampleTranscriptions):
    outputShape = [len(gtTranscriptions), len(sampleTranscriptions)]
    distMat = np.empty(outputShape)
    maxSizeMat = np.empty(outputShape)
    for gtNum in range(len(gtTranscriptions)):
        for sampleNum in range(len(sampleTranscriptions)):
            distMat[gtNum, sampleNum] = editdistance.eval(gtTranscriptions[gtNum], sampleTranscriptions[sampleNum])
            maxSizeMat[gtNum, sampleNum] = max(len(gtTranscriptions[gtNum]), len(sampleTranscriptions[sampleNum]))
    return distMat / maxSizeMat, distMat
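A quick check with toy transcriptions (illustrative values, not from the original evaluation code); the first returned matrix is normalized by the longer string's length, the second holds raw distances:

import numpy as np  # required by getEditDistanceMat
import editdistance

gt = ['stop', 'yield']
sample = ['stoop', 'yeld']
norm_mat, dist_mat = getEditDistanceMat(gt, sample)
print(dist_mat)   # [[1. 4.]   'stop'->'stoop'=1, 'stop'->'yeld'=4
                  #  [5. 1.]]  'yield'->'stoop'=5, 'yield'->'yeld'=1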
def _normalized_edit_dist(s1, s2):
    return float(editdistance.eval(s1, s2)) / max(len(s1), len(s2), 1)
def compare_cc_list_levenshtein(sample, ref):
    """
    Compares the cyclomatic complexity values of all functions in `sample`
    with those of all functions in `ref`, by taking the Levenshtein distance
    between these lists. This detects added/removed functions and functions
    that have changed in complexity between a sample and a reference.
    """
    if hasattr(ref, 'cclist') and ref.cclist is not None:
        ratio = 1 - (editdistance.eval(sample.cclist, ref.cclist)
                     / float(max(len(sample.cclist), len(ref.cclist))))
    else:
        ratio = 0.0
    return (ratio * 100, ref.name, ref.version)
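editdistance.eval accepts any sequences of hashable items, not just strings, so per-function complexity lists can be compared directly. A minimal sketch with made-up complexity values (the attribute-bearing sample/ref objects of the original are replaced by plain lists here):

import editdistance

sample_cc = [1, 3, 7, 2]    # cyclomatic complexity per function, sample binary
ref_cc = [1, 3, 8, 2, 5]    # reference: one function changed, one added

dist = editdistance.eval(sample_cc, ref_cc)                 # 2 edits
ratio = 1 - dist / float(max(len(sample_cc), len(ref_cc)))  # 0.6
print(ratio * 100)                                          # 60.0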
def annotate(self, tokens):
    X_focus = self.preprocessor.transform(tokens=tokens)['X_focus']
    X_context = self.pretrainer.transform(tokens=tokens)
    # get predictions:
    new_in = {}
    if self.include_token:
        new_in['focus_in'] = X_focus
    if self.include_context:
        new_in['context_in'] = X_context
    preds = self.model.predict(new_in)
    if isinstance(preds, np.ndarray):
        preds = [preds]
    annotation_dict = {'tokens': tokens}
    if self.include_lemma:
        pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=preds[self.lemma_out_idx])
        annotation_dict['lemmas'] = pred_lemmas
        if self.postcorrect:
            for i in range(len(pred_lemmas)):
                if pred_lemmas[i] not in self.known_lemmas:
                    pred_lemmas[i] = min(self.known_lemmas,
                                         key=lambda x: editdistance.eval(x, pred_lemmas[i]))
            annotation_dict['postcorrect_lemmas'] = pred_lemmas
    if self.include_pos:
        pred_pos = self.preprocessor.inverse_transform_pos(predictions=preds[self.pos_out_idx])
        annotation_dict['pos'] = pred_pos
    if self.include_morph:
        pred_morph = self.preprocessor.inverse_transform_morph(predictions=preds[self.morph_out_idx])
        annotation_dict['morph'] = pred_morph
    return annotation_dict
def searchPackages(name):
    results = loadJson('https://www.archlinux.org/packages/search/json/?q=%s' % name)['results']
    results = sorted(results, key=lambda x: levdist(name, x['pkgname']))[:100]
    packages = [parsePackage(package, name) for package in results if package['arch'] in (arch, 'any')]
    results = loadJson('https://aur.archlinux.org/rpc/?v=5&type=search&arg=%s' % name)['results']
    results = sorted(results, key=lambda x: levdist(name, x['Name']))[:100]
    packages += [parsePackage(package, name) for package in results]
    packages = sorted(packages, key=lambda x: levdist(name, x[0]))[:100]
    return packages
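levdist is not defined in this snippet; it presumably wraps editdistance.eval so that both official-repo and AUR results can be ranked by closeness to the query. A guessed, minimal stand-in (an assumption, not the project's actual helper):

import editdistance

def levdist(a, b):
    # assumed helper: plain Levenshtein distance between query and package name
    return editdistance.eval(a, b)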
def set_trimming(self, u, t, use_edit_distance=True):
    untrimmed = u.query_sequence.upper()
    untrimmed_len = len(untrimmed)
    trimmed = t.query_sequence.upper()
    trimmed_len = len(trimmed)
    trimmed_front = 0 if use_edit_distance else -1
    if use_edit_distance and (untrimmed_len > trimmed_len):
        for i in range(untrimmed_len - trimmed_len + 1):
            if untrimmed[i:(i + trimmed_len)] == trimmed:
                trimmed_front = i
                break
        else:
            # Since Skewer performs automatic error correction, the trimmed and
            # untrimmed reads may not match, so in that case we find the closest
            # match by Levenshtein distance.
            dist = None
            for i in range(untrimmed_len - trimmed_len + 1):
                d = editdistance.eval(untrimmed[i:(i + trimmed_len)], trimmed)
                if not dist:
                    dist = d
                elif d < dist:
                    trimmed_front = i
                    dist = d
    self.trimmed_front = trimmed_front
    self.trimmed_back = untrimmed_len - (trimmed_len + trimmed_front)
# `ed` here refers to the editdistance package (presumably imported as: import editdistance as ed)
def edit(seq1, seq2):
    """
    Wrapper around editdistance.eval for fast Levenshtein
    distance computation.

    Args:
        seq1 (str): Reference sequence
        seq2 (str): Sequence to compare

    Examples:
        >>> edit('banana', 'bahama')
        2
    """
    return int(ed.eval(seq1, seq2))
def edit_distance(train_in, test_in, qcolumns=['question1', 'question2'], append=''):
    train = train_in.copy().loc[:, qcolumns]
    test = test_in.copy().loc[:, qcolumns]
    import editdistance

    def my_fun(row, qcolumns):
        return editdistance.eval(row[qcolumns[0]], row[qcolumns[1]])

    key = 'edit_dist' + append
    train[key] = train.apply(lambda x: my_fun(x, qcolumns=qcolumns), axis=1)
    test[key] = test.apply(lambda x: my_fun(x, qcolumns=qcolumns), axis=1)
    return (train, test)
def bestNameDiff(profileone, profiletwo):
    """ Applies Levenshtein distance between best names of two profiles."""
    n1 = profileone.bestname()
    n2 = profiletwo.bestname()
    if (not n1) or (not n2):
        return 0
    l1 = profileone.name_length
    l2 = profiletwo.name_length
    diff = editdistance.eval(n1, n2)
    return 1 - (diff / (l1 if l1 > l2 else l2))
def string_sim(n1, n2):
    """ Applies Levenshtein distance between strings."""
    if (not n1) or (not n2):
        return 0
    l1 = len(n1)
    l2 = len(n2)
    diff = editdistance.eval(n1, n2)
    return 1 - (diff / (l1 if l1 > l2 else l2))
Source: Mandalorion_12_Create_Consensi.py (project: Mandalorion, author: christopher-vollmers)
def collect_file_paths(path, gene_file):
    genes_of_interest = []
    for line in open(gene_file):
        genes_of_interest.append(line.strip())
    isoform_list = []
    gene_read_counter = {}
    isoform_read_counter = {}
    for gene in genes_of_interest:
        gene_read_counter[gene] = 0
        for file1 in sorted(os.listdir(path + '/parsed_reads')):
            if gene in file1:
                file2 = file1 + '_sub'
                out_sub = open(path + '/parsed_reads/' + file2, 'w')
                counter = 0
                isoform_reads = read_fasta(path + '/parsed_reads/' + file1)
                isoform_read_list = list(isoform_reads.keys())
                print(gene_read_counter, gene_read_counter[gene], len(isoform_reads.keys()))
                gene_read_counter[gene] += len(isoform_reads.keys())
                isoform_read_counter[path + '/parsed_reads/' + file2] = len(isoform_reads.keys())
                read1 = isoform_read_list[0]
                out_sub.write('>' + read1 + '\n' + isoform_reads[read1] + '\n')
                for read2 in isoform_read_list[1::]:
                    if counter < subsample:
                        out_sub.write('>' + read2 + '\n')
                        dist_1 = editdistance.eval(isoform_reads[read1], isoform_reads[read2])**2 / float(len(isoform_reads[read1]) * len(isoform_reads[read2]))
                        dist_2 = editdistance.eval(isoform_reads[read1], reverse_complement(isoform_reads[read2]))**2 / float(len(isoform_reads[read1]) * len(isoform_reads[read2]))
                        if dist_1 < dist_2:
                            out_sub.write(isoform_reads[read2] + '\n')
                        else:
                            out_sub.write(reverse_complement(isoform_reads[read2]) + '\n')
                        counter += 1
                isoform_list.append((path + '/parsed_reads/' + file2, gene))
    return isoform_list, gene_read_counter, isoform_read_counter
def test_simulate_sequencing_errors(self):
    """Test function simulating sequencing errors."""
    error_rate = 0.1
    error_weights = {'substitution': 1.0 / 6,
                     'insertion': 1.0 / 6,
                     'deletion': 4.0 / 6}
    sequence = sim_seq.simulate_sequence(5000)
    mutated_record = sim_seq.simulate_sequencing_errors(
        sequence, error_rate, error_weights)
    distance = editdistance.eval(sequence, mutated_record.seq)
    expected_errors = len(sequence) * error_rate
    errors_sd = np.sqrt(len(sequence) * error_rate * (1 - error_rate))
    # Should pass 0.9973 proportion of cases:
    self.assertTrue(expected_errors - errors_sd * 3 < distance < expected_errors + errors_sd * 3,
                    msg="expected: {} realised: {}".format(expected_errors, distance))
def show_edit_distance(self, num):
    num_left = num
    mean_norm_ed = 0.0
    mean_ed = 0.0
    wrong = 0
    right = 0
    while num_left > 0:
        word_batch = next(self.text_img_gen)[0]
        num_proc = min(word_batch['the_input'].shape[0], num_left)
        decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc],
                                   word_batch['labeltype_input'][0:num_proc])
        for j in range(0, num_proc):
            ocr_result = deaccent(unicode(re.sub("[\+\/]", "", re.sub("\\s", "", decoded_res[j])), 'utf-8'))
            gold_label = re.sub("[\+\/]", "", re.sub("\\s", "", word_batch['source_str'][j]))
            if gold_label == ocr_result:
                right += 1
            else:
                wrong += 1
            edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
            mean_ed += float(edit_dist)
            mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
        num_left -= num_proc
    absacc = float(right) / (float(right) + float(wrong))
    mean_norm_ed = mean_norm_ed / num
    mean_ed = mean_ed / num
    outline = '  Out of %d samples:  Mean edit distance: %.3f  Mean normalized edit distance: %0.3f\n  Absolute accuracy over labels is %0.2f\n' % (
        num, mean_ed, mean_norm_ed, absacc)
    print(outline)
    return mean_norm_ed, absacc
def text_distance(str1, str2):
    str1 = normalize_txt(str1)
    str2 = normalize_txt(str2)
    return editdistance.eval(str1, str2)
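normalize_txt is defined elsewhere in that project. A self-contained sketch with a hypothetical normalizer (lowercasing, stripping punctuation, collapsing whitespace) shows the intended effect; the normalization details are an assumption:

import re
import editdistance

def normalize_txt(s):
    # hypothetical stand-in for the project's normalizer
    return ' '.join(re.sub(r'[^\w\s]', '', s.lower()).split())

print(editdistance.eval(normalize_txt('Hello,  World!'), normalize_txt('hello world')))  # 0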