def cer(self, s1, s2):
    """
    Computes the Character Error Rate, defined as the Levenshtein (edit)
    distance between the two sentences.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # Assumes `import Levenshtein as Lev` at module level.
    return Lev.distance(s1, s2)
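A minimal usage sketch (not from the original project), assuming the python-Levenshtein package is imported as Lev:

import Levenshtein as Lev

def cer_standalone(s1, s2):
    # Same computation as the method above, without the class context.
    return Lev.distance(s1, s2)

print(cer_standalone("the cat sat", "the cat sit"))  # -> 1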
Example source code for Python's distance()
def compare_strings_concat_levenshtein(sample, ref):
    """
    Concatenates all strings from `sample` into one, and all strings
    from `ref` into another. They are then compared by their Levenshtein
    distance. This results in a fuzzy comparison: it detects changes
    within strings and within the list of strings.
    """
    if hasattr(ref, 'strs') and ref.strs is not None:
        i = 0
        ratios = 0
        for section in ref.strs:
            if section not in sample.strs:
                continue
            strs_a_concat = ''.join(sample.strs[section])
            strs_b_concat = ''.join(ref.strs[section])
            if len(strs_a_concat) == 0 or len(strs_b_concat) == 0:
                continue
            # Similarity measurement from
            # Gheorghescu, M. (2005). An Automated Virus Classification System.
            # Virus Bulletin Conference, (October), 294-300.
            # (although they use it on a list of basic blocks instead of a
            # character string)
            ratio_sec = 1 - (Levenshtein.distance(strs_a_concat, strs_b_concat)
                             / float(max(len(strs_a_concat), len(strs_b_concat))))
            ratios += ratio_sec
            i += 1
        ratio = ratios / i if i > 0 else 0.0
    else:
        ratio = 0.0
    return (ratio * 100, ref.name, ref.version)
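A minimal sketch of the per-section ratio used above, assuming `import Levenshtein` (python-Levenshtein); the lists of strings are toy stand-ins:

import Levenshtein

def section_ratio(strs_a, strs_b):
    # 1 - (edit distance / length of the longer string), per Gheorghescu (2005).
    a, b = ''.join(strs_a), ''.join(strs_b)
    return 1 - Levenshtein.distance(a, b) / float(max(len(a), len(b)))

print(section_ratio(["foo", "bar"], ["foo", "baz"]))  # -> 0.8333...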
def compare_cc_list_levenshtein(sample, ref):
    """
    Compares the cyclomatic complexity values of all functions in `sample`
    with those of all functions in `ref`, by taking the Levenshtein distance
    between these lists. This detects added/removed functions and functions
    that have changed in complexity between a sample and a reference.
    """
    if hasattr(ref, 'cclist') and ref.cclist is not None:
        ratio = 1 - (editdistance.eval(sample.cclist, ref.cclist)
                     / float(max(len(sample.cclist), len(ref.cclist))))
    else:
        ratio = 0.0
    return (ratio * 100, ref.name, ref.version)
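The `editdistance` package computes Levenshtein distance over arbitrary sequences, not just strings, which is what makes the list-of-complexities comparison above work. A minimal sketch with hypothetical complexity lists:

import editdistance

sample_cc = [1, 3, 7, 2]   # cyclomatic complexity per function in the sample
ref_cc = [1, 3, 5, 2, 4]   # same for the reference

dist = editdistance.eval(sample_cc, ref_cc)  # 2: one substitution, one insertion
ratio = 1 - dist / float(max(len(sample_cc), len(ref_cc)))
print(ratio * 100)  # -> 60.0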
def setup_argparser(parser):
    parser.add_argument('-m', '--patternmodel', type=str, help="Pattern model of a background corpus (training data; Colibri Core unindexed patternmodel)", action='store', required=True)
    parser.add_argument('-l', '--lexicon', type=str, help="Lexicon file (training data; plain text, one word per line)", action='store', required=False)
    parser.add_argument('-L', '--lm', type=str, help="Language model file in ARPA format", action='store', required=False)
    parser.add_argument('-c', '--classfile', type=str, help="Class file of background corpus", action='store', required=True)
    parser.add_argument('-k', '--neighbours', '--neighbors', type=int, help="Maximum number of anagram distances to consider (the actual number of anagrams is likely higher)", action='store', default=3, required=False)
    parser.add_argument('-K', '--candidates', type=int, help="Maximum number of candidates to consider per input token/pattern", action='store', default=100, required=False)
    parser.add_argument('-n', '--topn', type=int, help="Maximum number of candidates to return", action='store', default=10, required=False)
    parser.add_argument('-N', '--ngrams', type=int, help="N-grams to consider (max value of n). Ensure that your background corpus is trained for at least the same length for this to have any effect!", action='store', default=3, required=False)
    parser.add_argument('-D', '--maxld', type=int, help="Maximum Levenshtein distance", action='store', default=5, required=False)
    parser.add_argument('-M', '--maxvd', type=int, help="Maximum vector distance", action='store', default=5, required=False)
    parser.add_argument('-t', '--minfreq', type=int, help="Minimum frequency threshold (occurrence count) in background corpus", action='store', default=1, required=False)
    parser.add_argument('-a', '--alphafreq', type=int, help="Minimum alphabet frequency threshold (occurrence count); characters occurring less often are not considered in the anagram vectors", action='store', default=10, required=False)
    parser.add_argument('-b', '--beamsize', type=int, help="Beam size for the decoder", action='store', default=100, required=False)
    parser.add_argument('--maxdeleteratio', type=float, help="Do not allow a word to lose more than this fraction of its letters", action='store', default=0.34, required=False)
    parser.add_argument('--lexfreq', type=int, help="Artificial frequency (occurrence count) for items in the lexicon that are not in the background corpus", action='store', default=1, required=False)
    parser.add_argument('--ldweight', type=float, help="Levenshtein distance weight for candidate ranking", action='store', default=1, required=False)
    parser.add_argument('--vdweight', type=float, help="Vector distance weight for candidate ranking", action='store', default=1, required=False)
    parser.add_argument('--freqweight', type=float, help="Frequency weight for candidate ranking", action='store', default=1, required=False)
    parser.add_argument('--lexweight', type=float, help="Lexicon distance weight for candidate ranking", action='store', default=1, required=False)
    parser.add_argument('--lmweight', type=float, help="Language Model weight for Language Model selection (together with --correctionweight)", action='store', default=1, required=False)
    parser.add_argument('--correctionweight', type=float, help="Correction Model weight for Language Model selection (together with --lmweight)", action='store', default=1, required=False)
    parser.add_argument('--correctscore', type=float, help="The score a word must reach to be marked correct prior to decoding", action='store', default=0.60, required=False)
    parser.add_argument('--correctfreq', type=float, help="The frequency a word must have for it to be marked correct prior to decoding", action='store', default=200, required=False)
    parser.add_argument('--punctweight', type=int, help="Punctuation character weight for anagram vector representation", action='store', default=1, required=False)
    parser.add_argument('--unkweight', type=int, help="Unknown character weight for anagram vector representation", action='store', default=1, required=False)
    parser.add_argument('--ngramboost', type=float, help="Boost unigram candidates that are also predicted as part of larger ngrams, by the specified factor", action='store', default=0.25, required=False)
    parser.add_argument('-1', '--simpledecoder', action='store_true', help="Use only unigrams in decoding")
    parser.add_argument('--lmwin', action='store_true', help="Boost the scores of the LM selection (to 1.0) just prior to output")
    parser.add_argument('--locallm', action='store_true', help="Use a local LM to select a preferred candidate in each candidate list instead of the LM integrated in the decoder")
    parser.add_argument('--blocksize', type=int, action='store', help="Block size: the number of test tokens to process in one go (dimensions of the anavec test matrix); setting this helps reduce memory at the cost of speed (0 = unlimited)", default=1000)
    parser.add_argument('--report', action='store_true', help="Output a full report")
    parser.add_argument('--json', action='store_true', help="Output JSON")
    parser.add_argument('--tok', action='store_true', help="Input is already tokenized")
    parser.add_argument('--noout', dest='output', action='store_false', help="Do not output")
    parser.add_argument('-d', '--debug', action='store_true')
def PopulateAmCacheTemporalCollaterals(fileName, sqlTweak, DB, collateralDBTableName, reconWindow=3):
    countHostsProcessed = 0
    # Process each occurrence of the FileName
    if sqlTweak == "":
        data = DB.Query("SELECT RowID, HostID, FileName, FirstRun from Entries WHERE EntryType = %s AND FileName = '%s'" % (settings.__AMCACHE__, fileName))
    else:
        data = DB.Query("SELECT RowID, HostID, FileName, FirstRun from Entries_FilePaths WHERE EntryType = %s AND FileName = '%s' AND %s" % (settings.__AMCACHE__, fileName, sqlTweak))
    rowList = []
    countRowsToProcess = len(data)
    countRowsProcessed = 0
    # Executed before
    for row in data:
        rowID = row[0]
        hostID = row[1]
        fileName = row[2]
        firstRun = row[3]
        # Insert entry into DB
        DB.Execute("INSERT INTO " + collateralDBTableName + " VALUES (NULL, %s, 0, 0, 0, 0)" % (rowID))
        # Check recon window
        countRowsProcessed += 1
        update_progress(float(countRowsProcessed) / float(countRowsToProcess), fileName)
        minFirstRun = firstRun - datetime.timedelta(0, 60 * reconWindow)
        maxFirstRun = firstRun + datetime.timedelta(0, 60 * reconWindow)
        reconEntries = DB.Query("SELECT RowID, HostID, FileName, FirstRun FROM Entries WHERE EntryType = %s AND (FirstRun >= '%s' AND FirstRun <= '%s')" % (settings.__AMCACHE__, minFirstRun, maxFirstRun))
        # Filter out incorrect correlations when RowID jumps from one host to the next
        # Weight correlation value according to temporal execution distance
        for entry in reconEntries:
            if entry[1] == hostID and entry[2] != fileName:
                weight = (1.0 / (math.pow(abs(rowID - entry[0]), 2)) * 10)
                if entry[3] < firstRun:
                    rowList.append((int(entry[0]), 1, 0, weight))
                else:
                    rowList.append((int(entry[0]), 0, 1, weight))
    DB.ExecuteMany("INSERT INTO " + collateralDBTableName + " VALUES (NULL, ?, ?, ?, ?, 0)", rowList)
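The correlation weight above decays quadratically with the RowID gap between the two executions. A quick illustration of the resulting values, using hypothetical gaps:

import math

for gap in (1, 2, 5, 10):
    weight = (1.0 / math.pow(gap, 2)) * 10
    print(gap, round(weight, 3))  # 1 -> 10.0, 2 -> 2.5, 5 -> 0.4, 10 -> 0.1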
def distance_to(self, other):
    '''
    Length-adjusted Levenshtein "distance" to other OTU

    other: OTU
      distance to this OTU

    returns: float
    '''
    return Levenshtein.distance(self.sequence, other.sequence) / (0.5 * (len(self.sequence) + len(other.sequence)))
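The length adjustment divides the raw edit distance by the mean of the two sequence lengths, so the result is roughly "edits per base". A minimal sketch with two toy sequences, assuming python-Levenshtein:

import Levenshtein

seq_a = "ACGTACGT"
seq_b = "ACGTTCGT"

raw = Levenshtein.distance(seq_a, seq_b)            # 1 substitution
adjusted = raw / (0.5 * (len(seq_a) + len(seq_b)))  # 1 / 8 = 0.125
print(raw, adjusted)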
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
    '''
    seq_table: pandas.DataFrame
      Samples on the columns; sequences on the rows
    records: index of Bio.Seq
      Indexed, unaligned input sequences. This could come from BioPython's
      SeqIO.to_dict or SeqIO.index.
    max_dist: float
      genetic distance cutoff above which a sequence will not be merged into an OTU
    min_fold: float
      Multiply the sequence's abundance by this fold to get the minimum abundance
      of an OTU for merging
    threshold_pval: float
      P-value below which a sequence will not be merged into an OTU
    log: filehandle
      Log file reporting the abundance, genetic, and distribution checks.
    '''
    self.seq_table = seq_table
    self.records = records
    self.max_dist = max_dist
    self.min_fold = min_fold
    self.threshold_pval = threshold_pval
    self.log = log

    # get a list of the names of the sequences in order of their (decreasing) abundance
    self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False)

    # check that all sequence IDs in the table are in the fasta
    missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records]
    if len(missing_ids) > 0:
        raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))

    # initialize OTU information
    self.membership = {}
    self.otus = []
def ga_matches(self, candidate):
    '''
    OTUs that meet the genetic and abundance criteria

    candidate: OTU
      sequence to evaluate
    '''
    # find abundance matches
    min_abundance = self.min_fold * candidate.abundance
    abundance_matches = [otu for otu in self.otus if otu.abundance > min_abundance]

    if self.log is not None:
        print(candidate.name, 'abundance_check', *[otu.name for otu in abundance_matches], sep='\t', file=self.log)

    if len(abundance_matches) == 0:
        return []
    else:
        # find genetic matches (in order of increasing genetic distance)
        matches_distances = [(otu.distance_to(candidate), otu) for otu in abundance_matches]
        matches_distances.sort(key=lambda x: (x[0], -x[1].abundance, x[1].name))
        matches = [otu for dist, otu in matches_distances if dist < self.max_dist]

        if self.log is not None:
            print(candidate.name, 'genetic_check', *[otu.name for otu in matches], sep='\t', file=self.log)

        return matches
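The sort key above orders matches by distance and breaks ties by preferring the more abundant OTU, then the lexicographically smaller name. A minimal sketch with hypothetical (distance, abundance, name) tuples standing in for OTU objects:

# (distance, abundance, name) for three hypothetical OTUs
candidates = [(0.02, 500, 'otu2'), (0.02, 900, 'otu3'), (0.01, 100, 'otu1')]
candidates.sort(key=lambda x: (x[0], -x[1], x[2]))
print([name for _, _, name in candidates])  # -> ['otu1', 'otu3', 'otu2']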
def call_otus(seq_table_fh, fasta_fh, output_fh, dist_crit, abund_crit, pval_crit, log=None, membership=None):
    '''
    Read in input files, call OTUs, and return output.

    seq_table_fh: filehandle
      sequence count table
    fasta_fh: filehandle or filename
      sequences fasta
    output_fh: filehandle
      place to write main output OTU table
    dist_crit, abund_crit, pval_crit: float
      threshold values for distance, abundance, and p-value
    log, membership: filehandles
      places to write supplementary output
    '''
    # read in the sequence table
    seq_table = read_sequence_table(seq_table_fh)

    # set up the input fasta records
    records = SeqIO.index(fasta_fh, 'fasta')

    # generate the caller object
    caller = DBCaller(seq_table, records, dist_crit, abund_crit, pval_crit, log)
    caller.generate_otu_table()

    caller.write_otu_table(output_fh)

    if membership is not None:
        caller.write_membership(membership)
edit_distance.py (project: tensorflow_end2end_speech_recognition, author: hirofumi0810)
def compute_cer(str_pred, str_true, normalize=True):
    """Compute Character Error Rate.
    Args:
        str_pred (string): a sentence without spaces
        str_true (string): a sentence without spaces
        normalize (bool, optional): if True, divide by the length of str_true
    Returns:
        cer (float): Character Error Rate between str_true and str_pred
    """
    # Assumes `import Levenshtein as lev` at module level.
    cer = lev.distance(str_pred, str_true)
    if normalize:
        cer /= len(str_true)
    return cer
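A minimal usage sketch of the normalized CER, assuming python-Levenshtein is imported as lev:

import Levenshtein as lev

ref = "hello"
hyp = "helo"
print(lev.distance(hyp, ref) / len(ref))  # -> 0.2 (one deletion over five characters)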
def _execute(self, str1, str2):
    LDAlgorithm._execute(self, str1, str2)
    return levenshtein_distance(str1, str2)
def get_ratio(old, new):
    """Return a "similarity ratio" (in percent) between the two strings,
    where 0 means the strings are equal and larger values mean they are
    less similar.
    """
    if not all([old, new]):
        return VERSIONING_RATIO
    if IS_SPEEDUP:
        return Levenshtein.distance(old, new) / (len(old) / 100.0)
    else:
        return levenshtein_distance(old, new) / (len(old) / 100.0)
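A sketch of what this ratio looks like in practice. `IS_SPEEDUP` presumably selects the C-backed python-Levenshtein implementation over a pure-Python fallback; the strings below are just illustrative:

import Levenshtein

old = "The quick brown fox"
new = "The quick brown dog"

# edit distance normalized by 1% of the old string's length
ratio = Levenshtein.distance(old, new) / (len(old) / 100.0)
print(ratio)  # -> ~10.5 (2 edits against a 19-character string)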
def testDefaultParseValueFuzz(self, value):
    try:
        result = parser.DefaultParseValue(value)
    except TypeError:
        # It's OK to get a TypeError if the string has the null character.
        if u'\x00' in value:
            return
        raise
    except MemoryError:
        if len(value) > 100:
            # This is not what we're testing.
            return
        raise

    try:
        uvalue = unicode(value)
        uresult = unicode(result)
    except UnicodeDecodeError:
        # This is not what we're testing.
        return

    # Check that the parsed value doesn't differ too much from the input.
    distance = Levenshtein.distance(uresult, uvalue)
    max_distance = (
        2 +  # Quotes or parenthesis can be implicit.
        sum(c.isspace() for c in value) +
        value.count('"') + value.count("'") +
        3 * (value.count(',') + 1) +  # 'a,' can expand to "'a', "
        3 * (value.count(':')) +  # 'a:' can expand to "'a': "
        2 * value.count('\\'))
    if '#' in value:
        max_distance += len(value) - value.index('#')
    if not isinstance(result, six.string_types):
        max_distance += value.count('0')  # Leading 0s are stripped.

    # Note: We don't check distance for dicts since item order can be changed.
    if '{' not in value:
        self.assertLessEqual(distance, max_distance,
                             (distance, max_distance, uvalue, uresult))
def closest_token(stemmed_token_lst, merchant_info):
    score = 0
    merchant_tokens = merchant_info.split()  # only split works in merchant_info
    for t in stemmed_token_lst:
        min_dist = sys.maxint
        for m in merchant_tokens:
            tmp_dist = distance(t, m)
            if min_dist > tmp_dist:
                min_dist = tmp_dist
        score += min_dist
    return score
def closest_token(stemmed_token_lst, merchant_info):
    score = 0
    merchant_tokens = [stemmer.stem(m) for m in merchant_info.split()]  # stem merchant tokens here
    for t in stemmed_token_lst:
        min_dist = sys.maxint
        for m in merchant_tokens:
            tmp_dist = distance(t, m)
            if min_dist > tmp_dist:
                min_dist = tmp_dist
        score += min_dist
    return score
def closest_token(stemmed_token_lst, merchant_info):
    min_dist = sys.maxint  # only use the min_dist over all pairs as the score
    merchant_tokens = [stemmer.stem(m) for m in merchant_info.split()]  # stem merchant tokens here
    for t in stemmed_token_lst:
        for m in merchant_tokens:
            tmp_dist = distance(t, m)
            if min_dist > tmp_dist:
                min_dist = tmp_dist
    return min_dist
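A self-contained sketch of the third variant above. It assumes `from Levenshtein import distance`, uses Python 3's `sys.maxsize` in place of the Python 2 `sys.maxint` from the snippets, and substitutes a trivial hypothetical stand-in for the stemmer:

import sys
from Levenshtein import distance

def stem(word):
    # hypothetical stand-in for a real stemmer (e.g. from NLTK)
    return word.lower().rstrip('s')

def closest_token(stemmed_token_lst, merchant_info):
    min_dist = sys.maxsize
    merchant_tokens = [stem(m) for m in merchant_info.split()]
    for t in stemmed_token_lst:
        for m in merchant_tokens:
            tmp_dist = distance(t, m)
            if min_dist > tmp_dist:
                min_dist = tmp_dist
    return min_dist

print(closest_token(["coffee", "shop"], "Joe's Coffee Shops"))  # -> 0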
def hamming_distance(string1, string2):
    """
    Computes the Hamming distance between two strings.

    The Hamming distance between two strings of equal length is the number of positions at which the corresponding
    symbols are different. In other words, it measures the minimum number of substitutions required to change
    one string into the other, or the minimum number of errors that could have transformed one string into the other.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Hamming distance (int)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.
        ValueError : If the input strings are not of same length

    Examples:
        >>> hamming_distance('', '')
        0
        >>> hamming_distance('alex', 'john')
        4
        >>> hamming_distance(' ', 'a')
        1
        >>> hamming_distance('JOHN', 'john')
        4
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.tok_check_for_string_input(string1, string2)

    # for Hamming distance, the string lengths should be the same
    utils.sim_check_for_same_len(string1, string2)

    # sum all the mismatched characters at the corresponding indices of
    # the input strings
    return sum(bool(ord(c1) - ord(c2)) for c1, c2 in zip(string1, string2))
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string
    is carried out using a sequence of the following operators: delete a character, insert a character, and
    substitute one character for another.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6

    Note:
        This implementation internally uses the python-Levenshtein package to compute the Levenshtein distance.
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # using Levenshtein library
    return Levenshtein.distance(string1, string2)
count_well_duplicates.py (project: well_duplicates, author: EdinburghGenomics)
def get_edit_distance(str1, str2):
    return Levenshtein.distance(str1, str2)
def __get_fuzzy_string_dict(index=0, current_text='', return_text='', uid=0):
    """
    Returns a dictionary with index, distance, text and statement_uid as keys

    :param index: int
    :param current_text: string
    :param return_text: string
    :param uid: int
    :return: dict()
    """
    return {'index': index,
            'distance': get_distance(current_text.lower(), return_text.lower()),
            'text': return_text,
            'statement_uid': uid}