Python distance() examples: source code from open-source projects

decoder.py (project: ngraph, author: NervanaSystems)
def cer(self, s1, s2):
        """
        Computes the Character Error Rate, defined as the edit distance.

        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """
        return Lev.distance(s1, s2)
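
A minimal sketch of the call above, assuming Lev is the python-Levenshtein package (the snippet does not show its imports):

import Levenshtein as Lev

print(Lev.distance('flaw', 'lawn'))  # 2: delete 'f', append 'n'
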
identify.py (project: Library-Identification, author: Riscure)
def compare_strings_concat_levenshtein(sample, ref):
    """
    Concatenates all strings from `sample` into one, and all strings
    from `ref` into another. They are then compared by their Levenshtein distance.
    This results in a fuzzy comparison: it detects changes within strings and
    within the list of strings.
    """
    if hasattr(ref, 'strs') and ref.strs is not None:
        i = 0
        ratios = 0
        for section in ref.strs:
            if section not in sample.strs:
                continue

            strs_a_concat = ''.join(sample.strs[section])
            strs_b_concat = ''.join(ref.strs[section])

            if len(strs_a_concat) == 0 or len(strs_b_concat) == 0:
                continue

            # Similarity measurement from
            # Gheorghescu, M. (2005). An Automated Virus Classification System.
            # Virus Bulletin Conference, (October), 294-300.
            # (although they use it on a list of basic blocks instead of a
            # character string)

            ratio_sec = 1 - (Levenshtein.distance(strs_a_concat, strs_b_concat)
                            / float(max(len(strs_a_concat), len(strs_b_concat))))

            ratios += ratio_sec
            i += 1

        ratio = ratios / i if i > 0 else 0.0
    else:
        ratio = 0.0

    return (ratio * 100, ref.name, ref.version)
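
A standalone sketch of the Gheorghescu similarity ratio used above: ratio = 1 - distance / max(len_a, len_b), so identical strings score 1.0 and completely different strings approach 0.

import Levenshtein

a, b = 'kitten', 'sitting'
d = Levenshtein.distance(a, b)              # 3 edits
ratio = 1 - d / float(max(len(a), len(b)))  # 1 - 3/7, about 0.571
print(d, round(ratio, 3))
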
identify.py (project: Library-Identification, author: Riscure)
def compare_cc_list_levenshtein(sample, ref):
    """
    Compares the cyclomatic complexity values of all functions in `sample`
    with those of all functions in `ref`, by taking the Levenshtein distance
    between these lists. This detects added/removed functions and functions
    that have changed in complexity between a sample and a reference.
    """
    if hasattr(ref, 'cclist') and ref.cclist is not None:
        ratio = 1 - (editdistance.eval(sample.cclist, ref.cclist)
                    / float(max(len(sample.cclist), len(ref.cclist))))
    else:
        ratio = 0.0

    return (ratio * 100, ref.name, ref.version)
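
The editdistance package's eval function accepts arbitrary sequences of hashables, which is what lets the function above compare lists of cyclomatic complexity values directly. A small sketch:

import editdistance

cc_sample = [1, 3, 2, 7]
cc_ref = [1, 3, 4, 7, 2]
d = editdistance.eval(cc_sample, cc_ref)  # 2: one substitution, one insertion
print(1 - d / float(max(len(cc_sample), len(cc_ref))))  # 0.6
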
anavec.py (project: anavec, author: proycon)
def setup_argparser(parser):
    parser.add_argument('-m','--patternmodel', type=str,help="Pattern model of a background corpus (training data; Colibri Core unindexed patternmodel)", action='store',required=True)
    parser.add_argument('-l','--lexicon', type=str,help="Lexicon file (training data; plain text, one word per line)", action='store',required=False)
    parser.add_argument('-L','--lm', type=str,help="Language model file in ARPA format", action='store',required=False)
    parser.add_argument('-c','--classfile', type=str,help="Class file of background corpus", action='store',required=True)
    parser.add_argument('-k','--neighbours','--neighbors', type=int,help="Maximum number of anagram distances to consider (the actual number of anagrams is likely higher)", action='store',default=3, required=False)
    parser.add_argument('-K','--candidates', type=int,help="Maximum number of candidates to consider per input token/pattern", action='store',default=100, required=False)
    parser.add_argument('-n','--topn', type=int,help="Maximum number of candidates to return", action='store',default=10,required=False)
    parser.add_argument('-N','--ngrams', type=int,help="N-grams to consider (max value of n). Ensure that your background corpus is trained for at least the same length for this to have any effect!", action='store',default=3,required=False)
    parser.add_argument('-D','--maxld', type=int,help="Maximum Levenshtein distance", action='store',default=5,required=False)
    parser.add_argument('-M','--maxvd', type=int,help="Maximum vector distance", action='store',default=5,required=False)
    parser.add_argument('-t','--minfreq', type=int,help="Minimum frequency threshold (occurrence count) in background corpus", action='store',default=1,required=False)
    parser.add_argument('-a','--alphafreq', type=int,help="Minimum alphabet frequency threshold (occurrence count); characters occurring less are not considered in the anagram vectors", action='store',default=10,required=False)
    parser.add_argument('-b','--beamsize', type=int,help="Beamsize for the decoder", action='store',default=100,required=False)
    parser.add_argument('--maxdeleteratio', type=float,help="Do not allow a word to lose more than this fraction of its letters", action='store',default=0.34,required=False)
    parser.add_argument('--lexfreq', type=int,help="Artificial frequency (occurrence count) for items in the lexicon that are not in the background corpus", action='store',default=1,required=False)
    parser.add_argument('--ldweight', type=float,help="Levenshtein distance weight for candidate ranking", action='store',default=1,required=False)
    parser.add_argument('--vdweight', type=float,help="Vector distance weight for candidate ranking", action='store',default=1,required=False)
    parser.add_argument('--freqweight', type=float,help="Frequency weight for candidate ranking", action='store',default=1,required=False)
    parser.add_argument('--lexweight', type=float,help="Lexicon distance weight for candidate ranking", action='store',default=1,required=False)
    parser.add_argument('--lmweight', type=float,help="Language Model weight for Language Model selection (together with --correctionweight)", action='store',default=1,required=False)
    parser.add_argument('--correctionweight', type=float,help="Correction Model weight for Language Model selection (together with --lmweight)", action='store',default=1,required=False)
    parser.add_argument('--correctscore', type=float,help="The score a word must reach to be marked correct prior to decoding", action='store',default=0.60,required=False)
    parser.add_argument('--correctfreq', type=float,help="The frequency a word must have for it to be marked correct prior to decoding",action='store',default=200,required=False)
    parser.add_argument('--punctweight', type=int,help="Punctuation character weight for anagram vector representation", action='store',default=1,required=False)
    parser.add_argument('--unkweight', type=int,help="Unknown character weight for anagram vector representation", action='store',default=1,required=False)
    parser.add_argument('--ngramboost',type=float, help="Boost unigram candidates that are also predicted as part of larger ngrams, by the specified factor",action='store', default=0.25,required=False)
    parser.add_argument('-1','--simpledecoder',action='store_true', help="Use only unigrams in decoding")
    parser.add_argument('--lmwin',action='store_true', help="Boost the scores of the LM selection (to 1.0) just prior to output")
    parser.add_argument('--locallm',action='store_true', help="Use a local LM to select a preferred candidate in each candidate list instead of the LM integrated in the decoder")
    parser.add_argument('--blocksize',type=int, action='store', help="Block size: determines the number of test tokens to process in one go (dimensions of the anavec test matrix); setting this helps reduce memory at the cost of speed (0 = unlimited)",default=1000)
    parser.add_argument('--report',action='store_true', help="Output a full report")
    parser.add_argument('--json',action='store_true', help="Output JSON")
    parser.add_argument('--tok',action='store_true', help="Input is already tokenized")
    parser.add_argument('--noout',dest='output',action='store_false', help="Do not output")
    parser.add_argument('-d', '--debug',action='store_true')
AppCompatProcessor.py (project: appcompatprocessor, author: mbevilacqua)
def PopulateAmCacheTemporalCollaterals(fileName, sqlTweak, DB, collateralDBTableName, reconWindow=3):
    countHostsProcessed = 0
    # Process each occurrence of the FileName
    if sqlTweak == "":
        data = DB.Query("SELECT RowID, HostID, FileName, FirstRun from Entries WHERE EntryType = %s AND FileName = '%s'" % (settings.__AMCACHE__, fileName))
    else:
        data = DB.Query("SELECT RowID, HostID, FileName, FirstRun from Entries_FilePaths WHERE EntryType = %s AND FileName = '%s' AND %s" % (settings.__AMCACHE__, fileName, sqlTweak))

    rowList = []
    countRowsToProcess = len(data)
    countRowsProcessed = 0
    # Executed before
    for row in data:
        rowID = row[0]
        hostID = row[1]
        fileName = row[2]
        firstRun = row[3]
        # Insert entry into DB
        DB.Execute("INSERT INTO " + collateralDBTableName + " VALUES (NULL,%s, 0, 0, 0, 0)" % (rowID))

        # Check recon window
        countRowsProcessed += 1
        update_progress(float(countRowsProcessed) / float(countRowsToProcess), fileName)
        minFirstRun = firstRun - datetime.timedelta(0,60 * reconWindow)
        maxFirstRun = firstRun + datetime.timedelta(0,60 * reconWindow)
        reconEntries = DB.Query("SELECT RowID, HostID, FileName, FirstRun FROM Entries WHERE EntryType = %s AND (FirstRun >= '%s' AND FirstRun <= '%s')" % (settings.__AMCACHE__, minFirstRun, maxFirstRun))
        # Filter out incorrect correlations when RowID jumps from one host to the next
        # Weight correlation value according to temporal execution distance
        for entry in reconEntries:
            if  entry[1] == hostID and entry[2] != fileName:
                weight = (1.0 / math.pow(abs(rowID - entry[0]), 2)) * 10
                if entry[3] < firstRun:
                    rowList.append(tuple((int(entry[0]), 1, 0, weight)))
                else:
                    rowList.append(tuple((int(entry[0]), 0, 1, weight)))
    DB.ExecuteMany("INSERT INTO " + collateralDBTableName + " VALUES (NULL,?, ?, ?, ?, 0)", rowList)
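
A sketch of the inverse-square weighting above: entries whose RowID (a proxy for temporal execution order) lies close to the anchor entry contribute far more than distant ones.

import math

for delta in (1, 2, 5):
    print(delta, (1.0 / math.pow(delta, 2)) * 10)  # 1 -> 10.0, 2 -> 2.5, 5 -> 0.4
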
dbotu.py (project: amplicon_sequencing_pipeline, author: thomasgurry)
def distance_to(self, other):
        '''
        Length-adjusted Levenshtein "distance" to other OTU

        other: OTU
          distance to this OTU

        returns: float
        '''
        return Levenshtein.distance(self.sequence, other.sequence) / (0.5 * (len(self.sequence) + len(other.sequence)))
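
A standalone sketch of the length adjustment above: the raw edit distance is divided by the mean of the two sequence lengths, making values comparable across sequences of different sizes.

import Levenshtein

seq_a, seq_b = 'ACGTACGT', 'ACGTTCGT'
d = Levenshtein.distance(seq_a, seq_b)        # 1 substitution
print(d / (0.5 * (len(seq_a) + len(seq_b))))  # 1 / 8 = 0.125
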
dbotu.py (project: amplicon_sequencing_pipeline, author: thomasgurry)
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
        '''
        seq_table: pandas.DataFrame
          Samples on the columns; sequences on the rows
        records: index of Bio.Seq
          Indexed, unaligned input sequences. This could come from BioPython's
          SeqIO.to_dict or SeqIO.index.
        max_dist: float
          genetic distance cutoff above which a sequence will not be merged into an OTU
        min_fold: float
          Multiply the sequence's abundance by this fold to get the minimum abundance
          of an OTU for merging
        threshold_pval: float
          P-value below which a sequence will not be merged into an OTU
        log: filehandle
          Log file reporting the abundance, genetic, and distribution checks.
        '''
        self.seq_table = seq_table
        self.records = records
        self.max_dist = max_dist
        self.min_fold = min_fold
        self.threshold_pval = threshold_pval
        self.log = log

        # get a list of the names of the sequences in order of their (decreasing) abundance
        self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False)

        # check that all sequence IDs in the table are in the fasta
        missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records]
        if len(missing_ids) > 0:
            raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))

        # initialize OTU information
        self.membership = {}
        self.otus = []
dbotu.py (project: amplicon_sequencing_pipeline, author: thomasgurry)
def ga_matches(self, candidate):
        '''
        OTUs that meet the genetic and abundance criteria

        candidate: OTU
          sequence to evaluate
        '''

        # find abundance matches
        min_abundance = self.min_fold * candidate.abundance
        abundance_matches = [otu for otu in self.otus if otu.abundance > min_abundance]

        if self.log is not None:
            print(candidate.name, 'abundance_check', *[otu.name for otu in abundance_matches], sep='\t', file=self.log)

        if len(abundance_matches) == 0:
            return []
        else:
            # find genetic matches (in order of increasing genetic distance)
            matches_distances = [(otu.distance_to(candidate), otu) for otu in abundance_matches]
            matches_distances.sort(key=lambda x: (x[0], -x[1].abundance, x[1].name))
            matches = [otu for dist, otu in matches_distances if dist < self.max_dist]

            if self.log is not None:
                print(candidate.name, 'genetic_check', *[otu.name for otu in matches], sep='\t', file=self.log)

            return matches
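
A sketch of the sort key above, using a hypothetical minimal OTU stand-in: candidates are ordered by increasing distance, with ties broken by higher abundance and then by name, so the ordering is deterministic.

from collections import namedtuple

OTU = namedtuple('OTU', 'name abundance')
pairs = [(0.02, OTU('B', 50)), (0.01, OTU('C', 10)), (0.02, OTU('A', 90))]
pairs.sort(key=lambda x: (x[0], -x[1].abundance, x[1].name))
print([otu.name for _, otu in pairs])  # ['C', 'A', 'B']
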
dbotu.py (project: amplicon_sequencing_pipeline, author: thomasgurry)
def call_otus(seq_table_fh, fasta_fh, output_fh, dist_crit, abund_crit, pval_crit, log=None, membership=None):
    '''
    Read in input files, call OTUs, and return output.

    seq_table_fh: filehandle
      sequence count table
    fasta_fh: filehandle or filename
      sequences fasta
    output_fh: filehandle
      place to write main output OTU table
    dist_crit, abund_crit, pval_crit: float
      threshold values for distance, abundance, and pvalue
    log, membership: filehandles
      places to write supplementary output
    '''

    # read in the sequences table
    seq_table = read_sequence_table(seq_table_fh)

    # set up the input fasta records
    records = SeqIO.index(fasta_fh, 'fasta')

    # generate the caller object
    caller = DBCaller(seq_table, records, dist_crit, abund_crit, pval_crit, log)
    caller.generate_otu_table()
    caller.write_otu_table(output_fh)

    if membership is not None:
        caller.write_membership(membership)
edit_distance.py (project: tensorflow_end2end_speech_recognition, author: hirofumi0810)
def compute_cer(str_pred, str_true, normalize=True):
    """Compute Character Error Rate.
    Args:
        str_pred (string): a sentence without spaces
        str_true (string): a sentence without spaces
        normalize (bool, optional): if True, divide by the length of str_true
    Returns:
        cer (float): Character Error Rate between str_true and str_pred
    """
    cer = lev.distance(str_pred, str_true)
    if normalize:
        cer /= len(str_true)
    return cer
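
A sketch of the normalization above, assuming lev is the python-Levenshtein package: dividing the raw edit count by the reference length turns it into a rate.

import Levenshtein as lev

pred, true = 'helo', 'hello'
raw = lev.distance(pred, true)  # 1: one missing 'l'
print(raw, raw / len(true))     # 1 0.2
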
distance.py (project: leven-squash, author: dwcoates)
def _execute(self, str1, str2):
        LDAlgorithm._execute(self, str1, str2)
        return levenshtein_distance(str1, str2)
versioning.py (project: chalktalk_docs, author: loremIpsum1771)
def get_ratio(old, new):
    """Return a "similiarity ratio" (in percent) representing the similarity
    between the two strings where 0 is equal and anything above less than equal.
    """
    if not all([old, new]):
        return VERSIONING_RATIO

    if IS_SPEEDUP:
        return Levenshtein.distance(old, new) / (len(old) / 100.0)
    else:
        return levenshtein_distance(old, new) / (len(old) / 100.0)
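
A worked example of the speedup branch above: the distance is scaled to edits per 100 characters of the old string, so 0 means identical and larger values mean more changed.

import Levenshtein

old = 'a' * 50
new = 'a' * 49 + 'b'
print(Levenshtein.distance(old, new) / (len(old) / 100.0))  # 1 / 0.5 = 2.0
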
parser_fuzz_test.py (project: python-fire, author: google)
def testDefaultParseValueFuzz(self, value):
    try:
      result = parser.DefaultParseValue(value)
    except TypeError:
      # It's OK to get a TypeError if the string has the null character.
      if u'\x00' in value:
        return
      raise
    except MemoryError:
      if len(value) > 100:
        # This is not what we're testing.
        return
      raise

    try:
      uvalue = six.text_type(value)
      uresult = six.text_type(result)
    except UnicodeDecodeError:
      # This is not what we're testing.
      return

    # Check that the parsed value doesn't differ too much from the input.
    distance = Levenshtein.distance(uresult, uvalue)
    max_distance = (
        2 +  # Quotes or parentheses can be implicit.
        sum(c.isspace() for c in value) +
        value.count('"') + value.count("'") +
        3 * (value.count(',') + 1) +  # 'a,' can expand to "'a', "
        3 * (value.count(':')) +  # 'a:' can expand to "'a': "
        2 * value.count('\\'))
    if '#' in value:
      max_distance += len(value) - value.index('#')

    if not isinstance(result, six.string_types):
      max_distance += value.count('0')  # Leading 0s are stripped.

    # Note: We don't check distance for dicts since item order can be changed.
    if '{' not in value:
      self.assertLessEqual(distance, max_distance,
                           (distance, max_distance, uvalue, uresult))
short_context_search.py (project: Hanhan_NLP, author: hanhanwu)
def closest_token(stemmed_token_lst, merchant_info):
  score = 0
  merchant_tokens = merchant_info.split()  # merchant_info is only split here, not stemmed
  for t in stemmed_token_lst:
    min_dist = float('inf')  # no upper bound (sys.maxint exists only in Python 2)
    for m in merchant_tokens:
      tmp_dist = distance(t, m)
      if min_dist > tmp_dist:
        min_dist = tmp_dist
    score += min_dist
  return score
short_context_search.py (project: Hanhan_NLP, author: hanhanwu)
def closest_token(stemmed_token_lst, merchant_info):
  score = 0
  merchant_tokens = [stemmer.stem(m) for m in merchant_info.split()]  # stem merchant tokens here
  for t in stemmed_token_lst:
    min_dist = float('inf')  # no upper bound (sys.maxint exists only in Python 2)
    for m in merchant_tokens:
      tmp_dist = distance(t, m)
      if min_dist > tmp_dist:
        min_dist = tmp_dist
    score += min_dist
  return score
short_context_search.py (project: Hanhan_NLP, author: hanhanwu)
def closest_token(stemmed_token_lst, merchant_info):
  min_dist = float('inf')  # the single overall minimum distance serves as the score
  merchant_tokens = [stemmer.stem(m) for m in merchant_info.split()]  # stem merchant tokens here
  for t in stemmed_token_lst:
    for m in merchant_tokens:
      tmp_dist = distance(t, m)
      if min_dist > tmp_dist:
        min_dist = tmp_dist
  return min_dist
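
A condensed Python 3 sketch of the matching loop above, with Levenshtein.distance standing in for the unqualified distance function (an assumption; the snippets' imports are not shown) and stemming omitted for brevity:

from Levenshtein import distance

def closest_token(token_lst, merchant_info):
    merchant_tokens = merchant_info.lower().split()
    return min(distance(t, m) for t in token_lst for m in merchant_tokens)

print(closest_token(['coffee'], 'joe coffee shop'))  # 0: exact token match
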
simfunctions.py (project: py_stringmatching, author: kvpradap)
def hamming_distance(string1, string2):
    """
    Computes the Hamming distance between two strings.

    The Hamming distance between two strings of equal length is the number of positions at which the corresponding
    symbols are different. Put another way, it measures the minimum number of substitutions required to change
    one string into the other, or the minimum number of errors that could have transformed one string into the other.


    Args:
        string1,string2 (str): Input strings

    Returns:
        Hamming distance (int)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.
        ValueError : If the input strings are not of same length


    Examples:
        >>> hamming_distance('', '')
        0
        >>> hamming_distance('alex', 'john')
        4
        >>> hamming_distance(' ', 'a')
        1
        >>> hamming_distance('JOHN', 'john')
        4
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.tok_check_for_string_input(string1, string2)
    # for Hamming Distance string length should be same
    utils.sim_check_for_same_len(string1, string2)
    # sum all the mismatch characters at the corresponding index of
    # input strings
    return sum(bool(ord(c1) - ord(c2)) for c1, c2 in zip(string1, string2))
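
A quick sketch showing that the ord-difference trick above is equivalent to comparing characters directly:

s1, s2 = 'karolin', 'kathrin'
by_ord = sum(bool(ord(c1) - ord(c2)) for c1, c2 in zip(s1, s2))
by_eq = sum(c1 != c2 for c1, c2 in zip(s1, s2))
print(by_ord, by_eq)  # 3 3
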
simfunctions.py (project: py_stringmatching, author: kvpradap)
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string
    is carried out using a sequence of the following operators: delete a character, insert a character, and
    substitute one character for another.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6


    Note:
        This implementation internally uses python-levenshtein package to compute the Levenshtein distance

    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    # using Levenshtein library
    return Levenshtein.distance(string1, string2)
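
A sketch of calling the library directly; python-Levenshtein also exposes Levenshtein.ratio, a normalized similarity, alongside the raw distance the wrapper above returns.

import Levenshtein

a, b = 'levenshtein', 'frankenstein'
print(Levenshtein.distance(a, b))  # 6
print(Levenshtein.ratio(a, b))     # normalized similarity in [0, 1]
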
count_well_duplicates.py (project: well_duplicates, author: EdinburghGenomics)
def get_edit_distance(str1, str2):
    return Levenshtein.distance(str1, str2)
matcher.py (project: dbas, author: hhucn)
def __get_fuzzy_string_dict(index=0, current_text='', return_text='', uid=0):
    """
    Returns dictionary with index, distance, text and statement_uid as keys

    :param index: int
    :param current_text: string
    :param return_text: string
    :param uid: int
    :return: dict()
    """
    return {'index': index,
            'distance': get_distance(current_text.lower(), return_text.lower()),
            'text': return_text,
            'statement_uid': uid}
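
A sketch of the dictionary this helper produces, with a plain Levenshtein distance standing in for the project's get_distance (an assumption; its implementation is not shown here):

import Levenshtein

def get_distance(a, b):
    return Levenshtein.distance(a, b)

print({'index': 0,
       'distance': get_distance('cats', 'cat'),  # 1
       'text': 'cat',
       'statement_uid': 42})
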

