Python distance() usage examples (source code)

decoder.py (project: ngraph, author: NervanaSystems)
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = {ss: ii for ii, ss in enumerate(b)}

        # map the words to a char array (the Levenshtein package only accepts
        # strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2))
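The word-to-character trick above also works outside the class. A minimal standalone sketch, assuming the python-Levenshtein package is installed and imported as Lev:

import Levenshtein as Lev

def word_edit_distance(s1, s2):
    # give every distinct word a unique stand-in character
    vocab = {w: i for i, w in enumerate(set(s1.split() + s2.split()))}
    # encode each sentence as a string of stand-in characters and compare
    c1 = ''.join(chr(vocab[w]) for w in s1.split())
    c2 = ''.join(chr(vocab[w]) for w in s2.split())
    return Lev.distance(c1, c2)

# word_edit_distance("the cat sat", "the cat sat down") returns 1 (one inserted word)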
versioning.py (project: chalktalk_docs, author: loremIpsum1771)
def levenshtein_distance(a, b):
    """Return the Levenshtein edit distance between two strings *a* and *b*."""
    if a == b:
        return 0
    if len(a) < len(b):
        a, b = b, a
    if not a:
        return len(b)
    previous_row = range(len(b) + 1)
    for i, column1 in enumerate(a):
        current_row = [i + 1]
        for j, column2 in enumerate(b):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (column1 != column2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]
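A quick sanity check of the pure-Python implementation above (no third-party package needed):

# classic textbook value: kitten -> sitting takes 3 edits
assert levenshtein_distance("kitten", "sitting") == 3
# insertions only
assert levenshtein_distance("", "abc") == 3
# identical strings hit the early-exit path
assert levenshtein_distance("same", "same") == 0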
arguments.py (project: tpg.now, author: stklik)
def getStopFromString(self, candidate):
        normalizedCandidate = Stop.normalizeStopName(candidate)
        if not Tpg.getTodaysStops():
            return None

        for stop in Tpg.getTodaysStops():
            if candidate.upper() == stop.code:
                return stop

            if normalizedCandidate == stop.normalizedName:
                return stop

        for stop in Tpg.getTodaysStops():
            if normalizedCandidate in stop.normalizedName:
                return stop

        # calculate the Levenshtein distance to all stop names
        codeToLevenshtein = {stop: Levenshtein.distance(
            normalizedCandidate, stop.normalizedName) for stop in Tpg.getTodaysStops()}
        # smallest Levenshtein distance
        minimum = min(codeToLevenshtein, key=codeToLevenshtein.get)
        return minimum
test_rst.py (project: henet, author: AcrDijon)
def test_parse(self):
        for file in os.listdir(SAMPLE_DIR):
            if not file.endswith('.rst'):
                continue
            filename = os.path.join(SAMPLE_DIR, file)
            article = parse_article(filename)
            rendered = article.render().strip()

            with open(filename) as f:
                source = f.read().strip()
                source = source.expandtabs(4).decode('utf8')

            if source != rendered:
                lev_ = distance(source, rendered)
                jaro_ = jaro(source, rendered)

                if lev_ > 10 and jaro_ < 0.8 and file not in MUTATED_FILES:
                    print('%d %f %s' % (lev_, jaro_, filename))
                    raise AssertionError(filename)
edit_distance.py (project: tensorflow_end2end_speech_recognition, author: hirofumi0810)
def compute_edit_distance(session, labels_true_st, labels_pred_st):
    """Compute edit distance per mini-batch.
    Args:
        session:
        labels_true_st: A `SparseTensor` of ground truth
        labels_pred_st: A `SparseTensor` of prediction
    Returns:
        edit_distances: list of edit distances, one per utterance
    """
    indices, values, dense_shape = labels_true_st
    labels_true_pl = tf.SparseTensor(indices, values, dense_shape)
    indices, values, dense_shape = labels_pred_st
    labels_pred_pl = tf.SparseTensor(indices, values, dense_shape)

    edit_op = tf.edit_distance(labels_pred_pl, labels_true_pl, normalize=True)
    edit_distances = session.run(edit_op)

    return edit_distances
edit_distance.py (project: tensorflow_end2end_speech_recognition, author: hirofumi0810)
def compute_per(ref, hyp, normalize=True):
    """Compute Phone Error Rate.
    Args:
        ref (list): phones in the reference transcript
        hyp (list): phones in the predicted transcript
        normalize (bool, optional): if True, divide by the length of ref
    Returns:
        per (float): Phone Error Rate between ref and hyp
    """
    # Build mapping of phone to index
    phone_set = set(ref + hyp)
    phone2char = dict(zip(phone_set, range(len(phone_set))))

    # Map phones to a single char array
    # NOTE: the Levenshtein package only accepts strings
    phones_ref = [chr(phone2char[p]) for p in ref]
    phones_hyp = [chr(phone2char[p]) for p in hyp]

    per = lev.distance(''.join(phones_ref), ''.join(phones_hyp))
    if normalize:
        per /= len(ref)
    return per
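A small usage sketch for compute_per above, assuming the python-Levenshtein package is imported as lev, as the snippet expects:

import Levenshtein as lev

ref = ['hh', 'ah', 'l', 'ow']        # reference phone sequence
hyp = ['hh', 'ah', 'l', 'aa', 'ow']  # hypothesis with one inserted phone

# one insertion against four reference phones gives a PER of 0.25
print(compute_per(ref, hyp))  # 0.25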
deepSpeech_test.py (project: deepSpeech, author: fordDeepDSP)
def inference(predictions_op, true_labels_op, display, sess):
    """ Perform inference per batch on pre-trained model.
    This function performs inference and computes the CER per utterance.
    Args:
        predictions_op: Prediction op
        true_labels_op: True Labels op
        display: print sample predictions if True
        sess: default session to evaluate the ops.
    Returns:
        char_err_rate: list of CER per utterance.
    """
    char_err_rate = []
    # Perform inference on one batch of data at a time.
    [predictions, true_labels] = sess.run([predictions_op,
                                           true_labels_op])
    pred_label = sparse_to_labels(predictions[0][0])
    actual_label = sparse_to_labels(true_labels)
    for (label, pred) in zip(actual_label, pred_label):
        char_err_rate.append(distance(label, pred)/len(label))

    if display:
        # Print sample responses
        for i in range(ARGS.batch_size):
            print(actual_label[i] + ' vs ' + pred_label[i])
    return char_err_rate
matcher.py (project: dbas, author: hhucn)
def get_strings_for_search(value):
    """
    Returns all statements whose text contains the given value as a substring

    :param value: String
    :return: dict() with Statements.uid as key and 'text', 'distance' as well as 'arguments' as values
    """
    tmp_dict = OrderedDict()
    db_statements = get_not_disabled_statement_as_query().join(TextVersion, Statement.textversion_uid == TextVersion.uid).all()
    for stat in db_statements:
        if value.lower() in stat.textversions.content.lower():
            # get distance between input value and saved value
            rd = __get_fuzzy_string_dict(current_text=value, return_text=stat.textversions.content, uid=stat.uid)
            tmp_dict[str(stat.uid)] = rd

    tmp_dict = __sort_dict(tmp_dict)
    return_index = list(islice(tmp_dict, list_length))
    return_dict = OrderedDict()
    for index in return_index:
        return_dict[index] = tmp_dict[index]
    return return_dict
matcher.py (project: dbas, author: hhucn)
def get_strings_for_public_nickname(value, nickname):
    """
    Returns dictionaries with public nicknames of users whose nickname contains the value

    :param value: String
    :param nickname: current user's nickname
    :return: dict()
    """
    db_user = DBDiscussionSession.query(User).filter(func.lower(User.public_nickname).contains(func.lower(value)),
                                                     ~User.public_nickname.in_([nickname, 'admin', nick_of_anonymous_user])).all()
    return_array = []

    for index, user in enumerate(db_user):
        dist = get_distance(value, user.public_nickname)
        return_array.append({'index': index,
                             'distance': dist,
                             'text': user.public_nickname,
                             'avatar': get_public_profile_picture(user)})

    return_array = __sort_array(return_array)
    return return_array[:list_length]
matcher.py (project: dbas, author: hhucn)
def __sort_array(array):
    """
    Returns the array sorted by its 'distance' values

    :param array: Array
    :return: Array
    """
    return_list = []
    newlist = sorted(array, key=lambda k: k['distance'])

    if mechanism == 'SequenceMatcher':  # sort descending
        newlist = reversed(newlist)

    # add index
    for index, entry in enumerate(newlist):
        entry['index'] = index
        return_list.append(entry)

    return return_list
matcher.py (project: dbas, author: hhucn)
def __sort_dict(dictionary):
    """
    Returns a sorted copy of the dictionary, based on the distance

    :param dictionary: dict()
    :return: dict()
    """
    dictionary = OrderedDict(sorted(dictionary.items()))
    if mechanism == 'SequenceMatcher':  # sort descending
        return_dict = OrderedDict(sorted(dictionary.items(), key=lambda kv: kv[0], reverse=True))
    else:  # sort ascending
        return_dict = OrderedDict()
        for i in list(dictionary.keys())[0:return_count]:
            return_dict[i] = dictionary[i]
    return return_dict
MachineUtils.py (project: ph0neutria, author: phage-nz)
def getSignificantItems(item_list):
    tokenised_list = []

    logging.info('Tokenising input data.')
    for item in item_list:
        tokenised_list.append(tokeniseUrl(item))

    items = np.asarray(item_list)
    tokenised_items = np.asarray(tokenised_list)
    logging.info('Calculating Levenshtein distances between items.')
    lev_similarity = -1*np.array([[Levenshtein.distance(i1,i2) for i1 in tokenised_items] for i2 in tokenised_items])

    logging.info('Applying affinity propagation to data.')
    aff_prop = sklearn.cluster.AffinityPropagation(affinity='precomputed', damping=0.7)
    aff_prop.fit(lev_similarity)

    logging.info('Completed! Assembling list.')
    output_list = []

    for cluster_id in np.unique(aff_prop.labels_):
        exemplar = items[aff_prop.cluster_centers_indices_[cluster_id]]
        output_list.append(exemplar)

    return output_list
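The clustering idea above, reduced to a self-contained toy sketch; it assumes python-Levenshtein, NumPy and scikit-learn are available and skips the URL tokenisation step:

import numpy as np
import sklearn.cluster
import Levenshtein

strings = np.asarray(['example.com/a', 'example.com/b', 'example.com/c',
                      'test.org/x', 'test.org/y', 'test.org/z'])
# negated pairwise Levenshtein distances serve as a precomputed similarity matrix
similarity = -1 * np.array([[Levenshtein.distance(a, b) for a in strings] for b in strings])

aff_prop = sklearn.cluster.AffinityPropagation(affinity='precomputed', damping=0.7)
aff_prop.fit(similarity)

# keep one representative (exemplar) string per cluster
exemplars = [strings[aff_prop.cluster_centers_indices_[c]] for c in np.unique(aff_prop.labels_)]
print(exemplars)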
decoder.py (project: deepspeech.pytorch, author: SeanNaren)
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only accepts
        # strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2))
merge_ocr_subtitle.py (project: video_subtitle_extract, author: thewintersun)
def maybe_same(str1, str2):
  '''Heuristically decide whether two OCR'd subtitle strings are likely the same line.'''
  # make str1 the shorter of the two
  if len(str1) > len(str2):
    str1, str2 = str2, str1

  # if the lengths differ too much, treat the strings as different
  if float(len(str2)) / len(str1) > 2 and len(str1) >= 4:
    return False

  # otherwise decide by edit distance, with thresholds scaled to string length
  distance = Levenshtein.distance(str1, str2)
  if distance <= 3 and len(str1) >= 10:
    return True
  if distance <= 4 and len(str1) >= 13:
    return True
  if distance <= 1 and len(str1) >= 5:
    return True
  if distance > 2 and len(str1) <= 6:
    return False
  if distance > 3:
    return False

  return True
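Two quick calls illustrating the thresholds above, assuming the python-Levenshtein package is imported as Levenshtein:

# two OCR readings of the same subtitle line, one character apart
print(maybe_same('hello world out there', 'hello world out thare'))  # True
# clearly different short strings
print(maybe_same('abc', 'xyz'))  # False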
knnClassifier.py (project: flexmatcher, author: biggorilla-gh)
def find_knn(self, train_strings, train_labels, test_strings):
        """Find 3 nearest neighbors of each item in test_strings in
        train_strings and report their labels as the prediction.

        Args:
            train_strings (ndarray): Numpy array with strings in training set
            train_labels (ndarray): Numpy array with labels of train_strings
            test_strings (ndarray): Numpy array with strings to predict for

        Returns:
            ndarray: per-class vote fractions from the 3 nearest neighbors
        """
        prediction = np.zeros((len(test_strings), self.num_classes))
        for i in range(len(test_strings)):
            a_str = test_strings[i]
            dists = np.array([0] * len(train_strings))
            for j in range(len(train_strings)):
                b_str = train_strings[j]
                dists[j] = lev.distance(a_str, b_str)
            # finding the top 3
            top3 = dists.argsort()[:3]
            for ind in top3:
                prediction[i][self.column_index[train_labels[ind]]] += 1.0 / 3
        return prediction
pygi.py (project: pygi, author: onlined)
def gitignores(*args):
    to_send = []
    # Assumption: the available template names come from the gitignore.io
    # "list" endpoint; the original snippet initialised this list empty.
    gitignore_list = _get_text_from_url('{}/list'.format(API_URL)).replace('\n', ',').split(',')
    for arg in set(args):
        if arg in gitignore_list:
            to_send.append(arg)
        elif __name__ == '__main__':
            possibles = []
            for gitignore in gitignore_list:
                if Levenshtein.distance(gitignore, arg) == 1:
                    possibles.append(gitignore)
            print('WARNING: {} is not in gitignore list.'.format(arg), file=sys.stderr, end='')
            if possibles:
                if len(possibles) == 1:
                    possible_string = possibles[0]
                else:
                    possible_string = ', '.join(possibles[:-1]) + ' or ' + possibles[-1]
                print(' Did you mean {}?'.format(possible_string), file=sys.stderr)
            else:
                print('', file=sys.stderr)
    if not to_send:
        return '\n'
    text = _get_text_from_url('{}/{}'.format(API_URL, ','.join(to_send)))
    return '\n'.join(text.split('\n')[2:])
postprocessing.py (project: deep-web-hackathon, author: ogigoc)
def prune_useless_elements(path_root):
    to_remove = []
    for c in path_root.children:
        for useless in USELESS_KEYWORDS:
            if c.id and (distance(c.id, useless) <= MAX_DISTANCE or useless in c.id):
                #print('Removing {0} because of id {1}'.format(c, useless))
                to_remove.append(c)

            if c.cls:
                for cl in c.cls:
                    if distance(cl, useless) <= MAX_DISTANCE or useless in cl:
                        #print('Removing {0} because of class name {1}'.format(c, cl))
                        to_remove.append(c)

    path_root.children = [c for c in path_root.children if c not in to_remove]
    for c in path_root.children:
        prune_useless_elements(c)
BotDigger.py (project: BotDigger, author: hanzhang0116)
def distanceDomain(domain, DomainDict, ccTldDict, tldDict):
    similarDomain = ""
    minDistance = sys.maxsize
    level = domain.split(".")
    if len(level) <= 1:
        return ("not a domain", sys.maxsize)
    (domain2LD, domain3LD, domain2LDs, domain3LDs) = extractLevelDomain(domain, ccTldDict, tldDict)
    for popularDomain in DomainDict:
        distance = Levenshtein.distance(domain2LD, popularDomain)
        if distance < minDistance:
            minDistance = distance
            similarDomain = popularDomain
    #debug
    #sys.stdout.write("subdomain: %s, similarDomain: %s, minDistance: %d\n" % (subdomain, similarDomain, minDistance))
    if len(similarDomain) > 0:
        return (similarDomain, minDistance/float(len(similarDomain)))
    else:
        return (domain2LD, 0)

# check whether a domain contains invalid TLD
phone_decoder.py (project: make_dataset, author: hyzhan)
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        s1 = s1.replace(' ','')
        s2 = s2.replace(' ','')
        b = set(s1.split('<space>') + s2.split('<space>'))
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only accepts
        # strings); the words themselves are delimited by '<space>'
        w1 = [chr(word2char[w]) for w in s1.split('<space>')]
        w2 = [chr(word2char[w]) for w in s2.split('<space>')]

        return Lev.distance(''.join(w1), ''.join(w2))
word_decoder.py (project: make_dataset, author: hyzhan)
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only accepts
        # strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2))
StringMatcher.py (project: health-mosconi, author: GNUHealth-Mosconi)
def distance(self):
        if not self._distance:
            self._distance = distance(self._str1, self._str2)
        return self._distance
levenshtein.py (project: arisu, author: Appleman1234)
def levenshtein(self, msg, args):
        """Calculate levenshtein distance between two words"""
        if len(args) == 2:
            result = "Levenshtein distance: " + str(pylev.distance(args[0],args[1]))
        else:
            result = "Two words are needed to calculate Levenshtein distance"
        return result
elasticsearch.py (project: zing, author: evernote)
def filter_hits_by_distance(hits, source_text,
                            min_similarity=DEFAULT_MIN_SIMILARITY):
    """Returns ES `hits` filtered according to their Levenshtein distance
    to the `source_text`.

    Any hits with a similarity value (0..1) lower than `min_similarity` will be
    discarded. It's assumed that `hits` is already sorted from higher to lower
    score.
    """
    if min_similarity <= 0 or min_similarity >= 1:
        min_similarity = DEFAULT_MIN_SIMILARITY

    filtered_hits = []
    for hit in hits:
        hit_source_text = hit['_source']['source']
        distance = Levenshtein.distance(source_text, hit_source_text)
        similarity = (
            1 - distance / float(max(len(source_text), len(hit_source_text)))
        )

        logger.debug(
            'Similarity: %.2f (distance: %d)\nOriginal:\t%s\nComparing with:\t%s',
            similarity, distance, source_text, hit_source_text
        )

        if similarity < min_similarity:
            break

        filtered_hits.append(hit)

    return filtered_hits
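The normalisation used above, shown standalone for a single pair of strings (assumes python-Levenshtein):

import Levenshtein

source = 'the quick brown fox'
candidate = 'the quick brown dog'
distance = Levenshtein.distance(source, candidate)
similarity = 1 - distance / float(max(len(source), len(candidate)))
print(distance, similarity)  # 2 edits over 19 characters, similarity roughly 0.89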
weather_condition.py (project: snips-skill-owm, author: snipsco)
def fuzzy_match(self, locale, condition_name):
        condition_name = self.normalize_input(condition_name)
        conditions_candidates = self.get_condition_candidates(locale, condition_name)

        # Python 3: use a key function instead of the removed cmp parameter
        sorted_candidates = sorted(conditions_candidates.items(),
                                   key=lambda x: Levenshtein.distance(condition_name, x[1]))
        return sorted_candidates[0][0]
weather_condition.py (project: snips-skill-owm, author: snipsco)
def get_condition_candidates(self, locale, condition_name):
        return {condition: min(mappings[condition][locale], key=lambda s: Levenshtein.distance(condition_name, s)) for
                condition in list(SnipsWeatherConditions)}
matchName.py (project: pyfeld, author: scjurgen)
def match_something(item, candidates):
    # ignore spaces and punctuation in the query string
    item = item.replace(" ", "")
    item = item.replace(".", "")
    item = item.replace(",", "")
    lowest = candidates[0]
    lowestdelta = Levenshtein.distance(item, candidates[0])
    for entry in candidates:
        delta = Levenshtein.distance(item, entry)
        if delta < lowestdelta:
            lowestdelta = delta
            lowest = entry

    print(lowestdelta, item, lowest)
    return lowest
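Example call for the matcher above, with hypothetical room names (assumes python-Levenshtein):

rooms = ['Living Room', 'Kitchen', 'Bedroom']
print(match_something('Livingroom', rooms))  # returns 'Living Room'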

