def match_elements(self, text1, text2):
"""
utility function to match two strings, makes use of
match config initiated in __init__
returns the output as confidence score of flexible match
"""
conf = 0
if self.m_config['exact']:
if text1 == text2:
conf += 1
if self.m_config['levenshtein']:
conf += ratio(text1, text2)
if self.m_config['soundex']:
if soundex(text1) == soundex(text2):
conf += 1
if self.m_config['nysiis']:
if fuzzy.nysiis(text1) == fuzzy.nysiis(text2):
conf += 1
return conf
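# Usage sketch (hypothetical): a minimal stand-in class showing how m_config
# drives the combined score; the names below are illustrative, not taken from
# the original class.
from Levenshtein import ratio

class DemoMatcher:
    def __init__(self, m_config):
        self.m_config = m_config

    def match_elements(self, text1, text2):
        conf = 0
        if self.m_config['exact'] and text1 == text2:
            conf += 1
        if self.m_config['levenshtein']:
            conf += ratio(text1, text2)
        return conf

print(DemoMatcher({'exact': True, 'levenshtein': True}).match_elements(
    'Jon Smith', 'John Smith'))  # ~0.95: exact match fails, ratio adds ~0.95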
def getVec(kb, id1, id2):
    if kb == 'bh':
        title1, context1, category1 = getMsgbyId('baidu', id1)
        title2, context2, category2 = getMsgbyId('hudong', id2)
    elif kb == 'bw':
        title1, context1, category1 = getMsgbyId('baidu', id1)
        title2, context2, category2 = getMsgbyId('wiki', id2)
    elif kb == 'hw':
        title1, context1, category1 = getMsgbyId('hudong', id1)
        title2, context2, category2 = getMsgbyId('wiki', id2)
title_r = Levenshtein.ratio(title1, title2)
context_r = cosine(context1, context2)
category_r = sameCategory(category1, category2)
return (title_r, context_r, category_r, 0.0)
def frame_similarity(frame1,frame2):
similarity = 1
if 'Type' in frame1:
if frame1['Type'] != frame2['Type']:
similarity = 0.0
if similarity == 1:
if 'PlaceMention' in frame1:
# if PlaceMention is normalized use simple string comparison
if not Levenshtein_arg:
if frame1['PlaceMention'] != frame2['PlaceMention']:
similarity = 0.0
else:
                # PlaceMention is not normalized, so use the Levenshtein ratio
similarity = Levenshtein.ratio(frame1['PlaceMention'], frame2['PlaceMention'])
#print("similarity: ", similarity)
return similarity
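# Illustrative behaviour (Levenshtein_arg is a module-level flag in this
# snippet; the frames below are made up):
#   frame1 = {'Type': 'event', 'PlaceMention': 'Aleppo'}
#   frame2 = {'Type': 'event', 'PlaceMention': 'Alepo'}
#   Levenshtein_arg falsy  -> exact comparison fails -> similarity = 0.0
#   Levenshtein_arg truthy -> Levenshtein.ratio('Aleppo', 'Alepo') ~= 0.91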
# evaluate at the document level -----------------------------------------------
def get_message_change_ratio(status_update):
"""Expects a status update instance, returns a number representing
how much a message has been edited (1.0 completely changed, 0.0 unchanged)
based on Levenshtein ratio.
If a status update has no associated notification, returns None
https://github.com/ztane/python-Levenshtein
"""
if hasattr(status_update, 'notification'):
author_profile = status_update.author.profile
intro_text = get_notification_intro(author_profile) + '\n\n'
return 1.0 - Levenshtein.ratio(
*[message.replace(intro_text, '')
for message in (
status_update.notification.base_message,
status_update.notification.sent_message)])
else:
return None
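# Quick check of the inversion (example strings are illustrative):
# Levenshtein.ratio is a similarity in [0, 1], so 1.0 - ratio measures
# how much changed.
import Levenshtein

base_message = "Your update was posted."
sent_message = "Your update was posted with edits."
print(1.0 - Levenshtein.ratio(base_message, sent_message))  # ~0.19: partially edited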
def getCandidatesForLemma(lemma, min_size, max_size):
hits=[]
for match in ["phrase", "conjunct"]:
url="http://lotus.lodlaundromat.org/retrieve?size=" + str(max_size) + "&match=" + match + "&rank=psf&noblank=true&" + urllib.parse.urlencode({"string": lemma, "predicate": "label", "subject": "\"http://dbpedia.org/resource\""})
r = requests.get(url=url)
content = r.json()
these_hits=content["hits"]
hits=hits + these_hits
if content["numhits"]>=min_size or len(lemma.split(' '))==1:
break
subjects={}
for hit in hits:
lev_sim=Levenshtein.ratio(hit["string"].lower(), lemma.lower())
if "Disambiguation" not in hit["subject"].lower() and "Category" not in hit["subject"]:
if hit["subject"] not in subjects:
#subjects[hit["subject"]]=hit["length"]*len(lemma.split())
subjects[hit["subject"]]={"ss": lev_sim, "count": 1}
else:
subjects[hit["subject"]]["ss"]=max(subjects[hit["subject"]]["ss"], lev_sim)
subjects[hit["subject"]]["count"]+=1
return subjects
def mostCommon(spoken, lst, threshold):
highestCountItem = max(lst, key=lst.count)
highestCount = lst.count(highestCountItem)
contenders = []
for item in lst:
if (lst.count(item) == highestCount) and (item not in contenders):
contenders.append(item)
if len(contenders) > 1:
print "\nContending"
bestMatch = [None, 0]
for ayah in contenders:
score = ratio(spoken, ayah)
            print(ayah)
            print(score)
if score > threshold and score > bestMatch[1]:
bestMatch = [ayah, score]
return bestMatch[0]
elif ratio(spoken, highestCountItem) > threshold:
return highestCountItem
else:
return None
# Takes in a query and list of matches
# Returns the match with the highest similarity to the query
def print_matched_groups(extracted_combo_lst):
    dst_dct = {}
    # use a while loop: the original for-loop removed items from
    # extracted_combo_lst while iterating over it, which skips elements
    while extracted_combo_lst:
        itm = extracted_combo_lst[0]
        dst_dct.setdefault(itm, [])
        if len(extracted_combo_lst) == 1:
            break
        match_dct = {}
        for other in extracted_combo_lst:
            if other == itm:
                continue
            match_dct[other] = Levenshtein.ratio(itm, other)
        sorted_match_lst = sorted(match_dct.items(), key=operator.itemgetter(1), reverse=True)
        top_n = 2
        dst_dct[itm] = [e[0] for e in sorted_match_lst[0:top_n]]
        extracted_combo_lst.remove(itm)
        for e in dst_dct[itm]:
            extracted_combo_lst.remove(e)
    for k, v in dst_dct.items():
        print(k, v)
        print()
def should_run(self):
data = self.item_options.get('compare_url')
if data:
        if isinstance(data, dict):
self.fuzzy = data.get('fuzzy', 1.0)
self.url2 = data.get('url')
if not self.url2:
logger.debug('compare_url must contain a url')
return False
else:
            logger.debug('compare_url must be a nested dictionary containing url and fuzzy properties')
return False
return True
return False
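# For reference, an item_options shape that passes this check (keys as read
# by the code above; values illustrative):
item_options = {
    'compare_url': {
        'url': 'https://example.com/page',  # required
        'fuzzy': 0.9,  # optional; defaults to 1.0 (exact match)
    },
}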
def ratio(self):
if not self._ratio:
self._ratio = ratio(self._str1, self._str2)
return self._ratio
def quick_ratio(self):
# This is usually quick enough :o)
if not self._ratio:
self._ratio = ratio(self._str1, self._str2)
return self._ratio
def getCandidates(self, mention, threshold=0.7):
res = []
    # match candidates against the title table
for id, title, link_count in self.db_titles:
m_score = Levenshtein.ratio(title, mention)
if m_score > threshold:
self.cur.execute("select abstract from abstract where id = %s"%id)
context = self.cur.fetchall()
if context != ():
context = json.loads(context[0][0])
RE = []
self.cur.execute("select to_id from link where from_id = %s"%id)
linkto_ids = self.cur.fetchall()
if linkto_ids != ():
for to_id in linkto_ids:
RE.append(to_id[0])
res.append(Entity(title, id, m_score, context, link_count, RE))
    # match candidates against the disambiguation table
for id, title, dis_context, link_count in self.db_disambiguations:
m_score = Levenshtein.ratio(title, mention)
if m_score > threshold:
title += '[%s]'%dis_context
self.cur.execute("select abstract from abstract where id = %s"%id)
context = self.cur.fetchall()
if context != ():
context = json.loads(context[0][0])
context.append(dis_context)
RE = []
self.cur.execute("select to_id from link where from_id = %s"%id)
linkto_ids = self.cur.fetchall()
if linkto_ids != ():
for to_id in linkto_ids:
RE.append(to_id[0])
res.append(Entity(title, id, m_score, context, link_count, RE))
return res
def frame_similarity(frame1,frame2):
similarity = 1
if 'Type' in frame1:
if frame1['Type'] != frame2['Type']:
similarity = 0
if similarity == 1:
if 'PlaceMention' in frame1:
similarity = Levenshtein.ratio(frame1['PlaceMention'], frame2['PlaceMention'])
return similarity
# evaluate at the document level -----------------------------------------------
def get_name_similarity_ratio(a, b):
names = (get_full_lowercase_name(sub) for sub in (a, b))
return Levenshtein.ratio(*names)
def interesting_party(*a, **k):
while True:
while True:
phrase = get_name()
if len(phrase) < 100:
break
steps = party(phrase, *a, **k)
result = steps[-1][-1]
if ratio(phrase.lower(), result.lower()) < 0.7:
return steps
def play(self, guess):
return ratio(normalise(guess), normalise(self.original))
def moreLocalCandidates(m, previous, candidates):
    for pm, prev_links in previous.items():
        if is_abbrev(m, pm):
            for prevLink in prev_links:
                prevLinkDB = utils.makeDbpedia(prevLink)
                candidates.append((prevLinkDB, {"ss": 1.0, "count": 0.0}))
        elif isEnoughSubset(m, pm):
            for prevLink in prev_links:
                prevLinkDB = utils.makeDbpedia(prevLink)
                candidates.append((prevLinkDB, {"ss": Levenshtein.ratio(m.lower(), pm.lower()), "count": 0.0}))
    return candidates
def bestLevMatch(spoken, lst):
    print(" ")
    bestMatch = [None, 0.65]  # 0.65 is the minimum acceptable similarity
    for ayah in lst:
        score = ratio(spoken, ayah)
        print(ayah)
        print(score)
        if score > bestMatch[1]:
            bestMatch = [ayah, score]
    return bestMatch[0]
# Takes in an ayah object from alfanous
# Returns a cleaned-up ayah object
def checkForWordInQuran(value):
wordMatch = dbGet(models.QuranWord, value)
if wordMatch:
return wordMatch.text
else:
# The original word is not in the Quran so we try alfanous' suggestions
wordSuggestionList = []
wordSuggestions = alfanous.do({
"action": "suggest", "query": value
})["suggest"]
        for word in wordSuggestions:
            for suggestion in wordSuggestions[word]:
                # look up the suggestion itself, not the original word
                wordMatch = dbGet(models.QuranWord, suggestion)
                if wordMatch:
                    wordSuggestionList.append(wordMatch.text)
if len(wordSuggestionList) > 1:
topRatioValue = 0
topSuggestion = ""
while len(wordSuggestionList) > 0:
suggestion = wordSuggestionList.pop(0)
suggestionRatio = ratio(value, suggestion)
if suggestionRatio > topRatioValue:
topRatioValue = suggestionRatio
topSuggestion = suggestion
return topSuggestion
elif len(wordSuggestionList) == 1:
return wordSuggestionList[0]
else:
return None
# Takes in a query and checks if any part of it is in the Quran
# Return the part in the Quran if one is found, otherwise it returns None
def similarity_ratio(x, y, threshold=FuzzyMatchGenerator.SIMILARITY_THRESHOLD):
"""Compute the similarity ratio between two strings.
If the ratio exceeds the threshold, return it; otherwise, return 0.
The similarity ratio is given by
1 - (levenshtein distance with substitution cost = 2) / (total length)
"""
ratio = Levenshtein.ratio(x, y)
return ratio if ratio > threshold else 0.
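# Worked example of that formula: 'kitten' -> 'sitting' needs 2 substitutions
# (cost 2 each) and 1 insertion, so the weighted distance is 5 over a total
# length of 6 + 7 = 13, giving 1 - 5/13 = 8/13.
import Levenshtein
print(Levenshtein.ratio('kitten', 'sitting'))  # 0.6153846...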
################################
# NERValueGenerator
def getLevenshteinDistance(item, keyword):
    # despite the name, this returns the Levenshtein *ratio* (a similarity
    # in [0, 1]) computed after lowercasing and stripping separators
    item = item.lower().replace(' ', '').replace('-', '').replace('_', '')
    keyword = keyword.lower().replace(' ', '').replace('-', '').replace('_', '')
    return Levenshtein.ratio(item, keyword)
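# The normalisation means differently separated spellings compare as equal
# (assumes Levenshtein is imported as in the snippet above):
print(getLevenshteinDistance('e-mail Address', 'email_address'))  # 1.0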
def generateStemmingDict(inputPath = 'stemmer.txt', outputPath = 'stemmingDict'):
inputEncoding = 'utf8'
outputEncoding = 'utf8'
distance = Levenshtein.ratio
fi = open(inputPath, 'r', encoding=inputEncoding)
fo = open(outputPath, 'w', encoding=outputEncoding)
stemmingDict = {}
for line in fi:
if line.strip() == '':
continue
tmpList = line.strip().split(' => ')
for word in tmpList[0].split(', '):
if word not in stemmingDict:
stemmingDict[word] = set()
stemmingDict[word].add(tmpList[1])
for key in stemmingDict:
stemmingDict[key] = list(stemmingDict[key])
for i in range(len(stemmingDict[key])):
            stemmingDict[key][i] = [stemmingDict[key][i], distance(stemmingDict[key][i], key)]
    json.dump(stemmingDict, fo)
fi.close()
fo.close()
fotxt = open(outputPath+'.txt', 'w', encoding=outputEncoding)
for key in stemmingDict:
fotxt.write(key + ' ' + str(stemmingDict[key]) + '\n')
fotxt.close()
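# Illustrative round trip for one stemmer.txt line in the
# "word1, word2 => stem" format this parser expects:
import Levenshtein
line = 'connected, connecting => connect'
words, stem = line.strip().split(' => ')
for word in words.split(', '):
    print(word, [stem, Levenshtein.ratio(stem, word)])
# connected ['connect', 0.875]
# connecting ['connect', 0.8235294117647058]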
def get_features(df_features):
    print('use w2v for document representation')
now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge_tfidf(x['question1'], x['question2']), axis = 1)
print('nones')
now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis = 1)
df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis = 1)
#df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
#df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
df_features['question1_w2v'] = df_features.question1.map(lambda x: get_vector_tfidf(" ".join(x)))
df_features['question2_w2v'] = df_features.question2.map(lambda x: get_vector_tfidf(" ".join(x)))
print('z_dist')
now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)
now = datetime.datetime.now()
print('z_tfidf_cos_sim')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
now = datetime.datetime.now()
print('z_w2v_nones')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim_tfidf(x['q1_unique'], x['q2_unique']), axis=1)
df_features['z_w2v_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['question1_w2v'], x['question2_w2v']), axis=1)
df_features['z_w2v_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['question1_w2v'], x['question2_w2v'],3), axis=1)
df_features['z_w2v_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['question1_w2v'], x['question2_w2v']), axis=1)
df_features['z_w2v_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['question1_w2v'], x['question2_w2v']), axis=1)
df_features['z_q1_skew'] = df_features.question1_w2v.map(lambda x:skew(x))
df_features['z_q2_skew'] = df_features.question2_w2v.map(lambda x:skew(x))
df_features['z_q1_kur'] = df_features.question1_w2v.map(lambda x:kurtosis(x))
df_features['z_q2_kur'] = df_features.question2_w2v.map(lambda x:kurtosis(x))
del df_features['question1_w2v']
del df_features['question2_w2v']
print('all done')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features = df_features.fillna(0.0)  # fillna returns a new frame; assign it
return df_features
def get_features(df_features):
print('z_dist')
now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)
now = datetime.datetime.now()
print('z_tfidf_cos_sim')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
now = datetime.datetime.now()
print('z_w2v')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_w2v'] = df_features.apply(lambda x: w2v_cos_sim(x['question1'], x['question2']), axis=1)
return df_features
def get_registrar(
cls,
subject,
):
cls.check_and_update_registrars()
    edited_subject = re.sub(
        pattern=r'\W',  # raw string; equivalent to the original [^\d\w]
        repl='',
        string=subject,
    )
edited_subject = edited_subject.lower()
for registrar in cls.registrars:
if edited_subject in registrar['edited'].lower():
return registrar['original']
most_close_registrar = ''
most_close_registrar_distance_ratio = 0
for registrar in cls.registrars:
registrar_distance_ratio = Levenshtein.ratio(
edited_subject,
registrar['edited'],
)
if registrar_distance_ratio > most_close_registrar_distance_ratio:
most_close_registrar = registrar['original']
most_close_registrar_distance_ratio = registrar_distance_ratio
return most_close_registrar
def adjective_fuzzy_matching(token, adjectives, match):
"""
Given a token and a list of terms to match, returns True if
the stem of the token matches any of the items in the list.
Input:
token: Token object to match
adjectives: list of items to match the Token
match: minimum ratio (0-100) for matching
"""
    for adjective in adjectives:
        # Levenshtein.ratio returns 0-1; scale to 0-100 to match the
        # percentage threshold described in the docstring
        if Levenshtein.ratio(str(token.stem), str(adjective)) * 100 >= match:
            return True
return False
def _transactions_fuzzy_matching(transactions, match):
"""
Runs fuzzy matching on the transactions, by applying a complete linkage
hierarchical clustering algorithm to the set of different itemsets in the
    transactions. For clustering, the Levenshtein similarity ratio (scaled
    to 0-100) is used as the distance measure.
Input:
transactions: list of tuples representing items on each transaction
match: minimum similarity ratio (0 to 100) for clustering
Output:
transactions: new version of the transactions, where each item has been
replaced by the first item on its corresponding cluster
word_clusters: dictionary that maps the cluster for each item
in the transactions
"""
words = set([])
for transaction in transactions:
words |= set(transaction)
words = sorted(words)
    # scale the 0-1 ratio to 0-100 so it is comparable with `match`
    l = [((a, b), 100 - Levenshtein.ratio(str(a), str(b)) * 100)
         for a, b in combinations(words, 2)]
d = [value for pair, value in l]
r = linkage(d, 'complete')
clusters_index = fcluster(r, 100-match, "distance")
clusters = {}
for obs_i, cluster_i in enumerate(clusters_index):
if cluster_i in clusters:
clusters[cluster_i].append(words[obs_i])
else:
clusters[cluster_i] = [words[obs_i]]
word_clusters = {word: clusters[clusters_index[i]]
for i, word in enumerate(words)}
new_transactions = []
for transaction in transactions:
new_transaction = tuple(set(([word_clusters[word][0]
for word in transaction])))
new_transactions.append(new_transaction)
return new_transactions, word_clusters
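# Hypothetical usage (assumes this function and its scipy/Levenshtein imports
# are in scope): near-duplicate spellings collapse to one canonical item.
transactions = [('coca cola', 'chips'), ('coca-cola', 'salsa')]
new_transactions, word_clusters = _transactions_fuzzy_matching(transactions, match=85)
print(new_transactions)
# e.g. [('coca cola', 'chips'), ('coca cola', 'salsa')] -- order within each
# tuple may vary because each transaction is rebuilt through set()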
def __init__(self, match=90, key=lambda x: x.string.lower()):
"""
Fuzzy matching between the given token and term objects. For comparison
applies the function given in the "key" parameter to the Token/tuple
of Tokens. Parameter match defines the minimum similarity ratio for
a match when comparing.
Input:
match : minimum similarity for fuzzy matching (%)
key : function to apply to the token,
default=lambda x: x.string.lower()
"""
self.match = match
self.key = key
def __call__(self, token_tuple, terms):
"""
Input:
token_tuple : Token or tuple of Token objects
terms : term or iterable of terms to match
Output:
Returns None if no match is found.
Returns the first matched in case many of them show the same
similarity ratio.
"""
if not hasattr(terms, '__iter__'):
terms = [terms]
if not isinstance(token_tuple, tuple):
token_tuple = (token_tuple,)
try:
token_tuple = tuple(self.key(token) for token in token_tuple)
        except Exception:
token_tuple = tuple(str(token) for token in token_tuple)
best_term = None
best_ratio = 0
for term in terms:
            ratio = max([Levenshtein.ratio(" ".join(token_tuple),
                                           " ".join(term_i)) * 100
                         for term_i in term])
if ratio >= self.match and ratio > best_ratio:
best_term = term
best_ratio = ratio
return best_term
# ------- UTIL FUNCTIONS ------------------------------------------------------
def _edit_dist(str1, str2):
try:
# very fast
# http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
# d = Levenshtein.ratio(str1, str2)
d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2)))
    except Exception:  # fall back to difflib if python-Levenshtein is unavailable
# https://docs.python.org/2/library/difflib.html
d = 1. - SequenceMatcher(lambda x: x==" ", str1, str2).ratio()
return d
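# The two measures differ: Levenshtein.ratio charges substitutions double,
# and its normaliser is len(a) + len(b) rather than max(len(a), len(b)).
import Levenshtein
print(Levenshtein.distance('abc', 'ab') / max(len('abc'), len('ab')))  # 0.333...
print(1 - Levenshtein.ratio('abc', 'ab'))                              # 0.2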