def _compute_author_similarity(self, paired_authors):
    """Append fuzzy-similarity columns for each paired-author row.

    Adds four columns to *paired_authors*: same_email, name_similarity,
    email_name_similarity, name_to_email_similarity, and returns the
    joined dataframe.
    """
    def _similarity_features(row):
        # One Series per row; column names are assigned after the apply.
        return pd.Series([
            row.author_email == row.author_email_other,
            fuzz.token_set_ratio(row.author_name, row.author_name_other),
            fuzz.ratio(row.email_name, row.email_name_other),
            fuzz.token_set_ratio(row.author_name, row.name_from_email_other),
        ])

    features = paired_authors.apply(_similarity_features, axis=1)
    features.columns = ['same_email', 'name_similarity',
                        'email_name_similarity', 'name_to_email_similarity']
    return paired_authors.join(features)
# Python examples of fuzz.token_set_ratio() usage
def extract_features(df):
    """Add token-based and fuzzy-matching feature columns for each question pair."""
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(
        lambda row: get_token_features(row["question1"], row["question2"]),
        axis=1)
    # get_token_features returns a 10-tuple; unpack it into one column each.
    token_columns = ["cwc_min", "cwc_max", "csc_min", "csc_max", "ctc_min",
                     "ctc_max", "last_word_eq", "first_word_eq",
                     "abs_len_diff", "mean_len"]
    for position, column in enumerate(token_columns):
        df[column] = [feats[position] for feats in token_features]

    print("fuzzy features..")
    fuzzy_specs = [("token_set_ratio", fuzz.token_set_ratio),
                   ("token_sort_ratio", fuzz.token_sort_ratio),
                   ("fuzz_ratio", fuzz.QRatio),
                   ("fuzz_partial_ratio", fuzz.partial_ratio),
                   ("longest_substr_ratio", get_longest_substr_ratio)]
    for column, scorer in fuzzy_specs:
        # Bind scorer as a default arg to avoid late-binding in the lambda.
        df[column] = df.apply(
            lambda row, f=scorer: f(row["question1"], row["question2"]),
            axis=1)
    return df
def find_match(query_string):
    """
    Find the best-matching faculty object for the query string.

    :param query_string: user's query text
    :return: faculty dictionary object, or None if no faculty name
             scores above 40
    """
    global data
    best_score = 0
    best_faculty = None
    for faculty in data:
        score = fuzz.token_set_ratio(faculty['name'], query_string)
        if score > best_score:
            best_score = score
            best_faculty = faculty
    # Anything at or below 40 is considered too weak a match.
    if best_score > 40:
        return best_faculty
    return None
def compare_names(name1: ParsedName, name2: ParsedName):
    """Score similarity of two parsed names.

    Uses token_set_ratio when both names are "proper" (per the proper()
    helper); otherwise falls back to a plain character ratio.
    """
    if proper(name1) and proper(name2):
        return fuzz.token_set_ratio(name1.name, name2.name)
    return fuzz.ratio(name1.name, name2.name)
# Source file: gridding.py — project "the-magical-csv-merge-machine"
# (author: entrepreneur-interet-general)
def score_tokens(src, ref, translate_tokens):
    """Score the similarity of two strings; returns a value in [0, 100].

    A score of 0 means the pair was rejected by one of the threshold
    guards. When translate_tokens is true, both inputs are translated
    first and scored without further translation.
    """
    if translate_tokens:
        return score_tokens(translate(src), translate(ref), False)

    src_tokens = validateTokens(src)
    ref_tokens = validateTokens(ref)
    src_joined = ' '.join(src_tokens)
    ref_joined = ' '.join(ref_tokens)

    sort_ratio = fuzz.token_sort_ratio(src_joined, ref_joined)
    if sort_ratio < 40:
        logging.debug('Rejected for TOKEN_SORT : {} / {}'.format(src, ref))
        return 0
    set_ratio = fuzz.token_set_ratio(src_joined, ref_joined)
    if set_ratio < 50:
        logging.debug('Rejected for TOKEN_SET : {} / {}'.format(src, ref))
        return 0

    if REQUIRES_SHARED_PROPER_NOUN:
        src_proper = ' '.join(filterProperNouns(src_tokens))
        ref_proper = ' '.join(filterProperNouns(ref_tokens))
        # Only enforce the proper-noun guards when at least one side
        # actually contains proper nouns.
        if len(src_proper) > 0 or len(ref_proper) > 0:
            if fuzz.token_sort_ratio(src_proper, ref_proper) < 80:
                logging.debug('Rejected for PROPER_NOUN_SORT : {} / {}'.format(src, ref))
                return 0
            if fuzz.token_set_ratio(src_proper, ref_proper) < 60:
                logging.debug('Rejected for PROPER_NOUN_SET : {} / {}'.format(src, ref))
                return 0
    # Combined score: product of both token ratios, rescaled to [0, 100].
    return sort_ratio * set_ratio / 100
def find_match(query, intent):
    """Return the stored answer that best matches *query* for an intent.

    Scans all documents with the given intent in the global Mongo
    collection and fuzzily compares their 'text' field against the query.

    :param query: user's query text
    :param intent: intent label used to filter the collection
    :return: the 'answer' field of the best-matching document, or None
             when the collection has no documents for this intent
    """
    global col
    best_doc = None
    max_ratio = 0
    for candidate in col.find({"intent": intent}):
        ratio = fuzz.token_set_ratio(candidate['text'], query)
        if ratio > max_ratio:
            max_ratio = ratio
            best_doc = candidate
    # Bug fix: the original did `del doc['_id']` unconditionally, which
    # raised TypeError when no document matched (doc was None). The del
    # was also pointless since only the answer is returned.
    if best_doc is None:
        return None
    return best_doc['answer']
def find_match(course_list, query_string):
    """
    find the most matching course for a given name and return the course
    :param course_list: list of courses
    :param query_string: query of the user
    :return: course object, or None when no course scores above 50
    """
    wants_lab = 'lab' in query_string.lower()
    best_score = 0  # the max ratio among the courses
    best_course = None
    for course in course_list:
        # Skip courses of the wrong kind for this query: labs are only
        # considered when the query mentions "lab", and vice versa.
        if wants_lab:
            if course.subject_type == 'Embedded Theory':
                continue
        elif course.subject_type == 'Embedded Lab':
            continue
        # Best ratio among the different names of this course.
        course_score = max(
            (fuzz.token_set_ratio(name, query_string)
             for name in course.names),
            default=0)
        if course_score > best_score:
            best_score = course_score
            best_course = course
    return best_course if best_score > 50 else None
def strict_compare_strings(string_one, string_two):
    """Return the highest of four fuzzy similarity scores for the pair.

    Considers ratio, partial_ratio, token_sort_ratio and token_set_ratio;
    all scores are in [0, 100].
    """
    scorers = (fuzz.ratio, fuzz.partial_ratio,
               fuzz.token_sort_ratio, fuzz.token_set_ratio)
    return max(scorer(string_one, string_two) for scorer in scorers)
def compare_strings(string_one, string_two):
    """Return the highest of ratio, token_sort_ratio and token_set_ratio."""
    best = 0
    for scorer in (fuzz.ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
        score = scorer(string_one, string_two)
        if score > best:
            best = score
    return best
def enter(MSG):
    """
    This function takes a string (MSG) and tries to answer the query by looking through the dictionaries in the program (after some preprocessing).
    It tries to mine out the correct response by performing pattern matching through the structured data
    """
    # NOTE(review): Python 2 code (uses print statements below).
    msg=MSG.lower()
    # Strip a trailing question mark, if present.
    if msg[-1]=='?':
        msg=msg[:-1]
    tokens=nltk.word_tokenize(msg)
    # Remove every stop word (module-level `words` list) from the tokens;
    # the inner while handles repeated occurrences of the same word.
    for i in words:
        while (i in tokens):
            tokens.remove(i)
    lst=[]
    flag=0
    # Pick the knowledge base matching the question type (who/what/how);
    # other question words leave lst empty, so no answer is found.
    if tokens[0]=="who":
        lst=data_who
    elif tokens[0]=="what":
        lst=data_what
    elif tokens[0]=='how':
        lst=data_how
    #msg=str(tokens)
    # Rebuild the query from the remaining tokens, minus the question word.
    msg=' '.join(tokens[1:])
    # Print the answer of the first entry whose stored question fuzzily
    # matches the query at 60 or better; entries look like (question, answer).
    for i in lst:
        if fuzz.token_set_ratio(i[0],msg)>=60:
            print i[1]
            flag=1
            break
    if flag==0:
        print "Question Not found"
def computeSimilarity(s1, s2):
    """Return a dissimilarity score in [0.0, 1.0] (0.0 means identical).

    Takes the best of three fuzzywuzzy ratios and inverts it.
    """
    best_ratio = max(fuzz.ratio(s1, s2),
                     fuzz.token_sort_ratio(s1, s2),
                     fuzz.token_set_ratio(s1, s2))
    return 1.0 - 0.01 * best_ratio
def _match_torrent_name(self, movie_title, movie_year, torrent_title):
    ''' Checks if movie_title and torrent_title are a good match
    movie_title: str title of movie
    movie_year: str year of movie release
    torrent_title: str title of torrent

    Helper for rss_sync. Torrent indexers don't supply an imdbid the way
    NewzNab does, so the titles are compared instead. First requires the
    release year to appear in the torrent title, then normalizes both
    titles (':' and ' ' become '.', lowercased) and accepts a token set
    ratio strictly greater than 80.

    Returns bool on match success
    '''
    # A release that doesn't even contain the year can't be this movie.
    if movie_year not in torrent_title:
        return False
    normalized_title = movie_title.replace(':', '.').replace(' ', '.').lower()
    normalized_torrent = torrent_title.replace(' ', '.').replace(':', '.').lower()
    return fuzz.token_set_ratio(normalized_title, normalized_torrent) > 80
def main(conf):
    """Compute fuzzy features for the train/test sets and dump them as CSV."""
    dump_dir = conf['fuzzy.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['fuzzy.dataset'])
    logging.info('Loading test dataset')
    test_df = load_test_df(conf['fuzzy.dataset'])

    compute_features(train_df, test_df)

    # Feature columns written for both splits.
    feature_columns = [
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio,
    ]

    logging.info('Writing train dataset to disk')
    train_columns = [FieldsTrain.id, FieldsTrain.is_duplicate] + feature_columns
    train_df[train_columns].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_columns = [FieldsTest.test_id] + feature_columns
    test_df[test_columns].to_csv(join_path(dump_dir, 'test.csv'), index=False)
def compare_strings(cls, string1, string2, *, tolerance=None,
                    method='uwratio'):
    """
    Check whether the two strings are similar within the given tolerance.

    Return True when the chosen fuzzywuzzy scorer rates the pair at
    least (100 - tolerance), otherwise False.
    Use fuzzywuzzy (https://pypi.python.org/pypi/fuzzywuzzy).

    :param string1: str
    :param string2: str
    :param tolerance: number; defaults to the class-level tolerance,
                      which differs for strings containing digits
    :param method: str, one of: 'uwratio', 'partial_ratio',
                   'token_sort_ratio', 'token_set_ratio', 'ratio'
    :rtype: bool

    :Example:
    >>> MatchBlock.compare_strings('Beatles', 'The Beatles', tolerance=10)
    True
    >>> MatchBlock.compare_strings('AB', 'AC', tolerance=0, method='ratio')
    False
    """
    contains_digit = any(
        char.isdigit() for text in (string1, string2) for char in text)

    if tolerance is None:
        # Strings with digits get their own, usually stricter, tolerance.
        tolerance = (cls.str_number_tolerance if contains_digit
                     else cls.string_tolerance)

    # Abbreviation detection short-circuits for purely alphabetic input.
    if not contains_digit and cls.is_abbreviation(string1, string2):
        return True

    scorers = {'uwratio': fuzz.UWRatio,
               'partial_ratio': fuzz.partial_ratio,
               'token_sort_ratio': fuzz.token_sort_ratio,
               'token_set_ratio': fuzz.token_set_ratio,
               'ratio': fuzz.ratio}
    if method not in scorers:
        msg = 'wrong method, use available: {}'
        raise ValueError(msg.format(', '.join(sorted(scorers))))

    return scorers[method](string1, string2) >= 100 - tolerance
def compute_features(train_df, test_df):
    """Add one column per fuzzywuzzy scorer to both dataframes.

    For every scorer, applies it to the (question1, question2) pair of
    each row in *train_df* and *test_df* (columns named per Fields), and
    estimates the feature's quality on the train set via compute_quality.

    The original body repeated the same three-statement stanza seven
    times; it is collapsed here into a data-driven loop.

    :param train_df: train dataframe (mutated in place)
    :param test_df: test dataframe (mutated in place)
    :return: dict mapping 'quality_<feature>' to its computed quality
    """
    # (quality-key suffix, destination column, fuzzywuzzy scorer)
    feature_specs = [
        ('qratio', Fields.qratio, fuzz.QRatio),
        ('wratio', Fields.wratio, fuzz.WRatio),
        ('partial_ratio', Fields.partial_ratio, fuzz.partial_ratio),
        ('partial_token_set_ratio', Fields.partial_token_set_ratio,
         fuzz.partial_token_set_ratio),
        ('partial_token_sort_ratio', Fields.partial_token_sort_ratio,
         fuzz.partial_token_sort_ratio),
        ('token_set_ratio', Fields.token_set_ratio, fuzz.token_set_ratio),
        ('token_sort_ratio', Fields.token_sort_ratio, fuzz.token_sort_ratio),
    ]
    quality = {}
    for name, field, scorer in feature_specs:
        # Bind scorer as a default argument to avoid late-binding issues.
        train_df[field] = train_df.apply(
            lambda row, f=scorer: f(str(row[FieldsTrain.question1]),
                                    str(row[FieldsTrain.question2])),
            axis=1)
        test_df[field] = test_df.apply(
            lambda row, f=scorer: f(str(row[FieldsTest.question1]),
                                    str(row[FieldsTest.question2])),
            axis=1)
        quality['quality_' + name] = compute_quality(train_df, field)
    return quality