def _compute_author_similarity(self, paired_authors):
    """Append fuzzy-similarity columns for each paired-author row.

    Adds four columns to *paired_authors*: same_email, name_similarity,
    email_name_similarity, name_to_email_similarity, and returns the
    joined dataframe.
    """
    def _similarity_features(row):
        # One Series per row; column names are assigned after the apply.
        return pd.Series([
            row.author_email == row.author_email_other,
            fuzz.token_set_ratio(row.author_name, row.author_name_other),
            fuzz.ratio(row.email_name, row.email_name_other),
            fuzz.token_set_ratio(row.author_name, row.name_from_email_other),
        ])

    features = paired_authors.apply(_similarity_features, axis=1)
    features.columns = ['same_email', 'name_similarity',
                        'email_name_similarity', 'name_to_email_similarity']
    return paired_authors.join(features)
# Python examples of fuzz.token_set_ratio() usage
def extract_features(df):
    """Add token-based and fuzzy-matching feature columns for each question pair."""
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(
        lambda row: get_token_features(row["question1"], row["question2"]),
        axis=1)
    # get_token_features returns a 10-tuple; unpack it into one column each.
    token_columns = ["cwc_min", "cwc_max", "csc_min", "csc_max", "ctc_min",
                     "ctc_max", "last_word_eq", "first_word_eq",
                     "abs_len_diff", "mean_len"]
    for position, column in enumerate(token_columns):
        df[column] = [feats[position] for feats in token_features]

    print("fuzzy features..")
    fuzzy_specs = [("token_set_ratio", fuzz.token_set_ratio),
                   ("token_sort_ratio", fuzz.token_sort_ratio),
                   ("fuzz_ratio", fuzz.QRatio),
                   ("fuzz_partial_ratio", fuzz.partial_ratio),
                   ("longest_substr_ratio", get_longest_substr_ratio)]
    for column, scorer in fuzzy_specs:
        # Bind scorer as a default arg to avoid late-binding in the lambda.
        df[column] = df.apply(
            lambda row, f=scorer: f(row["question1"], row["question2"]),
            axis=1)
    return df
def find_match(query_string):
    """
    Find the best-matching faculty object for the query string.

    :param query_string: user's query text
    :return: faculty dictionary object, or None if no faculty name
             scores above 40
    """
    global data
    best_score = 0
    best_faculty = None
    for faculty in data:
        score = fuzz.token_set_ratio(faculty['name'], query_string)
        if score > best_score:
            best_score = score
            best_faculty = faculty
    # Anything at or below 40 is considered too weak a match.
    if best_score > 40:
        return best_faculty
    return None
def compare_names(name1: ParsedName, name2: ParsedName):
    """Score similarity of two parsed names.

    Uses token_set_ratio when both names are "proper" (per the proper()
    helper); otherwise falls back to a plain character ratio.
    """
    if proper(name1) and proper(name2):
        return fuzz.token_set_ratio(name1.name, name2.name)
    return fuzz.ratio(name1.name, name2.name)
# Source file: gridding.py — project "the-magical-csv-merge-machine"
# (author: entrepreneur-interet-general)
def score_tokens(src, ref, translate_tokens):
    """Score the similarity of two strings; returns a value in [0, 100].

    A score of 0 means the pair was rejected by one of the threshold
    guards. When translate_tokens is true, both inputs are translated
    first and scored without further translation.
    """
    if translate_tokens:
        return score_tokens(translate(src), translate(ref), False)

    src_tokens = validateTokens(src)
    ref_tokens = validateTokens(ref)
    src_joined = ' '.join(src_tokens)
    ref_joined = ' '.join(ref_tokens)

    sort_ratio = fuzz.token_sort_ratio(src_joined, ref_joined)
    if sort_ratio < 40:
        logging.debug('Rejected for TOKEN_SORT : {} / {}'.format(src, ref))
        return 0
    set_ratio = fuzz.token_set_ratio(src_joined, ref_joined)
    if set_ratio < 50:
        logging.debug('Rejected for TOKEN_SET : {} / {}'.format(src, ref))
        return 0

    if REQUIRES_SHARED_PROPER_NOUN:
        src_proper = ' '.join(filterProperNouns(src_tokens))
        ref_proper = ' '.join(filterProperNouns(ref_tokens))
        # Only enforce the proper-noun guards when at least one side
        # actually contains proper nouns.
        if len(src_proper) > 0 or len(ref_proper) > 0:
            if fuzz.token_sort_ratio(src_proper, ref_proper) < 80:
                logging.debug('Rejected for PROPER_NOUN_SORT : {} / {}'.format(src, ref))
                return 0
            if fuzz.token_set_ratio(src_proper, ref_proper) < 60:
                logging.debug('Rejected for PROPER_NOUN_SET : {} / {}'.format(src, ref))
                return 0
    # Combined score: product of both token ratios, rescaled to [0, 100].
    return sort_ratio * set_ratio / 100
def find_match(query, intent):
    """Return the stored answer that best matches *query* for an intent.

    Scans all documents with the given intent in the global Mongo
    collection and fuzzily compares their 'text' field against the query.

    :param query: user's query text
    :param intent: intent label used to filter the collection
    :return: the 'answer' field of the best-matching document, or None
             when the collection has no documents for this intent
    """
    global col
    best_doc = None
    max_ratio = 0
    for candidate in col.find({"intent": intent}):
        ratio = fuzz.token_set_ratio(candidate['text'], query)
        if ratio > max_ratio:
            max_ratio = ratio
            best_doc = candidate
    # Bug fix: the original did `del doc['_id']` unconditionally, which
    # raised TypeError when no document matched (doc was None). The del
    # was also pointless since only the answer is returned.
    if best_doc is None:
        return None
    return best_doc['answer']
def find_match(course_list, query_string):
    """
    find the most matching course for a given name and return the course
    :param course_list: list of courses
    :param query_string: query of the user
    :return: course object, or None when no course scores above 50
    """
    wants_lab = 'lab' in query_string.lower()
    best_score = 0  # the max ratio among the courses
    best_course = None
    for course in course_list:
        # Skip courses of the wrong kind for this query: labs are only
        # considered when the query mentions "lab", and vice versa.
        if wants_lab:
            if course.subject_type == 'Embedded Theory':
                continue
        elif course.subject_type == 'Embedded Lab':
            continue
        # Best ratio among the different names of this course.
        course_score = max(
            (fuzz.token_set_ratio(name, query_string)
             for name in course.names),
            default=0)
        if course_score > best_score:
            best_score = course_score
            best_course = course
    return best_course if best_score > 50 else None
def strict_compare_strings(string_one, string_two):
    """Return the highest of four fuzzy similarity scores for the pair.

    Considers ratio, partial_ratio, token_sort_ratio and token_set_ratio;
    all scores are in [0, 100].
    """
    scorers = (fuzz.ratio, fuzz.partial_ratio,
               fuzz.token_sort_ratio, fuzz.token_set_ratio)
    return max(scorer(string_one, string_two) for scorer in scorers)
def compare_strings(string_one, string_two):
    """Return the highest of ratio, token_sort_ratio and token_set_ratio."""
    best = 0
    for scorer in (fuzz.ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
        score = scorer(string_one, string_two)
        if score > best:
            best = score
    return best
def enter(MSG):
    """
    This function takes a string (MSG) and tries to answer the query by looking through the dictionaries in the program (after some preprocessing).
    It tries to mine out the correct response by performing pattern matching through the structured data
    """
    # NOTE(review): Python 2 code (uses print statements below).
    msg=MSG.lower()
    # Strip a trailing question mark, if present.
    if msg[-1]=='?':
        msg=msg[:-1]
    tokens=nltk.word_tokenize(msg)
    # Remove every stop word (module-level `words` list) from the tokens;
    # the inner while handles repeated occurrences of the same word.
    for i in words:
        while (i in tokens):
            tokens.remove(i)
    lst=[]
    flag=0
    # Pick the knowledge base matching the question type (who/what/how);
    # other question words leave lst empty, so no answer is found.
    if tokens[0]=="who":
        lst=data_who
    elif tokens[0]=="what":
        lst=data_what
    elif tokens[0]=='how':
        lst=data_how
    #msg=str(tokens)
    # Rebuild the query from the remaining tokens, minus the question word.
    msg=' '.join(tokens[1:])
    # Print the answer of the first entry whose stored question fuzzily
    # matches the query at 60 or better; entries look like (question, answer).
    for i in lst:
        if fuzz.token_set_ratio(i[0],msg)>=60:
            print i[1]
            flag=1
            break
    if flag==0:
        print "Question Not found"
def computeSimilarity(s1, s2):
    """Return a dissimilarity score in [0.0, 1.0] (0.0 means identical).

    Takes the best of three fuzzywuzzy ratios and inverts it.
    """
    best_ratio = max(fuzz.ratio(s1, s2),
                     fuzz.token_sort_ratio(s1, s2),
                     fuzz.token_set_ratio(s1, s2))
    return 1.0 - 0.01 * best_ratio
def _match_torrent_name(self, movie_title, movie_year, torrent_title):
    ''' Checks if movie_title and torrent_title are a good match
    movie_title: str title of movie
    movie_year: str year of movie release
    torrent_title: str title of torrent

    Helper for rss_sync. Torrent indexers don't supply an imdbid the way
    NewzNab does, so the titles are compared instead. First requires the
    release year to appear in the torrent title, then normalizes both
    titles (':' and ' ' become '.', lowercased) and accepts a token set
    ratio strictly greater than 80.

    Returns bool on match success
    '''
    # A release that doesn't even contain the year can't be this movie.
    if movie_year not in torrent_title:
        return False
    normalized_title = movie_title.replace(':', '.').replace(' ', '.').lower()
    normalized_torrent = torrent_title.replace(' ', '.').replace(':', '.').lower()
    return fuzz.token_set_ratio(normalized_title, normalized_torrent) > 80
def main(conf):
    """Compute fuzzy features for the train/test sets and dump them as CSV."""
    dump_dir = conf['fuzzy.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['fuzzy.dataset'])
    logging.info('Loading test dataset')
    test_df = load_test_df(conf['fuzzy.dataset'])

    compute_features(train_df, test_df)

    # Feature columns written for both splits.
    feature_columns = [
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio,
    ]

    logging.info('Writing train dataset to disk')
    train_columns = [FieldsTrain.id, FieldsTrain.is_duplicate] + feature_columns
    train_df[train_columns].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_columns = [FieldsTest.test_id] + feature_columns
    test_df[test_columns].to_csv(join_path(dump_dir, 'test.csv'), index=False)
def compare_strings(cls, string1, string2, *, tolerance=None,
                    method='uwratio'):
    """
    Check whether the two strings are similar within the given tolerance.

    Return True when the chosen fuzzywuzzy scorer rates the pair at
    least (100 - tolerance), otherwise False.
    Use fuzzywuzzy (https://pypi.python.org/pypi/fuzzywuzzy).

    :param string1: str
    :param string2: str
    :param tolerance: number; defaults to the class-level tolerance,
                      which differs for strings containing digits
    :param method: str, one of: 'uwratio', 'partial_ratio',
                   'token_sort_ratio', 'token_set_ratio', 'ratio'
    :rtype: bool

    :Example:
    >>> MatchBlock.compare_strings('Beatles', 'The Beatles', tolerance=10)
    True
    >>> MatchBlock.compare_strings('AB', 'AC', tolerance=0, method='ratio')
    False
    """
    contains_digit = any(
        char.isdigit() for text in (string1, string2) for char in text)

    if tolerance is None:
        # Strings with digits get their own, usually stricter, tolerance.
        tolerance = (cls.str_number_tolerance if contains_digit
                     else cls.string_tolerance)

    # Abbreviation detection short-circuits for purely alphabetic input.
    if not contains_digit and cls.is_abbreviation(string1, string2):
        return True

    scorers = {'uwratio': fuzz.UWRatio,
               'partial_ratio': fuzz.partial_ratio,
               'token_sort_ratio': fuzz.token_sort_ratio,
               'token_set_ratio': fuzz.token_set_ratio,
               'ratio': fuzz.ratio}
    if method not in scorers:
        msg = 'wrong method, use available: {}'
        raise ValueError(msg.format(', '.join(sorted(scorers))))

    return scorers[method](string1, string2) >= 100 - tolerance
def compute_features(train_df, test_df):
    """Add one column per fuzzywuzzy scorer to both dataframes.

    For every scorer, applies it to the (question1, question2) pair of
    each row in *train_df* and *test_df* (columns named per Fields), and
    estimates the feature's quality on the train set via compute_quality.

    The original body repeated the same three-statement stanza seven
    times; it is collapsed here into a data-driven loop.

    :param train_df: train dataframe (mutated in place)
    :param test_df: test dataframe (mutated in place)
    :return: dict mapping 'quality_<feature>' to its computed quality
    """
    # (quality-key suffix, destination column, fuzzywuzzy scorer)
    feature_specs = [
        ('qratio', Fields.qratio, fuzz.QRatio),
        ('wratio', Fields.wratio, fuzz.WRatio),
        ('partial_ratio', Fields.partial_ratio, fuzz.partial_ratio),
        ('partial_token_set_ratio', Fields.partial_token_set_ratio,
         fuzz.partial_token_set_ratio),
        ('partial_token_sort_ratio', Fields.partial_token_sort_ratio,
         fuzz.partial_token_sort_ratio),
        ('token_set_ratio', Fields.token_set_ratio, fuzz.token_set_ratio),
        ('token_sort_ratio', Fields.token_sort_ratio, fuzz.token_sort_ratio),
    ]
    quality = {}
    for name, field, scorer in feature_specs:
        # Bind scorer as a default argument to avoid late-binding issues.
        train_df[field] = train_df.apply(
            lambda row, f=scorer: f(str(row[FieldsTrain.question1]),
                                    str(row[FieldsTrain.question2])),
            axis=1)
        test_df[field] = test_df.apply(
            lambda row, f=scorer: f(str(row[FieldsTest.question1]),
                                    str(row[FieldsTest.question2])),
            axis=1)
        quality['quality_' + name] = compute_quality(train_df, field)
    return quality