def autorsAmendment(self, autors):
    # If the author is a deputy, format the name so it can be looked up in the DB
    name = autors[0].strip()
    typeaut = self.typeAutor(name=name)
    if typeaut != 'grupo':
        best_ratio = 0
        member = None
        for memb in self.members:
            ratio = fuzz.token_sort_ratio(name, memb['nombre'])
            if ratio > best_ratio:
                member = memb
                best_ratio = ratio
        return member['nombre']
    else:
        return name
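The core pattern here, a best-match lookup over a list of names, can be exercised standalone. A minimal sketch, assuming fuzzywuzzy is installed and using a toy member list shaped like the {'nombre': ...} dicts above:

from fuzzywuzzy import fuzz

members = [{'nombre': 'María García López'}, {'nombre': 'Juan Pérez Ruiz'}]
query = 'garcia lopez, maria'
# token_sort_ratio ignores word order, so the surname-first query still wins
best = max(members, key=lambda m: fuzz.token_sort_ratio(query, m['nombre']))
print(best['nombre'])  # 'María García López'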
Python token_sort_ratio() usage examples
def extract_features(df):
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    df["cwc_min"] = list(map(lambda x: x[0], token_features))
    df["cwc_max"] = list(map(lambda x: x[1], token_features))
    df["csc_min"] = list(map(lambda x: x[2], token_features))
    df["csc_max"] = list(map(lambda x: x[3], token_features))
    df["ctc_min"] = list(map(lambda x: x[4], token_features))
    df["ctc_max"] = list(map(lambda x: x[5], token_features))
    df["last_word_eq"] = list(map(lambda x: x[6], token_features))
    df["first_word_eq"] = list(map(lambda x: x[7], token_features))
    df["abs_len_diff"] = list(map(lambda x: x[8], token_features))
    df["mean_len"] = list(map(lambda x: x[9], token_features))

    print("fuzzy features...")
    df["token_set_ratio"] = df.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    df["token_sort_ratio"] = df.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    df["fuzz_ratio"] = df.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    df["fuzz_partial_ratio"] = df.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    df["longest_substr_ratio"] = df.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    return df
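The fuzzy block of extract_features can be tried in isolation on a toy frame. A sketch assuming pandas and fuzzywuzzy are installed (preprocess and the token/substring helpers are defined elsewhere in the project and skipped here):

import pandas as pd
from fuzzywuzzy import fuzz

df = pd.DataFrame({
    "question1": ["how do i learn python", "what is machine learning"],
    "question2": ["what is the best way to learn python", "explain machine learning"],
})
df["token_sort_ratio"] = df.apply(
    lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
print(df["token_sort_ratio"].tolist())  # two scores in [0, 100]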
def evaluate_patch_pair(thresholds, lhs, rhs):
    left_message, left_diff = lhs
    right_message, right_diff = rhs

    left_diff_lines = left_diff.lines
    right_diff_lines = right_diff.lines
    diff_lines_ratio = min(left_diff_lines, right_diff_lines) / max(left_diff_lines, right_diff_lines)

    # Rate the similarity of the commit messages
    msg_rating = fuzz.token_sort_ratio(left_message, right_message) / 100

    # Skip the diff comparison if diff_lines_ratio is below 1%
    if diff_lines_ratio < 0.01:
        return SimRating(msg_rating, 0, diff_lines_ratio)

    # Rate the similarity of the diffs
    diff_rating = rate_diffs(thresholds, left_diff, right_diff)
    return SimRating(msg_rating, diff_rating, diff_lines_ratio)
def fuzzy_match_strings(ref, val):
    """
    Returns the matching score of two values.
    """
    if not ref or not val:
        return 0
    ref_q = to_q(ref)
    val_q = to_q(val)
    if ref_q or val_q:
        return 100 if ref_q == val_q else 0
    simplified_val = unidecode(val).lower()
    simplified_ref = unidecode(ref).lower()

    # token_sort_ratio is symmetric, so a single call suffices
    return fuzz.token_sort_ratio(simplified_val, simplified_ref)
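For example, accent and word-order differences are both absorbed by the unidecode/token-sort combination used above (unidecode package assumed, as in the function):

from fuzzywuzzy import fuzz
from unidecode import unidecode

a = unidecode("Pérez, José").lower()  # 'perez, jose'
b = unidecode("José Pérez").lower()   # 'jose perez'
print(fuzz.token_sort_ratio(a, b))    # 100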
def matchautorgroup(self, lists):
    candidates = self.members + self.groups
    res = []
    for element in lists:
        member = None
        best_ratio = 0
        for memb in candidates:
            ratio = fuzz.token_sort_ratio(element, memb['nombre'])
            if ratio > best_ratio:
                member = memb
                best_ratio = ratio
        res.append(member)
    return res
# gridding.py, from the project the-magical-csv-merge-machine (author: entrepreneur-interet-general)
def score_tokens(src, ref, translate_tokens):
    # Returns a score in [0, 100]
    if translate_tokens:
        return score_tokens(translate(src), translate(ref), False)
    aTokens = validateTokens(src)
    bTokens = validateTokens(ref)
    a2 = ' '.join(aTokens)
    b2 = ' '.join(bTokens)
    tokenSortRatio = fuzz.token_sort_ratio(a2, b2)
    if tokenSortRatio < 40:
        logging.debug('Rejected for TOKEN_SORT : {} / {}'.format(src, ref))
        return 0
    tokenSetRatio = fuzz.token_set_ratio(a2, b2)
    if tokenSetRatio < 50:
        logging.debug('Rejected for TOKEN_SET : {} / {}'.format(src, ref))
        return 0
    if REQUIRES_SHARED_PROPER_NOUN:
        aProper = ' '.join(filterProperNouns(aTokens))
        bProper = ' '.join(filterProperNouns(bTokens))
        # if len(aProper) > 3 and len(bProper) > 3:
        if len(aProper) > 0 or len(bProper) > 0:
            properNounSortRatio = fuzz.token_sort_ratio(aProper, bProper)
            if properNounSortRatio < 80:
                logging.debug('Rejected for PROPER_NOUN_SORT : {} / {}'.format(src, ref))
                return 0
            properNounSetRatio = fuzz.token_set_ratio(aProper, bProper)
            if properNounSetRatio < 60:
                logging.debug('Rejected for PROPER_NOUN_SET : {} / {}'.format(src, ref))
                return 0
    return tokenSortRatio * tokenSetRatio / 100
def best_string_mapping(threshold, left_list, right_list):
    """
    Find the closest mapping with the best weight between two lists of strings.

    Example:
          List A    List B
       0: 'abc'     'abc'
       1: 'cde'     'cde'
       2: 'fgh'     'fgh'
       3:           'fgj'

    Each element of List A is mapped to an element of List B, subject to the
    given threshold. As a[{0,1,2}] == b[{0,1,2}], those values are mapped
    directly. Additionally, a[2] is also mapped to b[3] if the threshold is
    low enough (e.g. 0.5).
    """
    def injective_map(ll, rl, inverse_result=False):
        ret = dict()
        for l_entry in ll:
            for r_entry in rl:
                if l_entry == r_entry:
                    sim = 1
                else:
                    sim = fuzz.token_sort_ratio(l_entry, r_entry) / 100
                if sim < threshold:
                    continue
                # Keep only the best-scoring match for each left entry
                if l_entry in ret:
                    _, old_sim = ret[l_entry]
                    if sim < old_sim:
                        continue
                ret[l_entry] = r_entry, sim
        return {(r, l) if inverse_result else (l, r) for l, (r, _) in ret.items()}

    return injective_map(left_list, right_list) | injective_map(right_list, left_list, True)
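Running the docstring example as a sketch (fuzzywuzzy import assumed):

from fuzzywuzzy import fuzz

pairs = best_string_mapping(0.5, ['abc', 'cde', 'fgh'], ['abc', 'cde', 'fgh', 'fgj'])
# Exact matches map directly; 'fgh' also maps to 'fgj' because
# fuzz.token_sort_ratio('fgh', 'fgj') / 100 = 0.67 >= 0.5
print(sorted(pairs))  # [('abc', 'abc'), ('cde', 'cde'), ('fgh', 'fgh'), ('fgh', 'fgj')]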
def rate_diffs(thresholds, l_diff, r_diff):
    filename_compare = best_string_mapping(thresholds.filename, l_diff.patches.keys(), r_diff.patches.keys())
    levenshteins = []

    def compare_hunks(left, right):
        # This case happens, for example, if both hunks remove empty newlines
        if left == right:
            return 100
        return fuzz.token_sort_ratio(left, right)

    for l_filename, r_filename in filename_compare:
        l_hunks = l_diff.patches[l_filename]
        r_hunks = r_diff.patches[r_filename]
        levenshtein = []
        hunk_compare = best_string_mapping(thresholds.heading,
                                           l_hunks.keys(), r_hunks.keys())
        for l_hunk_heading, r_hunk_heading in hunk_compare:
            lhunk = l_hunks[l_hunk_heading]
            rhunk = r_hunks[r_hunk_heading]
            if lhunk.deletions and rhunk.deletions:
                levenshtein.append(compare_hunks(lhunk.deletions,
                                                 rhunk.deletions))
            if lhunk.insertions and rhunk.insertions:
                levenshtein.append(compare_hunks(lhunk.insertions,
                                                 rhunk.insertions))
        if levenshtein:
            levenshteins.append(mean(levenshtein))

    if not levenshteins:
        levenshteins = [0]
    diff_rating = mean(levenshteins) / 100
    return diff_rating
def preevaluate_filenames(thresholds, right_files, left_file):
    candidates = []
    for right_file in right_files:
        if thresholds.filename >= 1.0:
            # A threshold of 1.0 demands an exact filename match
            if left_file != right_file:
                continue
        else:
            sim = fuzz.token_sort_ratio(left_file, right_file) / 100
            if sim < thresholds.filename:
                continue
        candidates.append(right_file)
    return left_file, candidates
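A quick sketch of the prefiltering; the Thresholds namedtuple is a hypothetical stand-in for the project's thresholds object:

from collections import namedtuple
from fuzzywuzzy import fuzz

Thresholds = namedtuple('Thresholds', ['filename'])
left, candidates = preevaluate_filenames(Thresholds(filename=0.8),
                                         ['drivers/net/eth.c', 'mm/slab.c'],
                                         'drivers/net/ethernet.c')
print(candidates)  # ['drivers/net/eth.c'] survives the 0.8 threshold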
def strict_compare_strings(string_one, string_two):
    # Take the best score across all four fuzzywuzzy scorers
    return max(fuzz.ratio(string_one, string_two),
               fuzz.partial_ratio(string_one, string_two),
               fuzz.token_sort_ratio(string_one, string_two),
               fuzz.token_set_ratio(string_one, string_two))
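A quick check of how the scorers interplay (fuzzywuzzy import assumed): plain ratio is lowered by reordered words, but token_sort_ratio reaches 100, so the maximum does too:

from fuzzywuzzy import fuzz

print(strict_compare_strings("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"))  # 100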
def compare_strings(string_one, string_two):
    # Same as strict_compare_strings, but without partial_ratio
    return max(fuzz.ratio(string_one, string_two),
               fuzz.token_sort_ratio(string_one, string_two),
               fuzz.token_set_ratio(string_one, string_two))
def get_CUL_score(record_elems, resp_elems):
    if record_elems is None or resp_elems is None:
        return None
    # Normalize both arguments to lists so one code path covers the
    # str/str, str/list, list/str and list/list combinations.
    if isinstance(record_elems, str):
        record_elems = [record_elems]
    if isinstance(resp_elems, str):
        resp_elems = [resp_elems]
    # Compare scores as integers: taking max() of their string forms
    # would be lexicographic (e.g. '9' > '100').
    scores = [fuzz.token_sort_ratio(rec, resp)
              for rec in record_elems
              for resp in resp_elems]
    if not scores:
        return None
    return str(max(scores))
def computeSimilarity(s1, s2):
    # Despite the name, this returns a distance in [0, 1]:
    # 0.0 for identical strings, values near 1.0 for unrelated ones.
    return 1.0 - (0.01 * max(
        fuzz.ratio(s1, s2),
        fuzz.token_sort_ratio(s1, s2),
        fuzz.token_set_ratio(s1, s2)))
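A short sanity check of the distance behaviour:

print(computeSimilarity("apple", "apple"))  # 0.0, identical strings
print(computeSimilarity("apple", "xyz"))    # 1.0, no character overlap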
def sim(str1, str2):
    return fuzz.token_sort_ratio(str1, str2) * 0.01
def match_fuzzy(self, frame):
    pattern = process.extractOne(
        frame.name, self._index_fuzzy,
        scorer=fuzz.token_sort_ratio)
    if not pattern or pattern[1] < MATCH_FUZZY_THRESHOLD:
        return frame, set()
    return frame, self._handlers[pattern[0]]
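The same scorer-selection pattern works standalone with fuzzywuzzy's process module:

from fuzzywuzzy import fuzz, process

choices = ["new york jets", "new york giants", "dallas cowboys"]
print(process.extractOne("jets new york", choices, scorer=fuzz.token_sort_ratio))
# ('new york jets', 100)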
def main(conf):
    dump_dir = conf['fuzzy.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['fuzzy.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['fuzzy.dataset'])

    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_df[[
        FieldsTest.test_id,
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
def is_fuzzy_matching_valid(self, stock_name, current_stock):
    # Get the token sort ratio from fuzzywuzzy
    ratio = fuzz.token_sort_ratio(stock_name, current_stock)
    return ratio > 95
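Because token_sort_ratio lowercases and strips punctuation before sorting tokens, near-identical names clear the 95 bar:

from fuzzywuzzy import fuzz

print(fuzz.token_sort_ratio("Apple Inc.", "apple inc") > 95)  # True (score is 100)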
def compare_strings(cls, string1, string2, *, tolerance=None,
                    method='uwratio'):
    """
    Check if the strings provided have a similarity ratio within the
    specified tolerance.

    Return True if yes, otherwise return False.

    Use fuzzywuzzy (https://pypi.python.org/pypi/fuzzywuzzy).

    :param string1: str
    :param string2: str
    :param tolerance: number
    :param method: str, one of: 'uwratio', 'partial_ratio',
                   'token_sort_ratio', 'token_set_ratio', 'ratio'
    :rtype: bool

    :Example:
    >>> MatchBlock.compare_strings('Beatles', 'The Beatles', tolerance=10)
    True
    >>> MatchBlock.compare_strings('AB', 'AC', tolerance=0, method='ratio')
    False
    """
    str_number = any(
        char.isdigit() for string in (string1, string2) for char in string)

    if tolerance is None:
        if str_number:
            tolerance = cls.str_number_tolerance
        else:
            tolerance = cls.string_tolerance

    if not str_number:
        if cls.is_abbreviation(string1, string2):
            return True

    methods = {'uwratio': fuzz.UWRatio,
               'partial_ratio': fuzz.partial_ratio,
               'token_sort_ratio': fuzz.token_sort_ratio,
               'token_set_ratio': fuzz.token_set_ratio,
               'ratio': fuzz.ratio}
    if method not in methods:
        msg = 'wrong method, use available: {}'
        raise ValueError(msg.format(', '.join(sorted(methods))))

    return methods[method](string1, string2) >= 100 - tolerance
def compute_features(train_df, test_df):
    train_df[Fields.qratio] = train_df.apply(
        lambda row: fuzz.QRatio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.qratio] = test_df.apply(
        lambda row: fuzz.QRatio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_qratio = compute_quality(train_df, Fields.qratio)

    train_df[Fields.wratio] = train_df.apply(
        lambda row: fuzz.WRatio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.wratio] = test_df.apply(
        lambda row: fuzz.WRatio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_wratio = compute_quality(train_df, Fields.wratio)

    train_df[Fields.partial_ratio] = train_df.apply(
        lambda row: fuzz.partial_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.partial_ratio] = test_df.apply(
        lambda row: fuzz.partial_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_partial_ratio = compute_quality(train_df, Fields.partial_ratio)

    train_df[Fields.partial_token_set_ratio] = train_df.apply(
        lambda row: fuzz.partial_token_set_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.partial_token_set_ratio] = test_df.apply(
        lambda row: fuzz.partial_token_set_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_partial_token_set_ratio = compute_quality(train_df, Fields.partial_token_set_ratio)

    train_df[Fields.partial_token_sort_ratio] = train_df.apply(
        lambda row: fuzz.partial_token_sort_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.partial_token_sort_ratio] = test_df.apply(
        lambda row: fuzz.partial_token_sort_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_partial_token_sort_ratio = compute_quality(train_df, Fields.partial_token_sort_ratio)

    train_df[Fields.token_set_ratio] = train_df.apply(
        lambda row: fuzz.token_set_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.token_set_ratio] = test_df.apply(
        lambda row: fuzz.token_set_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_token_set_ratio = compute_quality(train_df, Fields.token_set_ratio)

    train_df[Fields.token_sort_ratio] = train_df.apply(
        lambda row: fuzz.token_sort_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.token_sort_ratio] = test_df.apply(
        lambda row: fuzz.token_sort_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_token_sort_ratio = compute_quality(train_df, Fields.token_sort_ratio)

    quality = dict(
        quality_qratio=quality_qratio,
        quality_wratio=quality_wratio,
        quality_partial_ratio=quality_partial_ratio,
        quality_partial_token_set_ratio=quality_partial_token_set_ratio,
        quality_partial_token_sort_ratio=quality_partial_token_sort_ratio,
        quality_token_set_ratio=quality_token_set_ratio,
        quality_token_sort_ratio=quality_token_sort_ratio
    )
    return quality