def extract_features(df):
    """Add token-based and fuzzy-matching feature columns to ``df`` in place.

    The "question1" and "question2" columns are first NaN-filled with ""
    and run through ``preprocess``; every feature is computed from the
    cleaned pair of questions of each row.

    :param df: frame with "question1" and "question2" columns
    :return: the same ``df`` object, with the feature columns added
    """
    for question_column in ("question1", "question2"):
        df[question_column] = df[question_column].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(
        lambda row: get_token_features(row["question1"], row["question2"]),
        axis=1)
    # Position i of each per-row result of get_token_features feeds the
    # i-th column listed here.
    token_columns = (
        "cwc_min",
        "cwc_max",
        "csc_min",
        "csc_max",
        "ctc_min",
        "ctc_max",
        "last_word_eq",
        "first_word_eq",
        "abs_len_diff",
        "mean_len",
    )
    for position, column in enumerate(token_columns):
        df[column] = [features[position] for features in token_features]

    print("fuzzy features..")
    pairwise_scorers = (
        ("token_set_ratio", fuzz.token_set_ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
        ("fuzz_ratio", fuzz.QRatio),
        ("fuzz_partial_ratio", fuzz.partial_ratio),
        ("longest_substr_ratio", get_longest_substr_ratio),
    )
    for column, scorer in pairwise_scorers:
        # scorer is bound as a default so each lambda keeps its own scorer.
        df[column] = df.apply(
            lambda row, scorer=scorer: scorer(row["question1"], row["question2"]),
            axis=1)
    return df
python类partial_ratio()的实例源码
def get_combined_fuzz_score(a, b, **kwargs):
    """Combine the simple and partial fuzzy ratios of two names.

    Both inputs are passed through ``clean_name`` first. Each ratio is
    multiplied by its weight, and the product of the two weighted ratios is
    divided by 10000.

    :param a: first name
    :param b: second name
    :param kwargs: optional ``simple`` and ``partial`` weights (each
        converted with ``float`` and defaulting to 1)
    :return: the combined score as a float
    """
    name_a = clean_name(a)
    name_b = clean_name(b)

    simple_weight = float(kwargs.get('simple', 1))
    partial_weight = float(kwargs.get('partial', 1))

    weighted_simple = fuzz.ratio(name_a, name_b) * simple_weight
    weighted_partial = fuzz.partial_ratio(name_a, name_b) * partial_weight
    return float(weighted_simple) * float(weighted_partial) / float(10000)
def search(self, targets, partial=True, fuzzy=False):
    """Return the instances whose name or alias matches any target.

    Each candidate name (``instance.name`` plus any ``instance.aliases``)
    is compared case-insensitively with each target:

    * equal names score 100;
    * when ``partial`` is set, a name containing the target scores 99;
    * when ``fuzzy`` is set, the ``fuzz.partial_ratio`` score is recorded
      as well if it is above 85 or the name contains the target.

    :param targets: iterable of host strings to look for
    :param partial: also accept substring matches
    :param fuzzy: also accept fuzzy matches
    :return: list of matched instances without duplicates, in ascending
        score order (an instance keeps the position of its lowest score)
    """
    all_instances = self.instances()
    matched = set()
    for host in targets:
        host_lc = host.lower()
        for instance in all_instances:
            names = [instance.name]
            if instance.aliases is not None:
                names += list(instance.aliases)
            for name in names:
                name_lc = name.lower()
                if host_lc == name_lc:
                    matched.add((100, instance))
                elif partial and host_lc in name_lc:
                    matched.add((99, instance))
                if fuzzy:
                    score = fuzz.partial_ratio(host_lc, name_lc)
                    if score > 85 or host_lc in name_lc:
                        matched.add((score, instance))
    # Sort on the score alone: comparing whole (score, instance) tuples
    # falls back to comparing the instances on score ties, which raises
    # TypeError for objects that define no ordering.
    ranked = sorted(matched, key=lambda pair: pair[0])
    # it is possible for the same instance to be matched several times; if
    # so, it should only appear on the return list once
    unique = collections.OrderedDict((instance, None) for _, instance in ranked)
    return list(unique.keys())
gridding.py 文件源码
项目:the-magical-csv-merge-machine
作者: entrepreneur-interet-general
项目源码
文件源码
阅读 28
收藏 0
点赞 0
评论 0
def score_chars(src, ref):
    """Score the character-level similarity of ``src`` and ``ref``.

    Returns a score in [0, 100] (per the original author's note):

    * 100 when the acronym of one ASCII-folded string equals the other
      string upper-cased;
    * 0 when the ``fuzz.ratio`` of the ``justCase`` forms is below 20, or
      their ``fuzz.partial_ratio`` is below 30;
    * otherwise the product of both ratios divided by 100.

    :param src: source string
    :param ref: reference string
    :return: the similarity score
    """
    a0 = toASCII(src)
    b0 = toASCII(ref)
    a1 = acronymizePhrase(a0)
    b1 = acronymizePhrase(b0)
    if len(a1) > 0 and len(b1) > 0 and (a1 == b0.upper() or a0.upper() == b1):
        # Log the ASCII forms that were actually compared; the justCase
        # forms a/b are only computed further down, so referencing them
        # here raised UnboundLocalError.
        logging.debug('Accepted for ACRO : {} / {}'.format(a0, b0))
        return 100
    a = justCase(src)
    b = justCase(ref)
    absCharRatio = fuzz.ratio(a, b)
    if absCharRatio < 20:
        logging.debug('Rejected for ABS : {} / {}'.format(a, b))
        return 0
    partialCharRatio = fuzz.partial_ratio(a, b)
    if partialCharRatio < 30:
        logging.debug('Rejected for PARTIAL : {} / {}'.format(a, b))
        return 0
    return absCharRatio * partialCharRatio / 100
def fuzzy_feats(train_in, test_in, qcolumns = ['question1', 'question2'], append=''):
    """Compute four fuzzywuzzy features for the train and test frames.

    Only the ``qcolumns`` columns of copies of the inputs are kept; the
    inputs themselves are not modified. For each frame the columns
    ``fuzz_r``, ``fuzz_pr``, ``fuzz_tsr`` and ``fuzz_tsor`` (each suffixed
    with ``append``) are added, holding ``ratio``, ``partial_ratio``,
    ``partial_token_set_ratio`` and ``partial_token_sort_ratio`` of the
    two question columns.

    :param train_in: training frame
    :param test_in: test frame
    :param qcolumns: names of the two text columns to compare
    :param append: suffix added to every generated column name
    :return: ``(train, test)`` tuple of the new frames
    """
    from fuzzywuzzy import fuzz
    import pandas as pd

    scorers = (
        ('fuzz_r', fuzz.ratio),
        ('fuzz_pr', fuzz.partial_ratio),
        ('fuzz_tsr', fuzz.partial_token_set_ratio),
        ('fuzz_tsor', fuzz.partial_token_sort_ratio),
    )

    train = train_in.copy().loc[:,qcolumns]
    test = test_in.copy().loc[:,qcolumns]
    for frame in (train, test):
        for prefix, scorer in scorers:
            # scorer is bound as a default so each lambda keeps its own scorer.
            frame[prefix + append] = frame.apply(
                lambda row, scorer=scorer: scorer(row[qcolumns[0]], row[qcolumns[1]]),
                axis = 1)
    return (train, test)
def filterModule(self, module):
    """Return whether ``module`` passes the filters in ``self.modfilter``.

    * ``"type"`` filter: the module must have an input pin (``dir`` ==
      "input") or output pin (``dir`` == "output") whose ``pintype`` equals
      the requested type; any other ``dir`` rejects the module.
    * ``"text"`` filter: an empty text accepts everything; otherwise the
      module is accepted if the text occurs in ``module.name``, or if the
      larger of ``fuzz.ratio`` against the name and ``fuzz.partial_ratio``
      against ``module.desc`` exceeds 40.

    With no text filter every module that survived the type filter is
    accepted.

    :param module: object exposing ``name``, ``desc``, ``inputDefs`` and
        ``outputDefs``
    :return: True to keep the module, False to hide it
    """
    if "type" in self.modfilter:
        type_filter = self.modfilter["type"]
        if type_filter["dir"] == "input":
            pins = module.inputDefs
        elif type_filter["dir"] == "output":
            pins = module.outputDefs
        else:
            pins = ()
        if not any(pin.pintype == type_filter["type"] for pin in pins):
            return False

    if "text" not in self.modfilter:
        return True  # Don't filter by text? Return all remaining

    text = self.modfilter["text"]
    # The emptiness check has to come before the substring test: "" is a
    # substring of every name (so the check was unreachable afterwards) and
    # ``None in module.name`` raises TypeError.
    if not text:  # Text entry is empty
        return True
    if text in module.name:
        return True
    ratio = fuzz.ratio(text, module.name)
    ratio = max(ratio, fuzz.partial_ratio(text, module.desc))
    return ratio > 40
def similarity(n1, n2):
    """
    Returns the mean of the collected fuzzy scores for the two entities.

    Only one score is collected at present: the partial_ratio of ``n1``
    against ``n2``, so the mean equals that score converted to float.
    """
    scores = [fuzz.partial_ratio(n1, n2)]
    total = float(sum(scores))
    count = float(len(scores))
    return total / count
preprocess_fields_v3.py 文件源码
项目:the-magical-csv-merge-machine
作者: entrepreneur-interet-general
项目源码
文件源码
阅读 26
收藏 0
点赞 0
评论 0
def address_filter_score(src, ref):
    """Return partial_ratio + ratio of the two ``case_phrase``-normalised inputs."""
    cased_src = case_phrase(src)
    cased_ref = case_phrase(ref)
    partial_score = fuzz.partial_ratio(cased_src, cased_ref)
    full_score = fuzz.ratio(cased_src, cased_ref)
    return partial_score + full_score
# Acronym handling
def fuzzy_matches_in_sentence(self, skill, sentence):
    """Yield a CandidateSkill for each n-gram of ``sentence`` close to ``skill``.

    The n-gram size is the number of whitespace-separated words in
    ``skill``. Each n-gram from ``self.ngrams`` is joined with ``b" "`` and
    kept when its ``fuzz.partial_ratio`` against ``skill`` is above 88.

    :param skill: skill name to look for
    :param sentence: sentence to scan; it is decoded as UTF-8 for the
        reported context, so a bytes-like value is expected
    :return: generator of CandidateSkill
    """
    words_in_skill = len(skill.split())
    joined_ngrams = [
        b" ".join(gram) for gram in self.ngrams(sentence, words_in_skill)
    ]
    for candidate in joined_ngrams:
        score = fuzz.partial_ratio(skill, candidate)
        if score <= 88:
            continue
        yield CandidateSkill(
            skill_name=skill,
            matched_skill=candidate,
            confidence=score,
            context=sentence.decode('utf-8')
        )
def candidate_skills(self, job_posting):
    """Yield CandidateSkill matches for the lookup skills found in a posting.

    ``job_posting.text`` is split into sentences by ``self.ie_preprocess``.
    For every skill in ``self.lookup``:

    * a one-word skill is matched exactly (case-insensitive, between word
      boundaries) and reported with confidence 100;
    * any other skill is compared with the UTF-8 encoded sentence using
      ``fuzz.partial_ratio``; sentences scoring above 88 are handed to
      ``self.fuzzy_matches_in_sentence`` and its matches are re-yielded.

    :param job_posting: object with a ``text`` attribute
    :return: generator of CandidateSkill
    """
    document = job_posting.text
    sentences = self.ie_preprocess(document)
    for skill in self.lookup:
        len_skill = len(skill.split())
        if len_skill == 1:
            # Exact matching. The skill is escaped so that regex
            # metacharacters in its name are matched literally instead of
            # raising re.error or changing the pattern; compiling once per
            # skill keeps the work out of the sentence loop.
            pattern = re.compile(
                r'\b' + re.escape(skill) + r'\b', re.IGNORECASE)
            for sent in sentences:
                if pattern.search(sent):
                    yield CandidateSkill(
                        skill_name=skill,
                        matched_skill=skill,
                        confidence=100,
                        context=sent
                    )
        else:
            # Fuzzy matching works on the encoded sentence, as
            # fuzzy_matches_in_sentence decodes it again for the context.
            for sent in sentences:
                encoded = sent.encode('utf-8')
                ratio = fuzz.partial_ratio(skill, encoded)
                # You can adjust the partial of matching here:
                # 100 => exact matching 0 => no matching
                if ratio > 88:
                    for match in self.fuzzy_matches_in_sentence(skill, encoded):
                        yield match
def is_aligned_arg(x, y):
    """
    Return whether these two arguments are aligned: either one fuzzily
    contains the other, or enough of their non-stop words share WordNet
    lemmas.
    :param x: the first argument
    :param y: the second argument
    :return: Whether they are aligned
    """
    # Allow partial matching (padding with spaces on both sides)
    if fuzz.partial_ratio(' ' + x + ' ', ' ' + y + ' ') == 100:
        return True

    def content_words(words):
        # Keep only the words that nlp does not flag as stop words
        return [w for w in words if not nlp.is_stop(w)]

    def synonym_set(word):
        # Lower-cased lemma names (underscores -> spaces) of all synsets of word
        return set([lemma.lower().replace('_', ' ')
                    for synset in wn.synsets(word)
                    for lemma in synset.lemma_names()])

    def shares_content(first, second):
        # True when the two synonym sets have a non-stop word in common
        return len(content_words(first.intersection(second))) > 0

    x_words = content_words(x.split())
    y_words = content_words(y.split())
    if not x_words or not y_words:
        return False

    x_synonyms = [synonym_set(w) for w in x_words]
    y_synonyms = [synonym_set(w) for w in y_words]

    # One word - check whether there is intersection between synsets
    if len(x_synonyms) == 1 and len(y_synonyms) == 1 and \
            shares_content(x_synonyms[0], y_synonyms[0]):
        return True

    # More than one word - count the (x word, y word) pairs that share a
    # synonym and compare with 75% of the longer argument's word count
    aligned_pairs = sum(1 for s1 in x_synonyms for s2 in y_synonyms
                        if shares_content(s1, s2))
    return aligned_pairs >= 0.75 * max(len(x_synonyms), len(y_synonyms))
def strict_compare_strings(string_one, string_two):
    """Return the highest of four fuzz scores for the two strings.

    The scorers tried are ``ratio``, ``partial_ratio``,
    ``token_sort_ratio`` and ``token_set_ratio``; the result is never
    below 0.
    """
    scorers = (
        fuzz.ratio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.token_set_ratio,
    )
    best = 0
    for scorer in scorers:
        best = max(best, scorer(string_one, string_two))
    return best
def fuzzy_title(self, titles):
    ''' Score and remove results based on title match
    titles: list of titles to match against

    If titles is None or an empty list every result is treated as a
    perfect match and scores +20.

    Otherwise iterates through self.results and:
        keeps results of type 'import' with +20 points;
        keeps any other result whose best fuzzy title match is > 70 and
            adds best_match / 5 points to ['score'];
        removes every remaining result.

    Replaces self.results with the kept entries.

    Does not return
    '''
    logging.info(u'Checking title match.')

    kept = []
    # `not titles` covers both [] and None; comparing with == [] let None
    # through to the matching loop, where iterating it raised TypeError.
    if not titles:
        for result in self.results:
            result['score'] += 20
            kept.append(result)
    else:
        for result in self.results:
            if result['type'] == 'import' and result not in kept:
                result['score'] += 20
                kept.append(result)
                continue
            test = Url.encode(result['title'])
            matches = [fuzz.partial_ratio(Url.encode(title), test) for title in titles]
            best = max(matches)
            if best > 70:
                result['score'] += (best / 5)
                kept.append(result)
            else:
                logging.debug(u'{} best title match was {}%, removing search result.'.format(test, best))
    self.results = kept
    logging.info(u'Keeping {} results.'.format(len(self.results)))
def fuzzy_match(self, items, test):
    ''' Fuzzy matches title with predb rss titles
    :param items: list of titles in predb rss
    :param test: str to match to rss titles

    Returns True as soon as one of 'items' has a partial_ratio above 50
    against 'test', otherwise False
    '''
    return any(fuzz.partial_ratio(candidate, test) > 50 for candidate in items)
def reverseLinking(sent, text_candidate):
    """Locate the span of ``sent`` that mentions one of ``text_candidate``.

    Candidates are first tried longest-first as exact, whitespace-delimited
    matches. If none matches, the best candidate according to
    ``process.extractOne`` (scored with ``fuzz.partial_ratio``) is compared
    with the n-grams of the sentence and the closest n-gram is selected.

    :param sent: whitespace-tokenisable sentence
    :param text_candidate: collection of candidate strings (may be None)
    :return: ``(entity_text, label, exact_match)`` where ``entity_text`` is
        the selected tokens joined by spaces, ``label`` is the
        space-joined "I"/"O" tag per token and ``exact_match`` tells
        whether the exact search succeeded. NOTE: when there are no
        candidates, or fuzzy extraction fails, ``entity_text`` is
        '<UNK>' and ``label`` is returned as a list of "O" tags rather
        than a joined string.
    """
    tokens = sent.split()
    label = ["O"] * len(tokens)
    text_attention_indices = None
    exact_match = False
    if text_candidate is None or len(text_candidate) == 0:
        return '<UNK>', label, exact_match
    # sorted by length, so the longest exact mention wins
    for text in sorted(text_candidate, key=len, reverse=True):
        pattern = r'(^|\s)(%s)($|\s)' % (re.escape(text))
        if re.search(pattern, sent):
            text_attention_indices = get_indices(tokens, text.split())
            break
    if text_attention_indices is not None:
        exact_match = True
        for i in text_attention_indices:
            label[i] = 'I'
    else:
        try:
            v, _score = process.extractOne(sent, text_candidate, scorer=fuzz.partial_ratio)
        except Exception:
            # Best-effort fallback, but no longer a bare except: that also
            # swallowed KeyboardInterrupt and SystemExit.
            print("Extraction Error with FuzzyWuzzy : {} || {}".format(sent, text_candidate))
            return '<UNK>', label, exact_match
        v = v.split()
        n_gram_candidate = get_ngram(tokens)
        n_gram_candidate = sorted(n_gram_candidate, key=lambda x: fuzz.ratio(x[0], v), reverse=True)
        # Each n-gram entry is indexed as (content, start, end); tag the
        # half-open token range of the best one.
        top = n_gram_candidate[0]
        for i in range(top[1], top[2]):
            label[i] = 'I'
    entity_text = " ".join(t for l, t in zip(label, tokens) if l == 'I')
    label = " ".join(label)
    return entity_text, label, exact_match
def main(conf):
    """Compute the fuzzy features and dump them as train.csv / test.csv.

    The datasets are loaded from ``conf['fuzzy.dataset']``, enriched by
    ``compute_features`` and written (without the index) into the
    ``conf['fuzzy.dump.dir']`` directory.

    :param conf: mapping providing 'fuzzy.dump.dir' and 'fuzzy.dataset'
    """
    dump_dir = conf['fuzzy.dump.dir']
    makedirs(dump_dir)

    dataset = 'fuzzy.dataset'
    logging.info('Loading train dataset')
    train_df = load_train_df(conf[dataset])
    logging.info('Loading test dataset')
    test_df = load_test_df(conf[dataset])

    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    # Feature columns shared by both dumps, in output order
    feature_columns = [
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio,
    ]
    train_columns = [FieldsTrain.id, FieldsTrain.is_duplicate] + feature_columns
    train_path = join_path(dump_dir, 'train.csv')
    train_df[train_columns].to_csv(train_path, index=False)

    logging.info('Writing test dataset to disk')
    test_columns = [FieldsTest.test_id] + feature_columns
    test_path = join_path(dump_dir, 'test.csv')
    test_df[test_columns].to_csv(test_path, index=False)
def compare_strings(cls, string1, string2, *, tolerance=None,
                    method='uwratio'):
    """
    Tell whether two strings are similar enough, i.e. whether their
    similarity ratio is at least ``100 - tolerance``.

    When ``tolerance`` is not given it defaults to
    ``cls.str_number_tolerance`` if either string contains a digit, else
    to ``cls.string_tolerance``. Digit-free strings recognised by
    ``cls.is_abbreviation`` are accepted immediately, before ``method`` is
    looked at.

    Use fuzzywuzzy (https://pypi.python.org/pypi/fuzzywuzzy).

    :param string1: str
    :param string2: str
    :param tolerance: number
    :param method: str, one of: 'uwratio', 'partial_ratio',
                                'token_sort_ratio', 'token_set_ratio',
                                'ratio'
    :rtype: bool
    :raises ValueError: if ``method`` is not one of the names above

    :Example:

    >>> MatchBlock.compare_strings('Beatles', 'The Beatles', tolerance=10)
    True
    >>> MatchBlock.compare_strings('AB', 'AC', tolerance=0, method='ratio')
    False
    """
    has_digit = (any(ch.isdigit() for ch in string1)
                 or any(ch.isdigit() for ch in string2))

    if tolerance is None:
        tolerance = (cls.str_number_tolerance if has_digit
                     else cls.string_tolerance)

    if not has_digit and cls.is_abbreviation(string1, string2):
        return True

    scorers = {'uwratio': fuzz.UWRatio,
               'partial_ratio': fuzz.partial_ratio,
               'token_sort_ratio': fuzz.token_sort_ratio,
               'token_set_ratio': fuzz.token_set_ratio,
               'ratio': fuzz.ratio}
    scorer = scorers.get(method)
    if scorer is None:
        msg = 'wrong method, use available: {}'
        raise ValueError(msg.format(', '.join(sorted(scorers))))

    threshold = 100 - tolerance
    return scorer(string1, string2) >= threshold
def partial_match(x, y):
    """
    Return whether these two mentions partially match: either one fuzzily
    contains the other, or their non-stop words can be aligned through
    shared WordNet lemmas.
    :param x: the first mention
    :param y: the second mention
    :return: Whether they are aligned
    """
    # Allow partial matching (padding with spaces on both sides)
    if fuzz.partial_ratio(' ' + x + ' ', ' ' + y + ' ') == 100:
        return True

    def synonym_set(word):
        # Lower-cased lemma names (underscores -> spaces) of all synsets of word
        return set([lemma.lower().replace('_', ' ')
                    for synset in wn.synsets(word)
                    for lemma in synset.lemma_names()])

    def overlap(first, second):
        # Number of non-stop words common to the two synonym sets
        return len([w for w in first.intersection(second) if not is_stop(w)])

    x_words = [w for w in x.split() if not is_stop(w)]
    y_words = [w for w in y.split() if not is_stop(w)]
    if not x_words or not y_words:
        return False

    x_synonyms = [synonym_set(w) for w in x_words]
    y_synonyms = [synonym_set(w) for w in y_words]

    # One word - check whether there is intersection between synsets
    if len(x_synonyms) == 1 and len(y_synonyms) == 1 and \
            overlap(x_synonyms[0], y_synonyms[0]) > 0:
        return True

    # More than one word - align words from x with words from y.
    # One row per y word, one column per x word; overlaps are negated so
    # that the minimum-cost assignment picks the largest overlaps.
    overlap_rows = [np.array([overlap(s1, s2) for s1 in x_synonyms])
                    for s2 in y_synonyms]
    cost = -np.vstack(overlap_rows)
    m = Munkres()
    cost = pad_to_square(cost)
    assignment = m.compute(cost)

    # Compute the average score of the alignment
    average_score = np.mean([-cost[row, col] for row, col in assignment])
    if average_score >= 0.75:
        return True
    return False
def compute_features(train_df, test_df):
    """Add seven fuzzywuzzy feature columns to both frames, in place.

    For every feature the scorer is applied row-wise to the stringified
    question pair of the train frame and of the test frame, then
    ``compute_quality`` is evaluated on the train frame for that feature.

    :param train_df: training frame (columns named by ``FieldsTrain``)
    :param test_df: test frame (columns named by ``FieldsTest``)
    :return: dict mapping ``quality_<feature>`` to the value returned by
        ``compute_quality`` for that feature
    """
    # (key in the returned dict, output column, fuzzywuzzy scorer)
    feature_specs = (
        ('quality_qratio', Fields.qratio, fuzz.QRatio),
        ('quality_wratio', Fields.wratio, fuzz.WRatio),
        ('quality_partial_ratio', Fields.partial_ratio, fuzz.partial_ratio),
        ('quality_partial_token_set_ratio', Fields.partial_token_set_ratio,
         fuzz.partial_token_set_ratio),
        ('quality_partial_token_sort_ratio', Fields.partial_token_sort_ratio,
         fuzz.partial_token_sort_ratio),
        ('quality_token_set_ratio', Fields.token_set_ratio, fuzz.token_set_ratio),
        ('quality_token_sort_ratio', Fields.token_sort_ratio, fuzz.token_sort_ratio),
    )

    def score_rows(frame, schema, scorer):
        # Row-wise score of the two question columns named by schema
        return frame.apply(
            lambda row: scorer(str(row[schema.question1]), str(row[schema.question2])),
            axis=1)

    quality = {}
    for quality_key, field, scorer in feature_specs:
        train_df[field] = score_rows(train_df, FieldsTrain, scorer)
        test_df[field] = score_rows(test_df, FieldsTest, scorer)
        quality[quality_key] = compute_quality(train_df, field)
    return quality