def _compute_author_similarity(self, paired_authors):
    """Return ``paired_authors`` joined with four similarity columns.

    For every row, the author fields are compared with their ``*_other``
    counterparts and the results are stored in the columns ``same_email``,
    ``name_similarity``, ``email_name_similarity`` and
    ``name_to_email_similarity``.
    """
    feature_names = ['same_email', 'name_similarity',
                     'email_name_similarity', 'name_to_email_similarity']

    def row_similarity(row):
        # Values are listed in the same order as ``feature_names``.
        return pd.Series([
            row.author_email == row.author_email_other,
            fuzz.token_set_ratio(row.author_name, row.author_name_other),
            fuzz.ratio(row.email_name, row.email_name_other),
            fuzz.token_set_ratio(row.author_name, row.name_from_email_other),
        ])

    features = paired_authors.apply(row_similarity, axis=1)
    features.columns = feature_names
    return paired_authors.join(features)
python类ratio()的实例源码
def get_combined_fuzz_score(a, b, **kwargs):
    """Combine the simple and partial fuzzy ratios of two cleaned names.

    Both inputs go through ``clean_name`` first. The optional keyword
    arguments ``simple`` and ``partial`` are weights (default 1) applied to
    ``fuzz.ratio`` and ``fuzz.partial_ratio`` respectively. The weighted
    scores are multiplied together and divided by 10000.
    """
    left = clean_name(a)
    right = clean_name(b)

    weight_simple = float(kwargs.get('simple', 1))
    weight_partial = float(kwargs.get('partial', 1))

    simple_score = fuzz.ratio(left, right) * weight_simple
    partial_score = fuzz.partial_ratio(left, right) * weight_partial
    return simple_score * partial_score / float(10000)
def findItemName(self, itemDictionary, messageItem):
    """Return the key whose label is most similar to ``messageItem``.

    Each label in ``itemDictionary`` is scored with ``fuzz.ratio``; the key
    of the first label with the highest score above 0 wins. Returns ``None``
    when no label scores above 0.
    """
    winner = None
    top_score = 0
    try:
        for name, label in itemDictionary.items():
            current = fuzz.ratio(messageItem, label)
            if current <= top_score:
                continue
            top_score, winner = current, name
    except KeyError:
        # Best effort: keep whatever was found before the lookup failed.
        pass
    return winner
def tieBreak(self, query, i, j):
    """Choose between two titles by their similarity to ``query``.

    Args:
        - query: the string both titles are compared against
        - i: index into ``self.titles`` of the first title
        - j: index into ``self.titles`` of the second title
    Return: (target, index)
        - target: the chosen title
        - index : the index of the chosen title

    The first title wins only when its ``fuzz.ratio`` score is strictly
    higher; otherwise the second title is returned.
    """
    first, second = self.titles[i], self.titles[j]
    if fuzz.ratio(query, first) > fuzz.ratio(query, second):
        return (first, i)
    return (second, j)
gridding.py 文件源码
项目:the-magical-csv-merge-machine
作者: entrepreneur-interet-general
项目源码
文件源码
阅读 19
收藏 0
点赞 0
评论 0
def score_chars(src, ref):
    """Return a character-level similarity score in [0, 100].

    The score is 100 when the acronym of one ASCII-folded input equals the
    upper-cased other input. Otherwise the inputs are passed through
    ``justCase`` and compared: a plain ratio below 20 or a partial ratio
    below 30 yields 0, and anything else yields the product of the two
    ratios divided by 100.
    """
    a0 = toASCII(src)
    b0 = toASCII(ref)
    a1 = acronymizePhrase(a0)
    b1 = acronymizePhrase(b0)
    if len(a1) > 0 and len(b1) > 0 and (a1 == b0.upper() or a0.upper() == b1):
        # Log the ASCII forms: ``a`` and ``b`` are only bound further down,
        # so referring to them here raised UnboundLocalError.
        logging.debug('Accepted for ACRO : {} / {}'.format(a0, b0))
        return 100
    a = justCase(src)
    b = justCase(ref)
    absCharRatio = fuzz.ratio(a, b)
    if absCharRatio < 20:
        logging.debug('Rejected for ABS : {} / {}'.format(a, b))
        return 0
    partialCharRatio = fuzz.partial_ratio(a, b)
    if partialCharRatio < 30:
        logging.debug('Rejected for PARTIAL : {} / {}'.format(a, b))
        return 0
    return absCharRatio * partialCharRatio / 100
def fuzzy_feats(train_in, test_in, qcolumns = ['question1', 'question2'], append=''):
    """Build fuzzy-matching feature columns for a train and a test frame.

    Only the two columns named in ``qcolumns`` are kept from each input
    (the inputs themselves are not modified). Four columns are added to
    each result, named ``fuzz_r``, ``fuzz_pr``, ``fuzz_tsr`` and
    ``fuzz_tsor`` followed by ``append``.

    Returns a ``(train, test)`` tuple of the new frames.
    """
    from fuzzywuzzy import fuzz
    import pandas as pd

    # (column prefix, scorer) in the order the columns are added.
    scorers = (
        ('fuzz_r', fuzz.ratio),
        ('fuzz_pr', fuzz.partial_ratio),
        ('fuzz_tsr', fuzz.partial_token_set_ratio),
        ('fuzz_tsor', fuzz.partial_token_sort_ratio),
    )

    train = train_in.copy().loc[:, qcolumns]
    test = test_in.copy().loc[:, qcolumns]
    for frame in (train, test):
        for prefix, scorer in scorers:
            frame[prefix + append] = frame.apply(
                lambda row, scorer=scorer: scorer(row[qcolumns[0]],
                                                  row[qcolumns[1]]),
                axis=1)
    return (train, test)
def best_scoring_value(self, groups):
    '''
    Finds best fuzzy match
    Compares each elem of the group with each keyphrase/word in loc_map
    Returns the location with best matching, or '' when nothing scores
    above 0
    '''
    best_match = ''
    best_score = 0
    candidates = list(groups)
    # Append the whole of the group to the things to be checked
    # For instance, for the group ('a', 'b'), 'a b' will also be matched
    candidates.append(' '.join(candidates))
    for candidate in candidates:
        for key in self.loc_map:
            # Score each pair once instead of recomputing it on a hit.
            score = fuzz.ratio(key, candidate)
            if score > best_score:
                best_score = score
                best_match = self.loc_map[key]
    return best_match
def is_eq_arg(x, y):
    """
    Decide whether two argument strings match under fuzzy comparison.
    :param x: the first argument
    :param y: the second argument
    :return: True when the raw strings score at least 90, or when they score
             at least 85 after digit-only words are spelled out
    """
    if fuzz.ratio(x, y) >= 90:
        return True

    def spell_out_numbers(text):
        # Digit-only words become number words, with hyphens turned into spaces.
        return ' '.join(
            num2words(int(word)).replace('-', ' ') if word.isdigit() else word
            for word in text.split())

    # Partial entailment with equivalence, e.g. 'two girls' -> 'two kids':
    return fuzz.ratio(spell_out_numbers(x), spell_out_numbers(y)) >= 85
def is_eq_preds(p1, p2):
    """
    Decide whether two predicate templates match.
    :param p1: the first predicate
    :param p2: the second predicate
    :return: True when the strings score at least 90 with fuzz.ratio, or when
             one equals the other after 'be ' or 'have ' is inserted following
             every '{a0} '
    """
    # Levenshtein distance mostly
    if fuzz.ratio(p1, p2) >= 90:
        return True

    # Same verb, differing only by an auxiliary after the first argument slot.
    return any(
        left.replace('{a0} ', with_aux) == right
        for left, right in ((p1, p2), (p2, p1))
        for with_aux in ('{a0} be ', '{a0} have '))
def test_fuzzy_korean_ratio():
"""Test Korean-specific fuzzy search."""
# Each pair of assertions scores the same two strings, first with plain
# fuzz.ratio and then with fuzzy_korean_ratio.
# NOTE(review): every string literal below is now a run of '?' characters;
# the original text (presumably Hangul) appears to have been lost to an
# encoding error. Most argument pairs are therefore identical strings.
# Restore the original characters before relying on these expected values.
assert fuzz.ratio('?', '?') == 0
assert fuzzy_korean_ratio('?', '?') == 67
assert fuzz.ratio('??', '??') == 0
assert fuzzy_korean_ratio('??', '??') == 67
assert fuzz.ratio('??', '??') == 0
assert fuzzy_korean_ratio('??', '??') == 57
assert fuzz.ratio('??', '??') == 0
assert fuzzy_korean_ratio('??', '??') == 57
assert fuzz.ratio('??', '?????') == 0
assert fuzzy_korean_ratio('??', '?????') == 80
def parseArgs():
    """Parse the command line and configure logging verbosity.

    Registers the ``--debug``, ``--verbose``, ``--resume`` and ``--auto``
    switches, then sets the log level to DEBUG or INFO when ``--debug`` or
    ``--verbose`` is given (``--debug`` takes precedence).

    Returns the namespace produced by ``parse_args()``.
    """
    argparser = argparse.ArgumentParser(description='This is uploafer. Obviously. If you don\'t know what WM2 is, better not to know what uploafer is.')
    #argparser.add_argument('-u', '--username', help='Your PTH username', required=True)
    #argparser.add_argument('-p', '--password', help='Your PTH password', required=True)
    #argparser.add_argument('-i', '--wm2media', help='The directory containing your WM2 downloads. Each subdirectory should contain a "ReleaseInfo2.txt" file.', default='.', required=True)
    #argparser.add_argument('-w', '--wm2root', help='This directory should contain "manage.py". Leave this blank to disable auto-import. Warning: auto-import will MOVE your torrent data!')
    #argparser.add_argument('-o', '--output', help='This is the output directory for torrents and media you wish to upload. This option is overridden if wm2root is specified.')
    #argparser.add_argument('-z', '--fuzzratio', help='Minimum likeness ratio required to consider a match. Anything which scores higher than this will not be eligible for uploading. Default is 90', type=int, default=90)

    # All active options are simple on/off switches.
    switches = (
        ('-vv', '--debug', 'Highest level of verbosity for debugging'),
        ('-v', '--verbose', 'High level of verbosity for detailed info'),
        ('-r', '--resume', "Resume where uploafer left off within the WM2 media directory."),
        ('-a', '--auto', 'Don\'t use this.'),
    )
    for short_flag, long_flag, help_text in switches:
        argparser.add_argument(short_flag, long_flag, help=help_text, action="store_true")

    args = argparser.parse_args()

    log_format = "%(levelname)s: %(message)s"
    if args.debug:
        log.basicConfig(format=log_format, level=log.DEBUG)
        log.info("Debug output.")
    elif args.verbose:
        log.basicConfig(format=log_format, level=log.INFO)
        log.info("Verbose output.")
    else:
        log.basicConfig(format=log_format)
    return args
def findBestGroup(ri, artist):
    """Pick the torrent group of ``artist`` that best matches ``ri.group``.

    Every examined group gets a ``match`` attribute. A non-empty catalogue
    number equal to the group's catalogue number scores 101 and ends the
    search; otherwise the score is the ``fuzz.ratio`` of the group names,
    and a score of 100 also ends the search. ``ri.group`` itself (with
    ``match`` set to -1) is returned when no group scores higher.
    """
    #TODO: Check catalogue numbers!
    best = ri.group  # placeholder
    best.match = -1
    for candidate in artist.torrentgroup:
        wanted_catalogue = ri.group.catalogueNumber
        if wanted_catalogue != '' and wanted_catalogue == candidate.groupCatalogueNumber:
            best = candidate
            best.match = 101
            break
        candidate.match = fuzz.ratio(ri.group.name, candidate.groupName)
        if candidate.match > best.match:
            best = candidate
            if best.match == 100:
                break
    return best
def tieBreak(self, query, i, j):
    """Choose between two titles by their similarity to ``query``.

    Args:
        - query: the string both titles are compared against
        - i: index into ``self.titles`` of the first title
        - j: index into ``self.titles`` of the second title
    Return: (target, index)
        - target: the chosen title
        - index : the index of the chosen title

    The first title wins only when its ``fuzz.ratio`` score is strictly
    higher; otherwise the second title is returned.
    """
    first, second = self.titles[i], self.titles[j]
    if fuzz.ratio(query, first) > fuzz.ratio(query, second):
        return (first, i)
    return (second, j)
def match_phrase(self, lineinput, phrases):
    """Return the phrase owning the part most similar to ``lineinput``.

    Every entry of each phrase's ``'parts'`` is scored against ``lineinput``
    with ``fuzz.ratio``; the phrase (looked up by its ``'id'``) of the first
    highest-scoring part is returned. Raises ``ValueError`` when there are no
    parts to score.
    """
    phrases_by_id = {}
    scored_parts = []
    for phrase in phrases:
        phrases_by_id[phrase['id']] = phrase
        scored_parts.extend(
            {'part': part,
             'id': phrase['id'],
             'score': fuzz.ratio(part, lineinput)}
            for part in phrase['parts'])
    best = max(scored_parts, key=lambda entry: entry['score'])
    return phrases_by_id[best['id']]
def get_fixture_channels(self, events, fixture):
    """Return the channel ids of the event that best matches ``fixture``.

    Each event is scored as the mean of three ``fuzz.ratio`` values
    (competition, home team and away team names). When the best mean is
    above 70, that event's channels are looked up by name through
    ``self.data.get_multiple`` and their ids returned; otherwise an empty
    list is returned.
    """
    scored = []
    for entry in events:
        event = entry['event']
        ratios = (
            fuzz.ratio(fixture.competition.name, event['competition']),
            fuzz.ratio(fixture.home_team.name, event['home']),
            fuzz.ratio(fixture.away_team.name, event['away']),
        )
        scored.append({'ratio': sum(ratios) / 3, 'channels': entry['channels']})

    if not scored:
        return []

    # First entry with the highest mean ratio.
    best = max(scored, key=itemgetter('ratio'))
    if best['ratio'] <= 70:
        return []

    channels = self.data.get_multiple('channel', 'name', best['channels'])
    return [channel.id for channel in channels]
def build_similarity(self, actor, other_actor):
    """Fill an ``ActorSimilarity`` record comparing two actors.

    The record starts from ``su.empty_dict(ACTOR_SIMILARITY_FIELDS)`` and is
    populated with equality flags, ``proper`` flags and name/e-mail ratios
    computed from each actor's parsed name and parsed e-mail.
    """
    similarity = ActorSimilarity(**su.empty_dict(ACTOR_SIMILARITY_FIELDS))

    name, other_name = actor.parsed_name, other_actor.parsed_name
    email, other_email = actor.parsed_email, other_actor.parsed_email
    email_name, other_email_name = email.parsed_name, other_email.parsed_name

    # Identity and "proper" flags.
    similarity.identical = (actor.actor_id == other_actor.actor_id)
    similarity.proper_name1 = proper(name)
    similarity.proper_name2 = proper(other_name)
    similarity.proper_email_name1 = proper(email_name)
    similarity.proper_email_name2 = proper(other_email_name)

    # Name against name.
    similarity.same_name = (name.name == other_name.name)
    similarity.name_ratio = self.compare_names(name, other_name)

    # E-mail address, domain and the name derived from the address.
    similarity.same_email = (email.email == other_email.email)
    similarity.email_domain_ratio = fuzz.ratio(email.domain,
                                               other_email.domain)
    similarity.same_email_name = (email_name.name == other_email_name.name)
    similarity.email_name_ratio = self.compare_names(email_name,
                                                     other_email_name)

    # Cross comparisons: one side's name against the other's e-mail name.
    similarity.name1_email_ratio = self.compare_names(name, other_email_name)
    similarity.name2_email_ratio = self.compare_names(email_name, other_name)
    return similarity
def compare_names(name1: ParsedName, name2: ParsedName):
    """Return a fuzzy ratio for the ``name`` attributes of two parsed names.

    ``fuzz.token_set_ratio`` is used when ``proper`` is truthy for both
    names; otherwise plain ``fuzz.ratio`` is used.
    """
    both_proper = proper(name1) and proper(name2)
    scorer = fuzz.token_set_ratio if both_proper else fuzz.ratio
    return scorer(name1.name, name2.name)
def fuzzy_distance(word, words):
    """Return ``(candidate, score)`` pairs for ``words``, best score first.

    Scores are ``fuzz.ratio(word, candidate)``; candidates with equal scores
    keep their input order.
    """
    scored = [(candidate, fuzz.ratio(word, candidate)) for candidate in words]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored
def parse_line(frequency_dict, word_index_dict, nynorsk_line, bokmaal_line):
"""Count position-aligned token pairs from a Nynorsk and a Bokmaal line.

Both lines are split into runs of word characters. Pairs are formed by
position, each token is looked up with ``get_index_key(word_index_dict,
token)``, and the pair is counted in ``frequency_dict`` under the key
``(nn_index, nb_index)``. ``frequency_dict`` is updated in place and
nothing is returned.

NOTE(review): this listing has lost its indentation, so the nesting of the
statements below cannot be confirmed from here. In particular, verify
whether the counting at the end of the loop runs for every surviving pair
or only in the ``else`` branch of the similarity check.
"""
nn_tokenized = re.findall(r'\w+', nynorsk_line, re.MULTILINE | re.UNICODE)
nb_tokenized = re.findall(r'\w+', bokmaal_line, re.MULTILINE | re.UNICODE)
if (len(nn_tokenized) != len(nb_tokenized)):
# Drop the whole sentence if it doesn't have the same number of tokens.
return
# Number of low-similarity pairs seen in a row.
consecutive_skips = 0
for i in range(len(nb_tokenized)):
# If translation fails, the word is prefixed with '*'
if '*' in nb_tokenized[i] or '*' in nn_tokenized[i]:
continue
# If the edit distance ratio is lower than 40 % for three consecutive words,
# we conclude that we have gone astray, and drop the rest of the sentence.
if (fuzz.ratio(nn_tokenized[i], nb_tokenized[i]) < 40):
consecutive_skips += 1
if (consecutive_skips == 3):
break
else:
consecutive_skips = 0
# Map both tokens to their index keys and count the pair.
nn_token_idx = get_index_key(word_index_dict, nn_tokenized[i])
nb_token_idx = get_index_key(word_index_dict, nb_tokenized[i])
if (nn_token_idx, nb_token_idx) in frequency_dict:
frequency_dict[(nn_token_idx, nb_token_idx)] += 1
else:
frequency_dict[(nn_token_idx, nb_token_idx)] = 1
def parse_line(frequency_dict, word_index_dict, nynorsk_line, bokmaal_line):
"""Count position-aligned token pairs from a Nynorsk and a Bokmaal line.

Both lines are split into runs of word characters. Pairs are formed by
position, each token is looked up with ``get_index_key(word_index_dict,
token)``, and the pair is counted in ``frequency_dict`` under the key
``(nn_index, nb_index)``. ``frequency_dict`` is updated in place and
nothing is returned.

NOTE(review): this listing has lost its indentation, so the nesting of the
statements below cannot be confirmed from here. In particular, verify
whether the counting at the end of the loop runs for every surviving pair
or only in the ``else`` branch of the similarity check.
"""
nn_tokenized = re.findall(r'\w+', nynorsk_line, re.MULTILINE | re.UNICODE)
nb_tokenized = re.findall(r'\w+', bokmaal_line, re.MULTILINE | re.UNICODE)
if (len(nn_tokenized) != len(nb_tokenized)):
# Drop the whole sentence if it doesn't have the same number of tokens.
return
# Number of low-similarity pairs seen in a row.
consecutive_skips = 0
for i in range(len(nb_tokenized)):
# If translation fails, the word is prefixed with '*'
if '*' in nb_tokenized[i] or '*' in nn_tokenized[i]:
continue
# If the edit distance ratio is lower than 40 % for three consecutive words,
# we conclude that we have gone astray, and drop the rest of the sentence.
if (fuzz.ratio(nn_tokenized[i], nb_tokenized[i]) < 40):
consecutive_skips += 1
if (consecutive_skips == 3):
break
else:
consecutive_skips = 0
# Map both tokens to their index keys and count the pair.
nn_token_idx = get_index_key(word_index_dict, nn_tokenized[i])
nb_token_idx = get_index_key(word_index_dict, nb_tokenized[i])
if (nn_token_idx, nb_token_idx) in frequency_dict:
frequency_dict[(nn_token_idx, nb_token_idx)] += 1
else:
frequency_dict[(nn_token_idx, nb_token_idx)] = 1
def filterModule(self, module):
    """Return whether ``module`` passes the filters in ``self.modfilter``.

    With a ``"type"`` filter, the module must have a pin of the requested
    type among its inputs or outputs, depending on the filter's ``"dir"``.
    With a ``"text"`` filter, the module passes when the text is empty, is
    contained in the module name, or scores above 40 against the name
    (``fuzz.ratio``) or the description (``fuzz.partial_ratio``). Without a
    text filter, every module that passed the type filter is accepted.
    """
    if "type" in self.modfilter:
        direction = self.modfilter["type"]["dir"]
        if direction == "input":
            pins = module.inputDefs
        elif direction == "output":
            pins = module.outputDefs
        else:
            # Any other direction never matches.
            pins = ()
        if not any(pin.pintype == self.modfilter["type"]["type"] for pin in pins):
            return False

    if "text" not in self.modfilter:
        return True  # Don't filter by text? Return all remaining

    # Filter by text input
    if self.modfilter["text"] in module.name:
        return True
    if not self.modfilter["text"]:  # Text entry is empty
        return True

    name_score = fuzz.ratio(self.modfilter["text"], module.name)
    desc_score = fuzz.partial_ratio(self.modfilter["text"], module.desc)
    return max(name_score, desc_score) > 40
def get_unknown(topic):
    """Build the "Unknown topic" message with up to three suggestions.

    Suggestions come from ``get_topics_list()``, restricted to topics that
    start with ':' exactly when ``topic`` does, and are ranked with
    ``process.extract`` using ``fuzz.ratio``.
    """
    wants_colon = topic.startswith(':')
    candidates = [name for name in get_topics_list()
                  if name.startswith(':') == wants_colon]

    suggestions = process.extract(topic, candidates, scorer=fuzz.ratio)[:3]
    # Each suggestion is a tuple that fills both placeholders.
    suggestion_lines = [(" * %s %s" % suggestion) for suggestion in suggestions]
    return """
Unknown topic.
Do you mean one of these topics may be?
%s
""" % "\n".join(suggestion_lines)
def find_entity(self, entity, types):
    """Find the state whose friendly name best matches ``entity``.

    Requests ``<self.url>/api/states`` and, for states whose entity-id
    prefix (the part before the first '.') is in ``types``, scores the
    lower-cased friendly name against ``entity`` with ``fuzz.ratio``.

    Returns a dict with ``"id"``, ``"dev_name"`` and ``"state"`` for the
    best match, or ``None`` when nothing scores above 0 or the response
    status is not 200.
    """
    states_url = "%s/api/states" % self.url
    if self.ssl:
        req = get(states_url, headers=self.headers, verify=self.verify)
    else:
        req = get(states_url, headers=self.headers)

    if req.status_code != 200:
        return None

    best_score = 0
    best_entity = None
    for state in req.json():
        try:
            domain = state['entity_id'].split(".")[0]
            if domain not in types:
                continue
            score = fuzz.ratio(
                entity, state['attributes']['friendly_name'].lower())
            if score > best_score:
                best_score = score
                best_entity = {
                    "id": state['entity_id'],
                    "dev_name": state['attributes']['friendly_name'],
                    "state": state['state']}
        except KeyError:
            # States missing an expected key are skipped.
            pass
    return best_entity
#
# checking the entity attributes to be used in the response dialog.
#
def match(self, query):
    """Return the title most similar to ``query`` and its index.

    When ``self.cleanStopWords`` is set, the query is segmented, stop words
    are dropped, the remainder is joined and compared against
    ``self.segTitles``; otherwise the raw query is compared against
    ``self.titles``. In stop-word mode, equal scores are resolved with
    ``self.tieBreak`` using the original query.

    Args:
        - query: the string to look up
    Return: (target, index), also storing the best score in
        ``self.similarity``. ``("", -1)`` is returned when there are no
        titles.
    """
    if self.cleanStopWords:
        kept_words = [word for word in self.wordSegmentation(query)
                      if word not in self.stopwords]
        needle = "".join(kept_words)
        candidates = self.segTitles
    else:
        candidates = self.titles
        needle = query

    best_ratio, best_title, best_index = -1, "", -1
    for index, title in enumerate(candidates):
        current = fuzz.ratio(needle, title)
        if current > best_ratio:
            best_ratio, best_title, best_index = current, title, index
        elif self.cleanStopWords and current == best_ratio:
            best_title, best_index = self.tieBreak(query, best_index, index)

    self.similarity = best_ratio
    return best_title, best_index
def get(cls, name):
    """Look up a monster by name, falling back to the closest fuzzy match.

    ``name`` is stripped and lower-cased once, and that normalized form is
    used both for the exact lookup in ``cls.MONSTER_D`` and for the fuzzy
    comparison against each monster's lower-cased, stripped name.
    Previously the fuzzy comparison used the raw ``name``, so case and
    surrounding whitespace lowered every score.

    Returns the exact match when it is truthy, otherwise the first monster
    in ``cls.MONSTERS`` with the highest ``fuzz.ratio`` score.
    Raises ``IndexError`` when ``cls.MONSTERS`` is empty.
    """
    query = name.strip().lower()
    exact = cls.MONSTER_D.get(query)
    if exact:
        return exact
    ranked = sorted(
        cls.MONSTERS,
        key=lambda mon: fuzz.ratio(mon.name.lower().strip(), query),
        reverse=True)
    return ranked[0]
def match_contractors(contractors_file, match_file, match_col, match_threshold):
    """Attach the best fuzzy-matching contractor to every row of a CSV file.

    Both files are CSV with a header row, which is skipped. Names are
    compared with ``fuzz.ratio`` after punctuation is removed and the text
    lower-cased.

    Args:
        contractors_file: path of the contracts CSV; column 0 holds the
            contractor name and column 4 the value appended as the match
            amount.
        match_file: path of the CSV whose rows are matched.
        match_col: index of the name column in ``match_file``.
        match_threshold: a score must be strictly above this to count.

    Returns:
        A list with one entry per data row of ``match_file``: the original
        row followed by the best contractor name and its column-4 value, or
        by ``''`` and ``-1`` when no contractor scores above the threshold.
    """
    # The table is identical for every name, so build it once.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def normalize(name):
        return name.translate(strip_punctuation).lower()

    results = []
    # newline='' lets the csv module do its own newline handling.
    with open(match_file, 'r', newline='') as f, \
            open(contractors_file, 'r', newline='') as g:
        contribs_reader = csv.reader(f)
        contracts_reader = csv.reader(g)

        next(contracts_reader)  # skip the header row
        # Normalize each contractor name once, not once per matched row.
        contracts = [(normalize(contract[0]), contract)
                     for contract in contracts_reader]

        next(contribs_reader)  # skip the header row
        for row in contribs_reader:
            match_name = normalize(row[match_col])
            best_match = ''
            best_match_amount = -1
            best_score = 0
            for contractor_name, contract in contracts:
                score = fuzz.ratio(match_name, contractor_name)
                if score > best_score and score > match_threshold:
                    best_match = contract[0]
                    best_score = score
                    best_match_amount = contract[4]
            results.append(row + [best_match, best_match_amount])
    return results
def __process_loc_results__(self, results, label):
"""Pick the best-matching LOC entry for ``label`` from ``results``.

Rows that are dicts, or whose first element does not start with
``'atom:entry'``, are ignored. For the remaining rows the title and link
are read, the link is classified as ``BF.Topic``, ``BF.Organization`` or
``BF.Agent`` from its URL, and the entry is weighted with
``fuzz.ratio(label, title)``.

NOTE(review): this listing has lost its indentation; confirm against the
real file whether the ``term_weights`` entry is only recorded for rows
that carry an ``atom:link`` element.

Args:
results(list): List of JSON rows from LOC ID call
label(str): Original Label

Returns:
A ``(rdflib.URIRef, rdflib.Literal)`` pair for the highest-weighted
entry whose weight is at least ``self.cutoff``, otherwise
``(None, None)``.
"""
title, loc_uri, term_weights = None, None, dict()
for row in results:
if isinstance(row, dict) or not row[0].startswith('atom:entry'):
continue
if row[2][0].startswith("atom:title"):
title = row[2][-1]
if row[3][0].startswith("atom:link"):
loc_url = row[3][-1].get('href')
# Choose the class from the path segment found in the link URL.
if "subjects/" in loc_url:
bf_class = BF.Topic
elif "organizations/" in loc_url:
bf_class = BF.Organization
else:
bf_class = BF.Agent
loc_uri = rdflib.URIRef(loc_url)
# Keyed by URI string; a repeated URI overwrites the earlier entry.
term_weights[str(loc_uri)] = {
"weight": fuzz.ratio(label, title),
"class": bf_class,
"title": title}
# Ascending sort followed by reverse(): highest weight first.
results = sorted(term_weights.items(), key=lambda x: x[1]['weight'])
results.reverse()
for row in results:
loc_url = row[0]
weight = row[1].get('weight')
title = row[1].get('title')
if weight >= self.cutoff:
return rdflib.URIRef(loc_url), rdflib.Literal(title)
return None, None
preprocess_fields_v3.py 文件源码
项目:the-magical-csv-merge-machine
作者: entrepreneur-interet-general
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def address_filter_score(src, ref):
    """Return the sum of the partial and plain fuzzy ratios of two phrases.

    Both inputs are passed through ``case_phrase`` before scoring.
    """
    cased_src = case_phrase(src)
    cased_ref = case_phrase(ref)
    partial_score = fuzz.partial_ratio(cased_src, cased_ref)
    full_score = fuzz.ratio(cased_src, cased_ref)
    return partial_score + full_score
# Acronym handling
def check_answer(self, message, match):
    """Check a submitted answer against the active question.

    The answer is taken from the ``"answer"`` group of ``match`` and
    compared, case-insensitively, with the stored answer at
    ``self.active_index`` using ``fuzz.ratio``. A score of at least
    ``self.fuzziness_ratio`` counts as correct and clears the active
    question state.

    Returns a ``TextMessageProtocolEntity`` addressed to the sender saying
    "Correct <name>!" or "Incorrect <name>!", or ``None`` when no question
    is active.
    """
    answer = match.group("answer")
    # Function-call form works on both Python 2 and 3; the bare ``print``
    # statement is a syntax error on Python 3.
    print(answer)
    if not self.active_question_bool:
        return None

    expected = self.data[self.active_index]['answer']
    if fuzz.ratio(answer.lower(), expected.lower()) >= self.fuzziness_ratio:
        # Correct: close the question before replying.
        self.active_question_bool = False
        self.active_index = 0
        self.active_question = ""
        verdict = "Correct "
    else:
        verdict = "Incorrect "
    name = self.nombre(message.getParticipant())
    return TextMessageProtocolEntity(verdict + name + "!", to=message.getFrom())
def percentDiff(old, new):
    """Return ``fuzz.ratio(old, new)`` for the two values."""
    return fuzz.ratio(old, new)