def match_elements(self, text1, text2):
"""
utility function to match two strings, makes use of
match config initiated in __init__
returns the output as confidence score of flexible match
"""
conf = 0
if self.m_config['exact']:
if text1 == text2:
conf += 1
if self.m_config['levenshtein']:
conf += ratio(text1, text2)
if self.m_config['soundex']:
if soundex(text1) == soundex(text2):
conf += 1
if self.m_config['nysiis']:
if fuzzy.nysiis(text1) == fuzzy.nysiis(text2):
conf += 1
return conf
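# Usage sketch (hypothetical): a minimal stand-in class showing how m_config
# drives the combined score; the names below are illustrative, not taken from
# the original class.
from Levenshtein import ratio

class DemoMatcher:
    def __init__(self, m_config):
        self.m_config = m_config

    def match_elements(self, text1, text2):
        conf = 0
        if self.m_config['exact'] and text1 == text2:
            conf += 1
        if self.m_config['levenshtein']:
            conf += ratio(text1, text2)
        return conf

print(DemoMatcher({'exact': True, 'levenshtein': True}).match_elements(
    'Jon Smith', 'John Smith'))  # ~0.95: exact match fails, ratio adds ~0.95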
def getVec(kb, id1, id2):
    if kb == 'bh':
        title1, context1, category1 = getMsgbyId('baidu', id1)
        title2, context2, category2 = getMsgbyId('hudong', id2)
    elif kb == 'bw':
        title1, context1, category1 = getMsgbyId('baidu', id1)
        title2, context2, category2 = getMsgbyId('wiki', id2)
    elif kb == 'hw':
        title1, context1, category1 = getMsgbyId('hudong', id1)
        title2, context2, category2 = getMsgbyId('wiki', id2)
title_r = Levenshtein.ratio(title1, title2)
context_r = cosine(context1, context2)
category_r = sameCategory(category1, category2)
return (title_r, context_r, category_r, 0.0)
def frame_similarity(frame1,frame2):
similarity = 1
if 'Type' in frame1:
if frame1['Type'] != frame2['Type']:
similarity = 0.0
if similarity == 1:
if 'PlaceMention' in frame1:
# if PlaceMention is normalized use simple string comparison
if not Levenshtein_arg:
if frame1['PlaceMention'] != frame2['PlaceMention']:
similarity = 0.0
else:
                # PlaceMention is not normalized, so use the Levenshtein ratio
similarity = Levenshtein.ratio(frame1['PlaceMention'], frame2['PlaceMention'])
#print("similarity: ", similarity)
return similarity
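# Illustrative behaviour (Levenshtein_arg is a module-level flag in this
# snippet; the frames below are made up):
#   frame1 = {'Type': 'event', 'PlaceMention': 'Aleppo'}
#   frame2 = {'Type': 'event', 'PlaceMention': 'Alepo'}
#   Levenshtein_arg falsy  -> exact comparison fails -> similarity = 0.0
#   Levenshtein_arg truthy -> Levenshtein.ratio('Aleppo', 'Alepo') ~= 0.91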
# evaluate at the document level -----------------------------------------------
def get_message_change_ratio(status_update):
"""Expects a status update instance, returns a number representing
how much a message has been edited (1.0 completely changed, 0.0 unchanged)
based on Levenshtein ratio.
If a status update has no associated notification, returns None
https://github.com/ztane/python-Levenshtein
"""
if hasattr(status_update, 'notification'):
author_profile = status_update.author.profile
intro_text = get_notification_intro(author_profile) + '\n\n'
return 1.0 - Levenshtein.ratio(
*[message.replace(intro_text, '')
for message in (
status_update.notification.base_message,
status_update.notification.sent_message)])
else:
return None
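# Quick check of the inversion (example strings are illustrative):
# Levenshtein.ratio is a similarity in [0, 1], so 1.0 - ratio measures
# how much changed.
import Levenshtein

base_message = "Your update was posted."
sent_message = "Your update was posted with edits."
print(1.0 - Levenshtein.ratio(base_message, sent_message))  # ~0.19: partially edited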
def getCandidatesForLemma(lemma, min_size, max_size):
hits=[]
for match in ["phrase", "conjunct"]:
url="http://lotus.lodlaundromat.org/retrieve?size=" + str(max_size) + "&match=" + match + "&rank=psf&noblank=true&" + urllib.parse.urlencode({"string": lemma, "predicate": "label", "subject": "\"http://dbpedia.org/resource\""})
r = requests.get(url=url)
content = r.json()
these_hits=content["hits"]
hits=hits + these_hits
if content["numhits"]>=min_size or len(lemma.split(' '))==1:
break
subjects={}
for hit in hits:
lev_sim=Levenshtein.ratio(hit["string"].lower(), lemma.lower())
if "Disambiguation" not in hit["subject"].lower() and "Category" not in hit["subject"]:
if hit["subject"] not in subjects:
#subjects[hit["subject"]]=hit["length"]*len(lemma.split())
subjects[hit["subject"]]={"ss": lev_sim, "count": 1}
else:
subjects[hit["subject"]]["ss"]=max(subjects[hit["subject"]]["ss"], lev_sim)
subjects[hit["subject"]]["count"]+=1
return subjects
def mostCommon(spoken, lst, threshold):
highestCountItem = max(lst, key=lst.count)
highestCount = lst.count(highestCountItem)
contenders = []
for item in lst:
if (lst.count(item) == highestCount) and (item not in contenders):
contenders.append(item)
if len(contenders) > 1:
print "\nContending"
bestMatch = [None, 0]
for ayah in contenders:
score = ratio(spoken, ayah)
            print(ayah)
            print(score)
if score > threshold and score > bestMatch[1]:
bestMatch = [ayah, score]
return bestMatch[0]
elif ratio(spoken, highestCountItem) > threshold:
return highestCountItem
else:
return None
# Takes in a query and list of matches
# Returns the match with the highest similarity to the query
def print_matched_groups(extracted_combo_lst):
    dst_dct = {}
    # use a while loop: the original for-loop removed items from
    # extracted_combo_lst while iterating over it, which skips elements
    while extracted_combo_lst:
        itm = extracted_combo_lst[0]
        dst_dct.setdefault(itm, [])
        if len(extracted_combo_lst) == 1:
            break
        match_dct = {}
        for other in extracted_combo_lst:
            if other == itm:
                continue
            match_dct[other] = Levenshtein.ratio(itm, other)
        sorted_match_lst = sorted(match_dct.items(), key=operator.itemgetter(1), reverse=True)
        top_n = 2
        dst_dct[itm] = [e[0] for e in sorted_match_lst[0:top_n]]
        extracted_combo_lst.remove(itm)
        for e in dst_dct[itm]:
            extracted_combo_lst.remove(e)
    for k, v in dst_dct.items():
        print(k, v)
        print()
def should_run(self):
data = self.item_options.get('compare_url')
if data:
        if isinstance(data, dict):
self.fuzzy = data.get('fuzzy', 1.0)
self.url2 = data.get('url')
if not self.url2:
logger.debug('compare_url must contain a url')
return False
else:
            logger.debug('compare_url must be a nested dictionary containing url and fuzzy properties')
return False
return True
return False
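# For reference, an item_options shape that passes this check (keys as read
# by the code above; values illustrative):
item_options = {
    'compare_url': {
        'url': 'https://example.com/page',  # required
        'fuzzy': 0.9,  # optional; defaults to 1.0 (exact match)
    },
}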
def ratio(self):
if not self._ratio:
self._ratio = ratio(self._str1, self._str2)
return self._ratio
def quick_ratio(self):
# This is usually quick enough :o)
if not self._ratio:
self._ratio = ratio(self._str1, self._str2)
return self._ratio
def getCandidates(self, mention, threshold=0.7):
res = []
    # match candidates against the title table
for id, title, link_count in self.db_titles:
m_score = Levenshtein.ratio(title, mention)
if m_score > threshold:
self.cur.execute("select abstract from abstract where id = %s"%id)
context = self.cur.fetchall()
if context != ():
context = json.loads(context[0][0])
RE = []
self.cur.execute("select to_id from link where from_id = %s"%id)
linkto_ids = self.cur.fetchall()
if linkto_ids != ():
for to_id in linkto_ids:
RE.append(to_id[0])
res.append(Entity(title, id, m_score, context, link_count, RE))
    # match candidates against the disambiguation table
for id, title, dis_context, link_count in self.db_disambiguations:
m_score = Levenshtein.ratio(title, mention)
if m_score > threshold:
title += '[%s]'%dis_context
self.cur.execute("select abstract from abstract where id = %s"%id)
context = self.cur.fetchall()
if context != ():
context = json.loads(context[0][0])
context.append(dis_context)
RE = []
self.cur.execute("select to_id from link where from_id = %s"%id)
linkto_ids = self.cur.fetchall()
if linkto_ids != ():
for to_id in linkto_ids:
RE.append(to_id[0])
res.append(Entity(title, id, m_score, context, link_count, RE))
return res
def frame_similarity(frame1,frame2):
similarity = 1
if 'Type' in frame1:
if frame1['Type'] != frame2['Type']:
similarity = 0
if similarity == 1:
if 'PlaceMention' in frame1:
similarity = Levenshtein.ratio(frame1['PlaceMention'], frame2['PlaceMention'])
return similarity
# evaluate at the document level -----------------------------------------------
def get_name_similarity_ratio(a, b):
names = (get_full_lowercase_name(sub) for sub in (a, b))
return Levenshtein.ratio(*names)
def interesting_party(*a, **k):
while True:
while True:
phrase = get_name()
if len(phrase) < 100:
break
steps = party(phrase, *a, **k)
result = steps[-1][-1]
if ratio(phrase.lower(), result.lower()) < 0.7:
return steps
def play(self, guess):
return ratio(normalise(guess), normalise(self.original))
def moreLocalCandidates(m, previous, candidates):
    for pm, prev_links in previous.items():
        if is_abbrev(m, pm):
            for prevLink in prev_links:
                prevLinkDB = utils.makeDbpedia(prevLink)
                candidates.append((prevLinkDB, {"ss": 1.0, "count": 0.0}))
        elif isEnoughSubset(m, pm):
            for prevLink in prev_links:
                prevLinkDB = utils.makeDbpedia(prevLink)
                candidates.append((prevLinkDB, {"ss": Levenshtein.ratio(m.lower(), pm.lower()), "count": 0.0}))
    return candidates
def bestLevMatch(spoken, lst):
    print(" ")
    bestMatch = [None, 0.65]  # 0.65 is the minimum acceptable similarity
    for ayah in lst:
        score = ratio(spoken, ayah)
        print(ayah)
        print(score)
        if score > bestMatch[1]:
            bestMatch = [ayah, score]
    return bestMatch[0]
# Takes in an ayah object from alfanous
# Returns a cleaned-up ayah object
def checkForWordInQuran(value):
wordMatch = dbGet(models.QuranWord, value)
if wordMatch:
return wordMatch.text
else:
# The original word is not in the Quran so we try alfanous' suggestions
wordSuggestionList = []
wordSuggestions = alfanous.do({
"action": "suggest", "query": value
})["suggest"]
        for word in wordSuggestions:
            for suggestion in wordSuggestions[word]:
                # look up the suggestion itself, not the original word
                wordMatch = dbGet(models.QuranWord, suggestion)
                if wordMatch:
                    wordSuggestionList.append(wordMatch.text)
if len(wordSuggestionList) > 1:
topRatioValue = 0
topSuggestion = ""
while len(wordSuggestionList) > 0:
suggestion = wordSuggestionList.pop(0)
suggestionRatio = ratio(value, suggestion)
if suggestionRatio > topRatioValue:
topRatioValue = suggestionRatio
topSuggestion = suggestion
return topSuggestion
elif len(wordSuggestionList) == 1:
return wordSuggestionList[0]
else:
return None
# Takes in a query and checks if any part of it is in the Quran
# Return the part in the Quran if one is found, otherwise it returns None
def similarity_ratio(x, y, threshold=FuzzyMatchGenerator.SIMILARITY_THRESHOLD):
"""Compute the similarity ratio between two strings.
If the ratio exceeds the threshold, return it; otherwise, return 0.
The similarity ratio is given by
1 - (levenshtein distance with substitution cost = 2) / (total length)
"""
ratio = Levenshtein.ratio(x, y)
return ratio if ratio > threshold else 0.
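# Worked example of that formula: 'kitten' -> 'sitting' needs 2 substitutions
# (cost 2 each) and 1 insertion, so the weighted distance is 5 over a total
# length of 6 + 7 = 13, giving 1 - 5/13 = 8/13.
import Levenshtein
print(Levenshtein.ratio('kitten', 'sitting'))  # 0.6153846...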
################################
# NERValueGenerator
def getLevenshteinDistance(item, keyword):
    # despite the name, this returns the Levenshtein *ratio* (a similarity
    # in [0, 1]) computed after lowercasing and stripping separators
    item = item.lower().replace(' ', '').replace('-', '').replace('_', '')
    keyword = keyword.lower().replace(' ', '').replace('-', '').replace('_', '')
    return Levenshtein.ratio(item, keyword)
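# The normalisation means differently separated spellings compare as equal
# (assumes Levenshtein is imported as in the snippet above):
print(getLevenshteinDistance('e-mail Address', 'email_address'))  # 1.0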
def generateStemmingDict(inputPath = 'stemmer.txt', outputPath = 'stemmingDict'):
inputEncoding = 'utf8'
outputEncoding = 'utf8'
distance = Levenshtein.ratio
fi = open(inputPath, 'r', encoding=inputEncoding)
fo = open(outputPath, 'w', encoding=outputEncoding)
stemmingDict = {}
for line in fi:
if line.strip() == '':
continue
tmpList = line.strip().split(' => ')
for word in tmpList[0].split(', '):
if word not in stemmingDict:
stemmingDict[word] = set()
stemmingDict[word].add(tmpList[1])
for key in stemmingDict:
stemmingDict[key] = list(stemmingDict[key])
for i in range(len(stemmingDict[key])):
            stemmingDict[key][i] = [stemmingDict[key][i], distance(stemmingDict[key][i], key)]
    json.dump(stemmingDict, fo)
fi.close()
fo.close()
fotxt = open(outputPath+'.txt', 'w', encoding=outputEncoding)
for key in stemmingDict:
fotxt.write(key + ' ' + str(stemmingDict[key]) + '\n')
fotxt.close()
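# Illustrative round trip for one stemmer.txt line in the
# "word1, word2 => stem" format this parser expects:
import Levenshtein
line = 'connected, connecting => connect'
words, stem = line.strip().split(' => ')
for word in words.split(', '):
    print(word, [stem, Levenshtein.ratio(stem, word)])
# connected ['connect', 0.875]
# connecting ['connect', 0.8235294117647058]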
def get_features(df_features):
    print('use w2v for document representation')
now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge_tfidf(x['question1'], x['question2']), axis = 1)
print('nones')
now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis = 1)
df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis = 1)
#df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
#df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
df_features['question1_w2v'] = df_features.question1.map(lambda x: get_vector_tfidf(" ".join(x)))
df_features['question2_w2v'] = df_features.question2.map(lambda x: get_vector_tfidf(" ".join(x)))
print('z_dist')
now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)
now = datetime.datetime.now()
print('z_tfidf_cos_sim')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
now = datetime.datetime.now()
print('z_w2v_nones')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim_tfidf(x['q1_unique'], x['q2_unique']), axis=1)
df_features['z_w2v_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['question1_w2v'], x['question2_w2v']), axis=1)
df_features['z_w2v_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['question1_w2v'], x['question2_w2v'],3), axis=1)
df_features['z_w2v_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['question1_w2v'], x['question2_w2v']), axis=1)
df_features['z_w2v_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['question1_w2v'], x['question2_w2v']), axis=1)
df_features['z_q1_skew'] = df_features.question1_w2v.map(lambda x:skew(x))
df_features['z_q2_skew'] = df_features.question2_w2v.map(lambda x:skew(x))
df_features['z_q1_kur'] = df_features.question1_w2v.map(lambda x:kurtosis(x))
df_features['z_q2_kur'] = df_features.question2_w2v.map(lambda x:kurtosis(x))
del df_features['question1_w2v']
del df_features['question2_w2v']
print('all done')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features = df_features.fillna(0.0)  # fillna returns a new frame; assign it
return df_features
def get_features(df_features):
print('z_dist')
now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)
now = datetime.datetime.now()
print('z_tfidf_cos_sim')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
now = datetime.datetime.now()
print('z_w2v')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
df_features['z_w2v'] = df_features.apply(lambda x: w2v_cos_sim(x['question1'], x['question2']), axis=1)
return df_features
def get_registrar(
cls,
subject,
):
cls.check_and_update_registrars()
    edited_subject = re.sub(
        pattern=r'\W',  # raw string; equivalent to the original [^\d\w]
        repl='',
        string=subject,
    )
edited_subject = edited_subject.lower()
for registrar in cls.registrars:
if edited_subject in registrar['edited'].lower():
return registrar['original']
most_close_registrar = ''
most_close_registrar_distance_ratio = 0
for registrar in cls.registrars:
registrar_distance_ratio = Levenshtein.ratio(
edited_subject,
registrar['edited'],
)
if registrar_distance_ratio > most_close_registrar_distance_ratio:
most_close_registrar = registrar['original']
most_close_registrar_distance_ratio = registrar_distance_ratio
return most_close_registrar
def adjective_fuzzy_matching(token, adjectives, match):
"""
Given a token and a list of terms to match, returns True if
the stem of the token matches any of the items in the list.
Input:
token: Token object to match
adjectives: list of items to match the Token
match: minimum ratio (0-100) for matching
"""
    for adjective in adjectives:
        # Levenshtein.ratio returns 0-1; scale to 0-100 to match the
        # percentage threshold described in the docstring
        if Levenshtein.ratio(str(token.stem), str(adjective)) * 100 >= match:
            return True
return False
def _transactions_fuzzy_matching(transactions, match):
"""
Runs fuzzy matching on the transactions, by applying a complete linkage
hierarchical clustering algorithm to the set of different itemsets in the
    transactions. For clustering, the Levenshtein similarity ratio (scaled
    to 0-100) is used as the distance measure.
Input:
transactions: list of tuples representing items on each transaction
match: minimum similarity ratio (0 to 100) for clustering
Output:
transactions: new version of the transactions, where each item has been
replaced by the first item on its corresponding cluster
word_clusters: dictionary that maps the cluster for each item
in the transactions
"""
words = set([])
for transaction in transactions:
words |= set(transaction)
words = sorted(words)
    # scale the 0-1 ratio to 0-100 so it is comparable with `match`
    l = [((a, b), 100 - Levenshtein.ratio(str(a), str(b)) * 100)
         for a, b in combinations(words, 2)]
d = [value for pair, value in l]
r = linkage(d, 'complete')
clusters_index = fcluster(r, 100-match, "distance")
clusters = {}
for obs_i, cluster_i in enumerate(clusters_index):
if cluster_i in clusters:
clusters[cluster_i].append(words[obs_i])
else:
clusters[cluster_i] = [words[obs_i]]
word_clusters = {word: clusters[clusters_index[i]]
for i, word in enumerate(words)}
new_transactions = []
for transaction in transactions:
new_transaction = tuple(set(([word_clusters[word][0]
for word in transaction])))
new_transactions.append(new_transaction)
return new_transactions, word_clusters
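# Hypothetical usage (assumes this function and its scipy/Levenshtein imports
# are in scope): near-duplicate spellings collapse to one canonical item.
transactions = [('coca cola', 'chips'), ('coca-cola', 'salsa')]
new_transactions, word_clusters = _transactions_fuzzy_matching(transactions, match=85)
print(new_transactions)
# e.g. [('coca cola', 'chips'), ('coca cola', 'salsa')] -- order within each
# tuple may vary because each transaction is rebuilt through set()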
def __init__(self, match=90, key=lambda x: x.string.lower()):
"""
Fuzzy matching between the given token and term objects. For comparison
applies the function given in the "key" parameter to the Token/tuple
of Tokens. Parameter match defines the minimum similarity ratio for
a match when comparing.
Input:
match : minimum similarity for fuzzy matching (%)
key : function to apply to the token,
default=lambda x: x.string.lower()
"""
self.match = match
self.key = key
def __call__(self, token_tuple, terms):
"""
Input:
token_tuple : Token or tuple of Token objects
terms : term or iterable of terms to match
Output:
Returns None if no match is found.
Returns the first matched in case many of them show the same
similarity ratio.
"""
if not hasattr(terms, '__iter__'):
terms = [terms]
if not isinstance(token_tuple, tuple):
token_tuple = (token_tuple,)
try:
token_tuple = tuple(self.key(token) for token in token_tuple)
        except Exception:
token_tuple = tuple(str(token) for token in token_tuple)
best_term = None
best_ratio = 0
for term in terms:
            ratio = max([Levenshtein.ratio(" ".join(token_tuple),
                                           " ".join(term_i)) * 100
                         for term_i in term])
if ratio >= self.match and ratio > best_ratio:
best_term = term
best_ratio = ratio
return best_term
# ------- UTIL FUNCTIONS ------------------------------------------------------
def _edit_dist(str1, str2):
try:
# very fast
# http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
# d = Levenshtein.ratio(str1, str2)
d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2)))
    except Exception:  # fall back to difflib if python-Levenshtein is unavailable
# https://docs.python.org/2/library/difflib.html
d = 1. - SequenceMatcher(lambda x: x==" ", str1, str2).ratio()
return d
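# The two measures differ: Levenshtein.ratio charges substitutions double,
# and its normaliser is len(a) + len(b) rather than max(len(a), len(b)).
import Levenshtein
print(Levenshtein.distance('abc', 'ab') / max(len('abc'), len('ab')))  # 0.333...
print(1 - Levenshtein.ratio('abc', 'ab'))                              # 0.2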