def _edit_dist(str1, str2):
try:
# very fast
# http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
# d = Levenshtein.ratio(str1, str2)
d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2)))
except:
# https://docs.python.org/2/library/difflib.html
d = 1. - SequenceMatcher(lambda x: x==" ", str1, str2).ratio()
return d
# Collected example source code for Python ratio() usage
def similar_link_visited(link_url, links, fuzzy):
    """Return True if ``link_url`` fuzzy-matches any already-visited link.

    A link counts as visited when its similarity ratio to some entry of
    ``links`` reaches the ``fuzzy`` threshold.
    """
    return any(ratio(link_url, visited) >= fuzzy for visited in links)
def check(self):
    """Fetch the reference URL and compare its body with the stored response.

    ``self.fuzzy`` selects the comparison mode: 1.0 requires exact text
    equality, anything lower accepts a similarity ratio above the
    threshold.  Records the outcome via ``self.ok`` and returns
    ``self.is_ok()``.
    """
    headers = self.item_options.get('headers', {})
    cookies = self.item_options.get('cookies', {})
    username = self.global_options.get('username')
    password = self.global_options.get('password')
    # NOTE(review): presumably self.response holds the earlier fetch of
    # self.url — confirm against the enclosing class.
    r2 = requests.get(self.url2, headers=headers,
                      auth=HTTPBasicAuth(username, password), allow_redirects=True, cookies=cookies)
    logger.info("Comparing urls...")
    if self.fuzzy == 1.0:
        # Exact mode: the bodies must match character for character.
        self.ok(self.response.text == r2.text,
                'Urls don\'t have equal content: {tested} and {reference}'.format(tested=self.url,
                                                                                 reference=self.url2))
    else:
        # Fuzzy mode: similarity ratio must exceed the threshold.
        actual_ratio = ratio(self.response.text, r2.text)
        self.ok(actual_ratio > self.fuzzy,
                """
Urls don\'t have sufficiently similar content: {tested} and {reference} (expected {expected}, got {actual})
"""
                .format(
                    tested=self.url,
                    reference=self.url2,
                    expected=self.fuzzy,
                    actual=actual_ratio))
    return self.is_ok()
def set_levenshtein(self):
    '''
    Mean and max Levenshtein ratio for all labels.

    Compares the first cluster entity's normalized form against the
    document's preferred label, Wikidata alternative labels, and any
    other alternative labels, setting the corresponding
    ``match_str_lsr_*`` feature attributes.
    '''
    # Skip entirely unless at least one Levenshtein feature is requested.
    if not [f for f in self.features if f.startswith('match_str_lsr')]:
        return
    ne = self.cluster.entities[0].norm
    # Pref label
    # NOTE(review): get('pref_label') may return None, which would make
    # Levenshtein.ratio raise — confirm the document always has this key.
    l = self.document.get('pref_label')
    self.match_str_lsr_pref = Levenshtein.ratio(ne, l)
    # Wikidata alt labels
    if self.document.get('wd_alt_label'):
        wd_labels = self.document.get('wd_alt_label')
        ratios = [Levenshtein.ratio(ne, l) for l in wd_labels]
        # The constant offsets (-0.5, -0.375) recenter the features;
        # presumably tuned empirically — confirm with the training code.
        self.match_str_lsr_wd_max = max(ratios) - 0.5
        self.match_str_lsr_wd_mean = (sum(ratios) /
                                      float(len(wd_labels))) - 0.375
    else:
        wd_labels = []
    # Any other alt labels (excluding those already counted for Wikidata)
    if self.document.get('alt_label'):
        labels = self.document.get('alt_label')
        labels = [l for l in labels if l not in wd_labels]
        if labels:
            ratios = [Levenshtein.ratio(ne, l) for l in labels]
            self.match_str_lsr_alt_max = max(ratios) - 0.5
            self.match_str_lsr_alt_mean = (sum(ratios) /
                                           float(len(labels))) - 0.375
def appendWordNetStemmingDict(inputPath='stemmingDict.old', outputPath='stemmingDict', outputEncoding='utf8'):
    """Merge WordNet mappings into an existing stemming dict and re-score it.

    Loads the old stemming dict (JSON: stem -> [[word, ratio], ...]),
    adds the word pairs found in 'wordnet.map', recomputes the
    Levenshtein ratio of every word against its stem, and writes the
    result both as JSON (``outputPath``) and as a plain-text dump
    (``outputPath + '.txt'``).
    """
    distance = Levenshtein.ratio
    # Use context managers throughout: the original leaked the input and
    # JSON-output file handles.
    with open(inputPath, 'r', encoding='utf8') as fin:
        oldDict = json.load(fin)
    # Collapse the stored [word, ratio] pairs back to plain word sets.
    for m in list(oldDict):
        oldDict[m] = set(l[0] for l in oldDict[m])
    # Add "stem word" pairs from the WordNet mapping file.
    with open('wordnet.map', 'r', encoding='utf8') as fi:
        for line in fi:
            m = line.strip().split(' ')
            # BUGFIX: was `len(m) == 0`, which str.split never produces,
            # so lines without two fields crashed on m[1] below.
            if len(m) < 2:
                continue
            if m[0] not in oldDict:
                oldDict[m[0]] = set()
            oldDict[m[0]].add(m[1])
    # Re-attach a similarity score to every word.
    for m in list(oldDict):
        oldDict[m] = list(oldDict[m])
        for i in range(len(oldDict[m])):
            if type(oldDict[m][i]) != str or type(m) != str:
                # Debug pause kept from the original: surfaces unexpected
                # non-string entries and waits for the operator.
                print(oldDict[m])
                input()
                continue
            oldDict[m][i] = [oldDict[m][i], distance(oldDict[m][i], m)]
    with open(outputPath, 'w', encoding='utf8') as fo:
        json.dump(oldDict, fo)
    with open(outputPath + '.txt', 'w', encoding=outputEncoding) as fotxt:
        for key in oldDict:
            fotxt.write(key + ' ' + str(oldDict[key]) + '\n')
##
##print('Dumping stemming mapping to json format......')
##generateStemmingDict()
##appendWordNetStemmingDict()
##print('Done!')
def calScoreSub(self, countCharDict):
    """Compute a match score between the candidate subject and the question.

    Words are weighted by inverse corpus frequency via ``countCharDict``
    (word -> count): rarer words contribute more.  Stores the score on
    ``self.scoreSub`` and returns it.

    ``self.sub`` may be either a plain string or, presumably, a pair of
    aligned word lists ``[words, comparedWords]`` — TODO confirm with the
    code that builds it.
    """
    distance = Levenshtein.ratio
    q = self.qRaw
    scoreSub = 0
    sub = ''
    # --- Case 1: subject is a plain string ---
    if type(self.sub) == str:
        sub = self.sub
        subSplit = sub.split(' ')
        if sub in q:
            # Exact substring hit: weight every subject word.
            for w in subSplit:
                if w in countCharDict:
                    scoreSub += 1/(countCharDict[w] + 1)
                else:
                    scoreSub += 1
        else:
            # Otherwise weight only the words shared with the question,
            # normalized by the subject's vocabulary size.
            subSet = set(subSplit)
            qSet = set(q.split(' '))
            for w in (subSet & qSet):
                if w in countCharDict:
                    scoreSub += 1/(countCharDict[w] + 1)
                else:
                    scoreSub += 1
            if len(subSet) != 0:
                scoreSub = scoreSub/len(subSet)
    # --- Case 2: subject is a pair of word lists ---
    if type(self.sub) == list:
        # Rebuild a space-joined subject string (kept for parity with the
        # string case; not used in the scoring below).
        for s in self.sub[0]:
            sub += s + ' '
        sub = sub.strip()
    if type(self.sub) == list:
        if len(self.sub[0]) == len(self.sub[1]):
            # Aligned lists: weight each word by its similarity to the
            # corresponding word in self.sub[1], then average.
            lenSub = len(self.sub[0])
            for i in range(lenSub):
                w = self.sub[0][i]
                wC = self.sub[1][i]
                if w in countCharDict:
                    scoreSub += 1/(countCharDict[w] + 1)*distance(w,wC)
                else:
                    scoreSub += 1*distance(w,wC)
            scoreSub = scoreSub / lenSub
        else:
            # Unaligned lists: fall back to Jaccard overlap.
            subIntersaction = set(self.sub[0]) & set(self.sub[1])
            scoreSub = len(subIntersaction) / len(set(self.sub[0]) | set(self.sub[1]))
    self.scoreSub = scoreSub
    return scoreSub
def calScorePreLast(self, countCharDict, qWithoutSubSet, stemmingDict):
    """Score the overlap between the predicate's last segment and the question.

    Only the part of ``self.pre`` after the final dot is considered.
    ``countCharDict`` (word -> corpus count) down-weights frequent words;
    ``qWithoutSubSet`` is the set of question words with the subject
    removed; ``stemmingDict`` maps a word to scored variants, of which
    the first is used.  Stores the score on ``self.scorePreLast`` and
    returns it.
    """
    distance = Levenshtein.ratio
    pre = self.pre
    scorePre = 0
    # Keep only the segment after the last '.' (e.g. a dotted property path).
    lastPreIndex = pre.rfind('.')
    if lastPreIndex != -1:
        preLowerSet = set(re.split(r' ',pre[lastPreIndex+1:]))
    else:
        preLowerSet = set(re.split(r' ',pre))
    preLower = list(preLowerSet)
    preLowerSet = set()
    # Replace each token by its preferred stemmed form when known.
    for i in range(len(preLower)):
        if preLower[i] in stemmingDict:
            preLower[i] = stemmingDict[preLower[i]][0][0]
        preLowerSet.add(preLower[i])
    maxIntersection = qWithoutSubSet & preLowerSet
    preFactor = 0
    # Sum inverse-frequency weights of the exactly-shared words.
    for char in maxIntersection:
        if char in countCharDict:
            preFactor += 1/(countCharDict[char] + 1)
        else:
            preFactor += 1
    # No exact overlap: fall back to the single best fuzzy match among
    # word pairs that share a first letter.
    if len(maxIntersection) == 0:
        for w1 in qWithoutSubSet:
            for w2 in preLowerSet:
                if w1 == '' or w2 == '' or w1[0] != w2[0]:
                    continue
                div = 1
                if w1 in countCharDict:
                    div = countCharDict[w1] + 1
                dWord = distance(w1,w2) / div
                if preFactor < dWord:
                    preFactor = dWord
    # Normalize by the size of the combined vocabulary.
    if len(pre) != 0:
        scorePre = preFactor / len(qWithoutSubSet | preLowerSet)
    else:
        scorePre = 0
    self.scorePreLast = scorePre
    return scorePre
def calScorePreAll(self, countCharDict, qWithoutSubSet, stemmingDict):
    """Score the overlap between the whole predicate and the question rest.

    Unlike ``calScorePreLast`` this tokenizes the full ``self.pre`` on
    both spaces and dots.  ``countCharDict`` (word -> corpus count)
    down-weights frequent words; ``qWithoutSubSet`` is the set of
    question words with the subject removed; ``stemmingDict`` maps a word
    to scored variants, of which the first is used.  Stores the score on
    ``self.scorePreAll`` and returns it.
    """
    distance = Levenshtein.ratio
    pre = self.pre
    scorePre = 0
    # Tokenize the entire predicate on spaces and dots.
    preLowerSet = set(re.split(r' |\.',pre))
    preLower = list(preLowerSet)
    preLowerSet = set()
    # Replace each token by its preferred stemmed form when known.
    for i in range(len(preLower)):
        if preLower[i] in stemmingDict:
            preLower[i] = stemmingDict[preLower[i]][0][0]
        preLowerSet.add(preLower[i])
    maxIntersection = qWithoutSubSet & preLowerSet
    preFactor = 0
    # Sum inverse-frequency weights of the exactly-shared words.
    for char in maxIntersection:
        if char in countCharDict:
            preFactor += 1/(countCharDict[char] + 1)
        else:
            preFactor += 1
    # No exact overlap: fall back to the single best fuzzy match among
    # word pairs that share a first letter.
    if len(maxIntersection) == 0:
        for w1 in qWithoutSubSet:
            for w2 in preLowerSet:
                if w1 == '' or w2 == '' or w1[0] != w2[0]:
                    continue
                div = 1
                if w1 in countCharDict:
                    div = countCharDict[w1] + 1
                dWord = distance(w1,w2) / div
                if preFactor < dWord:
                    preFactor = dWord
    # Normalize by the size of the combined vocabulary.
    if len(pre) != 0:
        scorePre = preFactor / len(qWithoutSubSet | preLowerSet)
    else:
        scorePre = 0
    self.scorePreAll = scorePre
    return scorePre
def calScorePreLast(self, countCharDict, qWithoutSubSet, stemmingDict):
    """Score the last dotted segment of the predicate against the question.

    Weights shared words by inverse corpus frequency (``countCharDict``),
    stems predicate tokens via ``stemmingDict``, and falls back to a
    fuzzy best-match when there is no exact overlap.  The result is
    stored on ``self.scorePreLast`` and returned.
    """
    sim = Levenshtein.ratio
    pre = self.pre
    # Tokenize only the segment after the final '.', if any.
    dot_pos = pre.rfind('.')
    segment = pre[dot_pos + 1:] if dot_pos != -1 else pre
    raw_tokens = list(set(re.split(r' ', segment)))
    # Map each token to its preferred stemmed form when available.
    tokens = set()
    for tok in raw_tokens:
        if tok in stemmingDict:
            tok = stemmingDict[tok][0][0]
        tokens.add(tok)
    shared = qWithoutSubSet & tokens
    factor = 0
    # Exact overlap: accumulate inverse-frequency weights.
    for word in shared:
        if word in countCharDict:
            factor += 1/(countCharDict[word] + 1)
        else:
            factor += 1
    if len(shared) == 0:
        # No exact overlap: take the best similarity among word pairs
        # sharing a first letter, damped by the question word's frequency.
        for qw in qWithoutSubSet:
            for pw in tokens:
                if qw == '' or pw == '' or qw[0] != pw[0]:
                    continue
                denom = (countCharDict[qw] + 1) if qw in countCharDict else 1
                candidate = sim(qw, pw) / denom
                if candidate > factor:
                    factor = candidate
    # Normalize by the combined vocabulary size (0 for an empty predicate).
    score = factor / len(qWithoutSubSet | tokens) if len(pre) != 0 else 0
    self.scorePreLast = score
    return score
def calScorePreAll(self, countCharDict, qWithoutSubSet, stemmingDict):
    """Score the entire predicate path against the remaining question words.

    Splits ``self.pre`` on spaces and dots, stems tokens via
    ``stemmingDict``, weights shared words by inverse corpus frequency
    (``countCharDict``), and falls back to a fuzzy best-match when there
    is no exact overlap.  The result is stored on ``self.scorePreAll``
    and returned.
    """
    sim = Levenshtein.ratio
    pre = self.pre
    # Tokenize the full predicate on spaces and dots.
    raw_tokens = list(set(re.split(r' |\.', pre)))
    # Map each token to its preferred stemmed form when available.
    tokens = set()
    for tok in raw_tokens:
        if tok in stemmingDict:
            tok = stemmingDict[tok][0][0]
        tokens.add(tok)
    shared = qWithoutSubSet & tokens
    factor = 0
    # Exact overlap: accumulate inverse-frequency weights.
    for word in shared:
        if word in countCharDict:
            factor += 1/(countCharDict[word] + 1)
        else:
            factor += 1
    if len(shared) == 0:
        # No exact overlap: take the best similarity among word pairs
        # sharing a first letter, damped by the question word's frequency.
        for qw in qWithoutSubSet:
            for pw in tokens:
                if qw == '' or pw == '' or qw[0] != pw[0]:
                    continue
                denom = (countCharDict[qw] + 1) if qw in countCharDict else 1
                candidate = sim(qw, pw) / denom
                if candidate > factor:
                    factor = candidate
    # Normalize by the combined vocabulary size (0 for an empty predicate).
    score = factor / len(qWithoutSubSet | tokens) if len(pre) != 0 else 0
    self.scorePreAll = score
    return score
def get_features(df_features):
    """Append word2vec-based distance and moment features to ``df_features``.

    Expects 'question1' and 'question2' columns.  Builds per-question
    unique-word lists, embeds them (weighted and unweighted), derives a
    battery of vector distances plus skew/kurtosis statistics, then
    drops the intermediate embedding columns.  Returns the dataframe.
    """

    def _log(msg):
        # Progress logging with a wall-clock timestamp (the original
        # repeated this print/strftime pair, py2-style, at every step).
        print(msg)
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    _log('use w2v to document presentation')
    #df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge(x['question1'], x['question2']), axis = 1)
    _log('get_w2v')
    # Words unique to each side of the question pair.
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis=1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis=1)
    df_features['q1_unique_w2v_weight'] = df_features.q1_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q2_unique_w2v_weight'] = df_features.q2_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q1_unique_w2v'] = df_features.q1_unique.map(lambda x: get_weight_vector(" ".join(x)))
    df_features['q2_unique_w2v'] = df_features.q2_unique.map(lambda x: get_weight_vector(" ".join(x)))
    _log('z_dist')
    #df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    _log('z_tfidf_cos_sim')
    #df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    _log('z_w2v_calc')
    #df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim(x['q1_unique'], x['q2_unique']), axis=1)
    # Pairwise distances between the two unique-word embeddings.
    df_features['z_w2v_unique_dis_e_weight'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_mink_w'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight'], 3), axis=1)
    df_features['z_w2v_unique_dis_cityblock_w'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_canberra_w'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v'], x['q2_unique_w2v'], 3), axis=1)
    df_features['z_w2v_unique_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    # Distribution statistics of each embedding vector.
    df_features['z_q1_unique_skew_w'] = df_features.q1_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q2_unique_skew_w'] = df_features.q2_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q1_unique_kur_w'] = df_features.q1_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur_w'] = df_features.q2_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q1_unique_skew'] = df_features.q1_unique_w2v.map(lambda x: skew(x))
    df_features['z_q2_unique_skew'] = df_features.q2_unique_w2v.map(lambda x: skew(x))
    df_features['z_q1_unique_kur'] = df_features.q1_unique_w2v.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur'] = df_features.q2_unique_w2v.map(lambda x: kurtosis(x))
    # Drop the bulky intermediate embedding columns.
    del df_features['q1_unique_w2v_weight']
    del df_features['q2_unique_w2v_weight']
    del df_features['q1_unique_w2v']
    del df_features['q2_unique_w2v']
    _log('all done')
    # BUGFIX: fillna returns a new frame; the original discarded the
    # result, leaving NaNs in place.
    df_features.fillna(0.0, inplace=True)
    return df_features
def __init__(self,
             stopwords=NLTKStopwords(),
             min_support=MIN_SUPPORT,
             max_words=MAX_WORDS,
             min_psupport=MIN_PSUPPORT,
             min_compact_support=MIN_COMPACT_SUPPORT,
             max_compact_distance=MAX_COMPACT_DISTANCE,
             adj_key=StemKey(),
             adj_win_size=ADJ_NEARBY_DISTANCE,
             match=85,
             compactness=True,
             redundancy=True,
             infrequent=True):
    """
    Model to extract aspects using the algorithm by Hu et al. (2004).

    All arguments are simply stored in ``self.params``; no validation or
    computation happens here.

    stopwords            : iterable of strings to use as stopwords
    min_support          : int, minimum support of an item set
                           (positive: percentage, negative: absolute
                           number of transactions)
    min_compact_support  : int, minimum number of compact sentences
                           of an aspect
    max_words            : int, maximum number of words in each aspect
    max_compact_distance : int, maximum distance between consecutive
                           words in an aspect
    adj_win_size         : int, maximum distance to look for
                           adjectives near an aspect in a sentence
    min_psupport         : int, minimum pure support of an aspect
    adj_key              : lambda function to extract adjectives
    match                : int, minimum similarity ratio (0-100] for
                           matching (use <100 for fuzzy), default: 85
    compactness          : boolean, True to run "compactness pruning"
    redundancy           : boolean, True to run "redundancy pruning"
    infrequent           : boolean, True to also extract infrequent
                           aspects
    """
    # NOTE(review): the mutable/stateful defaults (NLTKStopwords(),
    # StemKey()) are evaluated once at class definition and shared by
    # all instances that rely on them.
    self.params = {"stopwords": stopwords,
                   "min_support": min_support,
                   "max_words": max_words,
                   "min_psupport": min_psupport,
                   "min_compact_support": min_compact_support,
                   "max_compact_distance": max_compact_distance,
                   "adj_key": adj_key,
                   "adj_win_size": adj_win_size,
                   "match": match,
                   "compactness": compactness,
                   "redundancy": redundancy,
                   "infrequent": infrequent}
def map_discipl(self, invalue, disctab):
    """
    Convert disciplines along B2FIND disciplinary list.
    Copyright (C) 2014 Heinrich Widmann
    Licensed under AGPLv3.

    ``invalue`` may be a string or a list of strings; ``disctab`` is an
    iterable of rows whose third column holds the canonical discipline
    name.  Returns a ';'-joined, de-duplicated string of matched
    disciplines, or 'Not stated' when nothing matches.
    """
    retval = list()
    # Normalise the input to a flat list of candidate discipline strings.
    if type(invalue) is not list:
        inlist = re.split(r'[;&\s]\s*', invalue)
        inlist.append(invalue)
    else:
        seplist = [re.split(r"[;&]", i) for i in invalue]
        swlist = [re.findall(r"[\w']+", i) for i in invalue]
        inlist = swlist + seplist
        inlist = [item for sublist in inlist for item in sublist]
    for indisc in inlist:
        ##indisc=indisc.encode('ascii','ignore').capitalize()
        # NOTE(review): encode('utf8') followed by str.replace implies
        # Python 2 byte strings — this line would break on Python 3.
        indisc = indisc.encode('utf8').replace('\n', ' ').replace('\r', ' ').strip().title()
        # Find the catalogue discipline with the highest similarity.
        maxr = 0.0
        maxdisc = ''
        for line in disctab:
            try:
                disc = line[2].strip()
                r = lvs.ratio(indisc, disc)
            except Exception as e:
                logging.error('[ERROR] %s in map_discipl : %s can not compared to %s !' % (e, indisc, disc))
                continue
            if r > maxr:
                maxdisc = disc
                maxr = r
        if maxr == 1 and indisc == maxdisc:
            logging.debug(' | Perfect match of %s : nothing to do' % indisc)
            retval.append(indisc.strip())
        elif maxr > 0.90:
            logging.debug(' | Similarity ratio %f is > 0.90 : replace value >>%s<< with best match --> %s' % (maxr, indisc, maxdisc))
            # BUGFIX: the log (and the old `##return maxdisc` remnant)
            # promise to replace the value with the best match, but the
            # original appended the unmatched input `indisc` instead.
            retval.append(maxdisc.strip())
        else:
            logging.debug(' | Similarity ratio %f is < 0.90 compare value >>%s<< and discipline >>%s<<' % (maxr, indisc, maxdisc))
            continue
    if len(retval) > 0:
        # OrderedDict.fromkeys de-duplicates while preserving order.
        retval = list(OrderedDict.fromkeys(retval))
        return ';'.join(retval)
    else:
        return 'Not stated'