python类ratio()的实例源码

dist_utils.py 文件源码 项目:kaggle-quora-solution-8th 作者: qqgeogor 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def _edit_dist(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2)))
    except:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x==" ", str1, str2).ratio()
    return d
main.py 文件源码 项目:phat 作者: danielfranca 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def similar_link_visited(link_url, links, fuzzy):
    """Return True when any already-visited link is fuzzy-similar to link_url.

    A link counts as "similar" when its Levenshtein ratio against link_url
    reaches the `fuzzy` threshold.
    """
    return any(ratio(link_url, visited) >= fuzzy for visited in links)
plugin.py 文件源码 项目:phat 作者: danielfranca 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def check(self):
        headers = self.item_options.get('headers', {})
        cookies = self.item_options.get('cookies', {})
        username = self.global_options.get('username')
        password = self.global_options.get('password')

        r2 = requests.get(self.url2, headers=headers,
                          auth=HTTPBasicAuth(username, password), allow_redirects=True, cookies=cookies)

        logger.info("Comparing urls...")
        if self.fuzzy == 1.0:
            self.ok(self.response.text == r2.text,
                    'Urls don\'t have equal content: {tested} and {reference}'.format(tested=self.url,
                                                                                      reference=self.url2))
        else:
            actual_ratio = ratio(self.response.text, r2.text)
            self.ok(actual_ratio > self.fuzzy,
                    """
                    Urls don\'t have sufficiently similar content: {tested} and {reference} (expected {expected}, got {actual})
                    """
                    .format(
                        tested=self.url,
                        reference=self.url2,
                        expected=self.fuzzy,
                        actual=actual_ratio))

        return self.is_ok()
dac.py 文件源码 项目:dac 作者: jlonij 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def set_levenshtein(self):
        '''
        Mean and max Levenshtein ratio for all labels.
        '''
        if not [f for f in self.features if f.startswith('match_str_lsr')]:
            return

        ne = self.cluster.entities[0].norm

        # Pref label
        l = self.document.get('pref_label')
        self.match_str_lsr_pref = Levenshtein.ratio(ne, l)

        # Wikidata alt labels
        if self.document.get('wd_alt_label'):
            wd_labels = self.document.get('wd_alt_label')
            ratios = [Levenshtein.ratio(ne, l) for l in wd_labels]
            self.match_str_lsr_wd_max = max(ratios) - 0.5
            self.match_str_lsr_wd_mean = (sum(ratios) /
                float(len(wd_labels))) - 0.375
        else:
            wd_labels = []

        # Any other alt labels
        if self.document.get('alt_label'):
            labels = self.document.get('alt_label')
            labels = [l for l in labels if l not in wd_labels]
            if labels:
                ratios = [Levenshtein.ratio(ne, l) for l in labels]
                self.match_str_lsr_alt_max = max(ratios) - 0.5
                self.match_str_lsr_alt_mean = (sum(ratios) /
                        float(len(labels))) - 0.375
preprocessingSQ.py 文件源码 项目:nlpcc2016 作者: huangxiangzhou 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def appendWordNetStemmingDict(inputPath='stemmingDict.old', outputPath='stemmingDict', outputEncoding='utf8'):
    """Merge WordNet surface->stem mappings into an existing stemming dict.

    Loads the JSON dict at inputPath (values presumably lists of
    [stem, score] pairs -- TODO confirm against the producer), adds every
    "surface stem" pair found in 'wordnet.map', rescores each stem with the
    Levenshtein ratio against its key, then writes the result to outputPath
    as JSON and to outputPath + '.txt' as a plain-text dump.
    """
    distance = Levenshtein.ratio

    # Load the old dict and collapse each entry to the set of its stems.
    with open(inputPath, 'r', encoding='utf8') as fin:
        oldDict = json.load(fin)
    for key in list(oldDict):
        oldDict[key] = {pair[0] for pair in oldDict[key]}

    # Merge in WordNet mappings, one "surface stem" pair per line.
    with open('wordnet.map', 'r', encoding='utf8') as fi:
        for line in fi:
            parts = line.strip().split(' ')
            if len(parts) < 2:
                # Skip malformed lines. The original tested len == 0, which
                # split() can never produce, and then crashed on parts[1].
                continue
            oldDict.setdefault(parts[0], set()).add(parts[1])

    # Attach a Levenshtein similarity score to every stem.
    for key in list(oldDict):
        entries = list(oldDict[key])
        for i in range(len(entries)):
            stem = entries[i]
            if not isinstance(stem, str) or not isinstance(key, str):
                # Unexpected non-string entry: report and leave it unscored.
                # (The original blocked on input() here, halting batch runs.)
                print(entries)
                continue
            entries[i] = [stem, distance(stem, key)]
        oldDict[key] = entries

    # with-blocks close/flush deterministically; the original leaked fi and fo
    # (fo was never closed after json.dump, risking an unflushed JSON file).
    with open(outputPath, 'w', encoding='utf8') as fo:
        json.dump(oldDict, fo)

    with open(outputPath + '.txt', 'w', encoding=outputEncoding) as fotxt:
        for key in oldDict:
            fotxt.write(key + ' ' + str(oldDict[key]) + '\n')

##
##print('Dumping stemming mpping to json format......')
##generateStemmingDict()
##appendWordNetStemmingDict()
##print('Done!')
coreMFAPCount.py 文件源码 项目:nlpcc2016 作者: huangxiangzhou 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def calScoreSub(self, countCharDict):

        distance = Levenshtein.ratio
        q = self.qRaw
        scoreSub = 0

        sub = ''

        if type(self.sub) == str:

            sub = self.sub
            subSplit = sub.split(' ')
            if sub in q:   
                for w in subSplit:
                    if w in countCharDict:
                        scoreSub += 1/(countCharDict[w] + 1)
                    else:
                        scoreSub += 1
            else:
                subSet = set(subSplit)
                qSet = set(q.split(' '))
                for w in (subSet & qSet):
                    if w in countCharDict:
                        scoreSub += 1/(countCharDict[w] + 1)
                    else:
                        scoreSub += 1
                if len(subSet) != 0:
                    scoreSub = scoreSub/len(subSet)


        if type(self.sub) == list:
            for s in self.sub[0]:
                sub += s + ' '
            sub = sub.strip()


        if type(self.sub) == list:
            if len(self.sub[0]) == len(self.sub[1]):
                lenSub = len(self.sub[0])
                for i in range(lenSub):
                    w = self.sub[0][i]
                    wC = self.sub[1][i]
                    if w in countCharDict:
                        scoreSub += 1/(countCharDict[w] + 1)*distance(w,wC)
                    else:
                        scoreSub += 1*distance(w,wC)
                scoreSub = scoreSub / lenSub

            else:
                subIntersaction = set(self.sub[0]) & set(self.sub[1])
                scoreSub = len(subIntersaction) / len(set(self.sub[0]) | set(self.sub[1]))



        self.scoreSub = scoreSub

        return scoreSub
coreMFAPCount.py 文件源码 项目:nlpcc2016 作者: huangxiangzhou 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def calScorePreLast(self, countCharDict,qWithoutSubSet,stemmingDict):

        distance = Levenshtein.ratio
        pre = self.pre
        scorePre = 0

        lastPreIndex = pre.rfind('.')
        if lastPreIndex != -1:
            preLowerSet = set(re.split(r' ',pre[lastPreIndex+1:]))
        else:
            preLowerSet = set(re.split(r' ',pre))

        preLower = list(preLowerSet)
        preLowerSet = set()

        for i in range(len(preLower)):
            if preLower[i] in stemmingDict:
                preLower[i] = stemmingDict[preLower[i]][0][0]
            preLowerSet.add(preLower[i])


        maxIntersection = qWithoutSubSet & preLowerSet



        preFactor = 0
        for char in maxIntersection:
            if char in countCharDict:
                preFactor += 1/(countCharDict[char] + 1)
            else:
                preFactor += 1


        if len(maxIntersection) == 0:
            for w1 in qWithoutSubSet:
                for w2 in preLowerSet:
                    if w1 == '' or w2 == '' or w1[0] != w2[0]:
                        continue
                    div = 1
                    if w1 in countCharDict:
                        div = countCharDict[w1] + 1
                    dWord = distance(w1,w2) / div
                    if preFactor < dWord:
                        preFactor = dWord



        if len(pre) != 0:
            scorePre = preFactor / len(qWithoutSubSet | preLowerSet)
        else:
            scorePre = 0



        self.scorePreLast = scorePre


        return scorePre
coreMFAPCount.py 文件源码 项目:nlpcc2016 作者: huangxiangzhou 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def calScorePreAll(self, countCharDict, qWithoutSubSet,stemmingDict):

        distance = Levenshtein.ratio
        pre = self.pre
        scorePre = 0

        preLowerSet = set(re.split(r' |\.',pre))

        preLower = list(preLowerSet)
        preLowerSet = set()

        for i in range(len(preLower)):
            if preLower[i] in stemmingDict:
                preLower[i] = stemmingDict[preLower[i]][0][0]
            preLowerSet.add(preLower[i])


        maxIntersection = qWithoutSubSet & preLowerSet



        preFactor = 0
        for char in maxIntersection:
            if char in countCharDict:
                preFactor += 1/(countCharDict[char] + 1)
            else:
                preFactor += 1


        if len(maxIntersection) == 0:
            for w1 in qWithoutSubSet:
                for w2 in preLowerSet:
                    if w1 == '' or w2 == '' or w1[0] != w2[0]:
                        continue
                    div = 1
                    if w1 in countCharDict:
                        div = countCharDict[w1] + 1
                    dWord = distance(w1,w2) / div
                    if preFactor < dWord:
                        preFactor = dWord



        if len(pre) != 0:
            scorePre = preFactor / len(qWithoutSubSet | preLowerSet)
        else:
            scorePre = 0



        self.scorePreAll = scorePre


        return scorePre
coreMF.py 文件源码 项目:nlpcc2016 作者: huangxiangzhou 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def calScorePreLast(self, countCharDict,qWithoutSubSet,stemmingDict):

        distance = Levenshtein.ratio
        pre = self.pre
        scorePre = 0

        lastPreIndex = pre.rfind('.')
        if lastPreIndex != -1:
            preLowerSet = set(re.split(r' ',pre[lastPreIndex+1:]))
        else:
            preLowerSet = set(re.split(r' ',pre))

        preLower = list(preLowerSet)
        preLowerSet = set()

        for i in range(len(preLower)):
            if preLower[i] in stemmingDict:
                preLower[i] = stemmingDict[preLower[i]][0][0]
            preLowerSet.add(preLower[i])


        maxIntersection = qWithoutSubSet & preLowerSet



        preFactor = 0
        for char in maxIntersection:
            if char in countCharDict:
                preFactor += 1/(countCharDict[char] + 1)
            else:
                preFactor += 1


        if len(maxIntersection) == 0:
            for w1 in qWithoutSubSet:
                for w2 in preLowerSet:
                    if w1 == '' or w2 == '' or w1[0] != w2[0]:
                        continue
                    div = 1
                    if w1 in countCharDict:
                        div = countCharDict[w1] + 1
                    dWord = distance(w1,w2) / div
                    if preFactor < dWord:
                        preFactor = dWord



        if len(pre) != 0:
            scorePre = preFactor / len(qWithoutSubSet | preLowerSet)
        else:
            scorePre = 0



        self.scorePreLast = scorePre


        return scorePre
coreMF.py 文件源码 项目:nlpcc2016 作者: huangxiangzhou 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def calScorePreAll(self, countCharDict, qWithoutSubSet,stemmingDict):

        distance = Levenshtein.ratio
        pre = self.pre
        scorePre = 0

        preLowerSet = set(re.split(r' |\.',pre))

        preLower = list(preLowerSet)
        preLowerSet = set()

        for i in range(len(preLower)):
            if preLower[i] in stemmingDict:
                preLower[i] = stemmingDict[preLower[i]][0][0]
            preLowerSet.add(preLower[i])


        maxIntersection = qWithoutSubSet & preLowerSet



        preFactor = 0
        for char in maxIntersection:
            if char in countCharDict:
                preFactor += 1/(countCharDict[char] + 1)
            else:
                preFactor += 1


        if len(maxIntersection) == 0:
            for w1 in qWithoutSubSet:
                for w2 in preLowerSet:
                    if w1 == '' or w2 == '' or w1[0] != w2[0]:
                        continue
                    div = 1
                    if w1 in countCharDict:
                        div = countCharDict[w1] + 1
                    dWord = distance(w1,w2) / div
                    if preFactor < dWord:
                        preFactor = dWord



        if len(pre) != 0:
            scorePre = preFactor / len(qWithoutSubSet | preLowerSet)
        else:
            scorePre = 0



        self.scorePreAll = scorePre


        return scorePre
test_unique.py 文件源码 项目:Quora-Kaggle 作者: PPshrimpGo 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def get_features(df_features):
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
   #df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge(x['question1'], x['question2']), axis = 1)
    print('get_w2v')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S') 
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis = 1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis = 1)

    df_features['q1_unique_w2v_weight'] = df_features.q1_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q2_unique_w2v_weight'] = df_features.q2_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q1_unique_w2v'] = df_features.q1_unique.map(lambda x: get_weight_vector(" ".join(x)))
    df_features['q2_unique_w2v'] = df_features.q2_unique.map(lambda x: get_weight_vector(" ".join(x)))

    print('z_dist')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S') 
    #df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print now.strftime('%Y-%m-%d %H:%M:%S') 
    #df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_calc')
    print now.strftime('%Y-%m-%d %H:%M:%S') 

    #df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_unique_dis_e_weight'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)    

    df_features['z_w2v_unique_dis_mink_w'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight'],3), axis=1)
    df_features['z_w2v_unique_dis_cityblock_w'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_canberra_w'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)

    df_features['z_w2v_unique_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v'], x['q2_unique_w2v'],3), axis=1)
    df_features['z_w2v_unique_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)

    df_features['z_q1_unique_skew_w'] = df_features.q1_unique_w2v_weight.map(lambda x:skew(x))
    df_features['z_q2_unique_skew_w'] = df_features.q2_unique_w2v_weight.map(lambda x:skew(x))
    df_features['z_q1_unique_kur_w'] = df_features.q1_unique_w2v_weight.map(lambda x:kurtosis(x))
    df_features['z_q2_unique_kur_w'] = df_features.q2_unique_w2v_weight.map(lambda x:kurtosis(x))


    df_features['z_q1_unique_skew'] = df_features.q1_unique_w2v.map(lambda x:skew(x))
    df_features['z_q2_unique_skew'] = df_features.q2_unique_w2v.map(lambda x:skew(x))
    df_features['z_q1_unique_kur'] = df_features.q1_unique_w2v.map(lambda x:kurtosis(x))
    df_features['z_q2_unique_kur'] = df_features.q2_unique_w2v.map(lambda x:kurtosis(x))
    del df_features['q1_unique_w2v_weight']
    del df_features['q2_unique_w2v_weight']
    del df_features['q1_unique_w2v']
    del df_features['q2_unique_w2v']
    print('all done')
    print now.strftime('%Y-%m-%d %H:%M:%S') 
    df_features.fillna(0.0)
    return df_features
hu2004.py 文件源码 项目:opminreplicability 作者: epochx 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def __init__(self,
                 stopwords=NLTKStopwords(),
                 min_support=MIN_SUPPORT,
                 max_words=MAX_WORDS,
                 min_psupport=MIN_PSUPPORT,
                 min_compact_support=MIN_COMPACT_SUPPORT,
                 max_compact_distance=MAX_COMPACT_DISTANCE,
                 adj_key=StemKey(),
                 adj_win_size=ADJ_NEARBY_DISTANCE ,
                 match=85,
                 compactness=True,
                 redundancy=True,
                 infrequent=True):
        """
        Model to extract aspects using the algorithm by Hu et al. (2004)

            stopwords             : iterable of strings to use as stopwords
            min_support           : int, minimum support of an item set
                                    (positive: percentage, negative: absolute
                                    number of transactions)
            min_compact_support   : int minimum number of compact sentences
                                    of an aspect
            max_words             : int, maximum number of word on each aspect,
            max_compact_distance  : int, maximum distance between consecutive
                                    words in an aspect
            adj_win_size          : int, maximum distance to look for
                                    adjectives near an aspect on a sentence
            min_psupport          : int, minimum pure support of an aspect
            adj_key               : lambda function to extract adjectives
            match                 : int, minimum similarity ratio (0-100] for
                                    matching (use <100 for fuzzy) default=
            compactness           : boolean, True to run "compactness pruning"
            redundancy            : boolean, True to run "redundancy pruning"
            infrequent            : boolean, True to also extract infrequent
                                    aspects
        """
        self.params = {"stopwords": stopwords,
                       "min_support": min_support,
                       "max_words": max_words,
                       "min_psupport": min_psupport,
                       "min_compact_support": min_compact_support,
                       "max_compact_distance": max_compact_distance,
                       "adj_key": adj_key,
                       "adj_win_size": adj_win_size,
                       "match": match,
                       "compactness": compactness,
                       "redundancy": redundancy,
                       "infrequent": infrequent}
mdmanager.py 文件源码 项目:B2FIND-Training 作者: EUDAT-Training 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def map_discipl(self,invalue,disctab):
        """
        Convert disciplines along B2FIND disciplinary list

        Copyright (C) 2014 Heinrich Widmann
        Licensed under AGPLv3.
        """

        retval=list()
        if type(invalue) is not list :
            inlist=re.split(r'[;&\s]\s*',invalue)
            inlist.append(invalue)
        else:
            seplist=[re.split(r"[;&]",i) for i in invalue]
            swlist=[re.findall(r"[\w']+",i) for i in invalue]
            inlist=swlist+seplist
            inlist=[item for sublist in inlist for item in sublist]
        for indisc in inlist :
           ##indisc=indisc.encode('ascii','ignore').capitalize()
           indisc=indisc.encode('utf8').replace('\n',' ').replace('\r',' ').strip().title()
           maxr=0.0
           maxdisc=''
           for line in disctab :
             try:
               disc=line[2].strip()
               r=lvs.ratio(indisc,disc)
             except Exception as e:
                 logging.error('[ERROR] %s in map_discipl : %s can not compared to %s !' % (e,indisc,disc))
                 continue
             if r > maxr  :
                 maxdisc=disc
                 maxr=r
                 ##HEW-T                   print('--- %s \n|%s|%s| %f | %f' % (line,indisc,disc,r,maxr)
           if maxr == 1 and indisc == maxdisc :
               logging.debug('  | Perfect match of %s : nothing to do' % indisc)
               retval.append(indisc.strip())
           elif maxr > 0.90 :
               logging.debug('   | Similarity ratio %f is > 0.90 : replace value >>%s<< with best match --> %s' % (maxr,indisc,maxdisc))
               ##return maxdisc
               retval.append(indisc.strip())
           else:
               logging.debug('   | Similarity ratio %f is < 0.90 compare value >>%s<< and discipline >>%s<<' % (maxr,indisc,maxdisc))
               continue

        if len(retval) > 0:
            retval=list(OrderedDict.fromkeys(retval)) ## this elemenates real duplicates
            return ';'.join(retval)
        else:
            return 'Not stated'


问题


面经


文章

微信
公众号

扫码关注公众号