python类lcut_for_search()的实例源码-面圈网

feature_extractor.py 文件源码项目：CNKICrawler 作者: roliygu 项目源码文件源码阅读 26 收藏 0 点赞 0 评论 0

def parse_item(item):
    """
    Enrich a record in place with segmented-text fields and return it.

    :param item: dictionary; the keys 'abstract', 'school', 'title' and
                 '_id' are read
    :return: the same dictionary object, with 'abstract_seq_tf',
             'abstract_seq', 'school_seq' and 'title_seq' added and
             '_id' replaced by its str() form
    """
    # build_tf yields a pair; the first element is stored as the tf entry
    # and the second as the token sequence.
    abstract_tf, abstract_tokens = build_tf(item['abstract'])
    item['abstract_seq_tf'] = abstract_tf
    item['abstract_seq'] = abstract_tokens

    # The school name is segmented with jieba's search-mode cutter.
    item['school_seq'] = jieba.lcut_for_search(item['school'])

    # Only the second element of build_tf's result is kept for the title.
    item['title_seq'] = build_tf(item['title'])[1]

    item['_id'] = str(item['_id'])
    return item

competition.py 文件源码项目：pyspider-clawswjtu 作者: HackSwjtu 项目源码文件源码阅读 19 收藏 0 点赞 0 评论 0

def __init__(self):
        """Bind the DB handles, ensure the Competition table exists, and
        pre-tokenize every entry of ``competition_list``.

        ``conn``, ``cursor``, ``competition_list`` and ``competition_split``
        are not defined in this method; they are resolved from an outer scope.
        """
        self.conn = conn
        self.cursor = cursor
        create_table_sql = '''
            CREATE TABLE IF NOT EXISTS Competition(
              id INT PRIMARY KEY AUTO_INCREMENT,
              title VARCHAR(100),
              publishdate datetime,
              detail TEXT
        )ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;'''
        self.cursor.execute(create_table_sql)
        self.conn.commit()

        for match in competition_list:
            # Search-mode segmentation of one competition name.
            tokens = jieba.lcut_for_search(match)
            # Keep only tokens that do NOT match the exclusion pattern.
            # NOTE(review): the pattern text looks mis-encoded ('?' runs) in
            # this copy of the file - verify it against the upstream source.
            kept = [
                word for word in tokens
                if re.match('??|??|??|???|??|??|??|?|??|??|??'.decode('utf8'), word) is None
            ]
            competition_split.append(kept)

cut.py 文件源码项目：dudulu 作者: MashiMaroLjc 项目源码文件源码阅读 26 收藏 0 点赞 0 评论 0

def search_cut(sentence):
    """
    Segment a sentence with jieba's search-mode cutter.

    No extra options are passed, so jieba's defaults for
    ``lcut_for_search`` apply.

    :param sentence: text to segment
    :return: whatever ``jieba.lcut_for_search`` returns for the sentence
    """
    tokens = jieba.lcut_for_search(sentence)
    return tokens

inverted_files.py 文件源码项目：Information_retrieva_Projectl- 作者: Google1234 项目源码文件源码阅读 16 收藏 0 点赞 0 评论 0

def make_inverted_index(filename,read_buff_size,output_file_record_size,web_record_numbers=100000):
    '''
    Build an inverted index for the records in ``filename``.

    Records are read in blocks, segmented with jieba, written to numbered
    intermediate files in a temporary ``<name>_buff`` directory, merged, and
    turned into a dictionary file. The merged index and the dictionary are
    then copied next to the input file and the temporary directory is removed.

    :param filename: input file path; the last four characters are stripped
                     to build output names (the original note mentions .txt)
    :param read_buff_size: buffer size forwarded to read_block, merge_file
                           and establish_ditionary
    :param output_file_record_size: number of records after which the current
                                    intermediate file is closed
    :param web_record_numbers: total record count, used only to compute the
                               progress ratio that is printed
    :return: None; results are written to
             ``<name>_inverted_index.txt`` and ``<name>_index_Dictionary.txt``
    '''
    # Reader that hands out (doc_id, content) pairs via pop_token().
    block_read=read_block(read_buff_size,filename)
    # Set of single characters treated as punctuation.
    # NOTE(review): parts of this literal look mis-encoded ('?' runs) in this
    # copy of the file - verify against the upstream source.
    punct = set(u'''/+%#:!),.:;?]}¢'"????????????????
    ?????????????????????????????
    ??•·???--?’”([{£¥'"??????????????????
    ?????????“‘-—_…''')
    # Set of single ASCII letters/digits; only one-character tokens can match.
    Letters_and_numbers=set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    buff_dir=filename[:-4]+'_buff' # temporary directory for intermediate files; removed at the end
    if os.path.exists(buff_dir):
        pass
    else:
        os.mkdir(buff_dir)
    file_numbers=1
    while True:
        # Progress estimate: records covered so far / total records.
        print "process :cuting word +making inverted_index files---->>>>",file_numbers*(output_file_record_size)*1.0/web_record_numbers
        # One intermediate file per outer iteration: <buff_dir>/<n>.txt
        spimi=SPIMI_Invert(buff_dir+'/'+str(file_numbers)+'.txt')
        count=0
        while True:
            doc_id,content=block_read.pop_token()
            # Stop on empty content (treated as end of input) or when this
            # intermediate file has received its quota of records.
            # NOTE(review): the record is popped before the quota check, so
            # the record fetched when count hits the quota appears to be
            # discarded - confirm against pop_token's semantics.
            if content==''or count==output_file_record_size:
                break
            content_list=jieba.lcut_for_search(content)
            spimi.push_id(doc_id)
            # Push every token that is neither a punctuation character nor a
            # single ASCII letter/digit.
            for j in range(len(content_list)):
                if  content_list[j] not in punct and content_list[j] not in Letters_and_numbers :
                    spimi.push_word(content_list[j])
            del content_list,doc_id,content
            count+=1
        spimi.push_word('')# empty word - presumably an end/flush marker for SPIMI_Invert; confirm
        file_numbers+=1
        # `content` is the value from the pop that ended the inner loop; an
        # empty string means the input is exhausted.
        if content=='':
            break
    print ("process :cuting word +making inverted_index files---->>>>Finish")
    # Merge the intermediate files 1..file_numbers-1 into one index file.
    merged_filename=merge_inverted_files.merge_file([str(i) for i in range(1,file_numbers)],read_buff_size,buff_dir+'/')
    print "process:mergeing inverted index files----->Finish"
    # Build the dictionary file from the merged inverted index.
    Dictionary.establish_ditionary(buff_dir+'/'+merged_filename+'.txt',read_buff_size,buff_dir+'/'+"Dictionary.txt")
    shutil.copy(buff_dir+'/'+merged_filename+'.txt',filename[:-4]+'_inverted_index.txt')# copy the merged index next to the input file
    shutil.copy(buff_dir+'/'+"Dictionary.txt",filename[:-4]+'_index_Dictionary.txt')
    shutil.rmtree(buff_dir)# remove the temporary directory and everything in it
    del merged_filename,buff_dir,punct,Letters_and_numbers

simpleDrQA.py 文件源码项目：DrQA_cn 作者: AmoseKang 项目源码文件源码阅读 21 收藏 0 点赞 0 评论 0

def releventScore(self, text, ques, tfidf=None):
        """Score how relevant ``text`` is to the question ``ques``.

        Both strings are segmented with ``jieba.lcut_for_search`` and stripped
        of stop words. Every (text token, question token) pair contributes:

        * ``rate * 2.5`` when the tokens are equal;
        * ``similarity * rate`` when their similarity exceeds 0.4;
        * nothing otherwise.

        ``rate`` is ``tfidf[text_token]`` when present, else 1. The sum is
        normalised by both token counts and multiplied by 100.

        :param text: candidate passage
        :param ques: question string
        :param tfidf: optional mapping from token to weight; ``None`` (the
                      default) is treated as an empty mapping
        :return: the relevance score, or 0 when either side has no tokens
                 left after stop-word filtering
        """
        # Avoid a shared mutable default; None means "no weights supplied".
        if tfidf is None:
            tfidf = {}

        def filtWord(li):
            # Drop stop words, keeping the original order.
            return [tok for tok in li if tok not in STOPWORDS]

        def sims(t, q):
            # Cosine similarity when both tokens have a vector in self.dic.
            if t in self.dic and q in self.dic:
                dot_product = 0.0
                normA = 0.0
                normB = 0.0
                for a, b in zip(self.dic[t], self.dic[q]):
                    dot_product += a * b
                    normA += a**2
                    normB += b**2
                if normA == 0.0 or normB == 0.0:
                    return 0
                return dot_product / ((normA * normB)**0.5)

            # Otherwise fall back to a normalised edit-distance similarity,
            # scaled by 0.7. The distance is computed once.
            longest = max(len(t), len(q))
            dist = Levenshtein.distance(t, q)
            if dist < longest:
                return (longest - dist) / longest * 0.7
            return 0

        ttoks = filtWord(jieba.lcut_for_search(text))
        qtoks = filtWord(jieba.lcut_for_search(ques))

        # Guard both sides: an empty question token list would otherwise
        # divide by zero in the normalisation below.
        if not ttoks or not qtoks:
            return 0

        score = 0
        for tword in ttoks:
            # The weight depends only on the text token, so look it up once.
            rate = tfidf.get(tword, 1)
            for qword in qtoks:
                if tword == qword:
                    # exact match
                    score += rate * 2.5
                    continue
                similarity = sims(tword, qword)
                if similarity > 0.4:
                    # similar
                    score += similarity * rate

        # remove advantage of length
        return score / len(ttoks) / len(qtoks) * 100