inverted_files.py 文件源码

python
阅读 16 收藏 0 点赞 0 评论 0

项目:Information_retrieva_Projectl- 作者: Google1234 项目源码 文件源码
def make_inverted_index(filename,read_buff_size,output_file_record_size,web_record_numbers=100000):
    '''
    Build an inverted index and its dictionary from a text file of documents.

    Documents are pulled one at a time from ``read_block(read_buff_size, filename)``,
    segmented with ``jieba.lcut_for_search`` and written through ``SPIMI_Invert``
    into numbered partial files (``1.txt``, ``2.txt``, ...) inside a working
    directory ``<base>_buff``, where ``<base>`` is ``filename`` without its last
    four characters. The partial files are then merged, a dictionary is built
    from the merged file, both results are copied to
    ``<base>_inverted_index.txt`` and ``<base>_index_Dictionary.txt``, and the
    working directory is deleted (even if it already existed before the call).

    :param filename: path of the input file; the last four characters are
        stripped to derive the output names, so a ``.txt`` suffix is expected
    :param read_buff_size: buffer size forwarded unchanged to ``read_block``,
        ``merge_inverted_files.merge_file`` and ``Dictionary.establish_ditionary``
    :param output_file_record_size: number of documents written to each partial
        index file; must not be 0
    :param web_record_numbers: denominator of the printed progress ratio
        (presumably the total number of documents -- confirm with callers)
    :return: None
    :raises ValueError: if ``output_file_record_size`` is 0
    '''
    if output_file_record_size == 0:
        # A zero-sized block can never accept a document, so no progress
        # through the input would be possible.
        raise ValueError("output_file_record_size must not be 0")

    block_read=read_block(read_buff_size,filename)
    punct = set(u'''/+%#:!),.:;?]}¢'"????????????????
    ?????????????????????????????
    ??•·???--?’”([{£¥'"??????????????????
    ?????????“‘-—_…''')
    letters_and_numbers=set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    # Tokens equal to a single punctuation mark, ASCII letter or digit are not indexed.
    ignored_tokens=punct|letters_and_numbers

    base_name=filename[:-4]
    buff_dir=base_name+'_buff'  # working directory holding the partial index files
    if not os.path.exists(buff_dir):
        os.mkdir(buff_dir)

    file_numbers=1
    pending=None      # document already popped but not yet indexed (block was full)
    exhausted=False   # set once pop_token() yields an empty content string
    while not exhausted:
        print("process :cuting word +making inverted_index files---->>>> %s" % (file_numbers*(output_file_record_size)*1.0/web_record_numbers))
        spimi=SPIMI_Invert(buff_dir+'/'+str(file_numbers)+'.txt')
        count=0
        while True:
            if pending is None:
                doc_id,content=block_read.pop_token()
            else:
                doc_id,content=pending
                pending=None
            if content=='':
                # An empty content string is treated as end of input.
                exhausted=True
                break
            if count==output_file_record_size:
                # The current partial file is full: keep the document we just
                # popped for the next partial file instead of discarding it.
                pending=(doc_id,content)
                break
            content_list=jieba.lcut_for_search(content)
            spimi.push_id(doc_id)
            for token in content_list:
                if token not in ignored_tokens:
                    spimi.push_word(token)
            count+=1
        # NOTE(review): the empty word presumably tells SPIMI_Invert to finish
        # the current partial file -- confirm against its implementation.
        spimi.push_word('')
        file_numbers+=1
    print ("process :cuting word +making inverted_index files---->>>>Finish")

    # Merge every partial file ("1" .. str(file_numbers-1)) into a single index.
    merged_filename=merge_inverted_files.merge_file([str(i) for i in range(1,file_numbers)],read_buff_size,buff_dir+'/')
    print("process:mergeing inverted index files----->Finish")

    # Build the dictionary from the merged index, then publish both files.
    merged_path=buff_dir+'/'+merged_filename+'.txt'
    dictionary_path=buff_dir+'/'+"Dictionary.txt"
    Dictionary.establish_ditionary(merged_path,read_buff_size,dictionary_path)
    shutil.copy(merged_path,base_name+'_inverted_index.txt')
    shutil.copy(dictionary_path,base_name+'_index_Dictionary.txt')
    shutil.rmtree(buff_dir)  # remove the working directory and all partial files
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号