tfidf_from_seg.py 文件源码-python代码片段

tfidf_from_seg.py 文件源码

python

阅读 38 收藏 0 点赞 0 评论 0

项目：Sohu-LuckData-Image-Text-Matching-Competition 作者: WeitaoVan 项目源码文件源码

def tfidf(data_txt_path, df_path, nDoc, word2id_path, save_path):
    """Compute one TF-IDF vector per input line and store them in an HDF5 file.

    Each line of ``data_txt_path`` is treated as one document. Every document
    is passed to ``compute_tfidf`` and the result is written as one row of a
    float32 dataset named ``'feature'`` with shape
    ``(number of lines, len(word2id))`` inside ``save_path``.

    Args:
        data_txt_path: Path to the text file holding one document per line.
        df_path: Path to a pickle file; the unpickled object is forwarded to
            ``compute_tfidf`` (presumably document-frequency counts -- confirm
            against ``compute_tfidf``).
        nDoc: Forwarded unchanged to ``compute_tfidf`` (presumably the corpus
            size used for the IDF term -- confirm against ``compute_tfidf``).
        word2id_path: Path to a pickle file holding the ``word2id`` mapping;
            its length sets the feature dimension.
        save_path: Output HDF5 path. Opened with mode ``'w'``, so an existing
            file at this path is overwritten.

    Returns:
        None. The features are written to ``save_path`` and progress is
        printed to stdout.
    """
    t0 = time.time()

    with open(data_txt_path) as doc_file:
        docs = doc_file.readlines()

    # NOTE(security): unpickling can execute arbitrary code; only pass paths
    # to pickle files that come from a trusted source.
    # Pickles are binary data, so open them in 'rb' (text mode breaks on
    # Python 3 and on platforms that translate line endings).
    with open(word2id_path, 'rb') as word2id_file:
        word2id = pk.load(word2id_file)
    print('word2id loaded from %s' % word2id_path)

    # Load DF before creating the output file so that a bad df_path does not
    # leave an empty HDF5 file behind.
    with open(df_path, 'rb') as df_file:
        DF = pk.load(df_file)

    N = len(docs)
    DIM = len(word2id)

    # The context manager closes (and flushes) the HDF5 file even when
    # compute_tfidf raises part-way through the corpus.
    with h5py.File(save_path, 'w') as h5file:
        h5set = h5file.create_dataset('feature', shape=(N, DIM),
                                      dtype=np.float32)
        print('dataset created, shape (%d, %d)' % (N, DIM))

        for idx, doc in enumerate(docs):
            # Assigning into the dataset already copies the data, so no
            # explicit .copy() of the feature vector is needed.
            h5set[idx, :] = compute_tfidf(doc, DIM, DF, nDoc, word2id)
            if idx % 10000 == 0:
                elapsed = time.time() - t0
                print('# %d, t = %f hours' % (idx, elapsed / 3600.))

    print('TF-IDF feature saved to %s' % save_path)