def doc2word2vec(data_txt_path, word2vec_model, save_path, dim=300, length=10):
    """Build a fixed-size word2vec feature per document and save to HDF5.

    Does not use tf-idf values as coefficients, usually because
    data_txt_path is already a tfidf-sorted text.

    length == 1: mean of the word vectors.
    length  > 1: concatenation of the first `length` word vectors.

    Parameters
    ----------
    data_txt_path : str, text file with one space-separated document per line.
    word2vec_model : str, path to a pickled {word: vector} model.
    save_path : str, output HDF5 path; feature matrix stored under 'feature'.
    dim : int, dimensionality of each word vector.
    length : int, number of vector slots per document (see above).
    """
    # Pickles are binary: open with 'rb' ('r' breaks on Python 3 and can
    # corrupt reads on platforms with newline translation).
    with open(word2vec_model, 'rb') as fmodel:
        word2vec = pk.load(fmodel)
    with open(data_txt_path) as fdocs:
        docs = fdocs.readlines()
    N = len(docs)
    feat = np.zeros((N, dim * length), dtype=np.float32)
    t0 = time.time()
    for idx, doc in enumerate(docs):
        words = doc.strip().split(' ')
        feat[idx, :] = create_vec_from_words(words, word2vec, dim, length)
        # Coarse progress report every 10k documents.
        if idx % 10000 == 0:
            t = time.time() - t0
            print('# %d, t = %d minutes' % (idx, t / 60))
    # Context manager guarantees the HDF5 file is closed even if the
    # dataset write raises.
    with h5py.File(save_path, 'w') as h5file:
        h5file.create_dataset('feature', data=feat, dtype=np.float32)
    print('saved to %s' % save_path)
# Source file: tfidf_weighted_word2vec.py
# (Trailing blog-platform widget text — view/favorite/like/comment counters —
# removed; it was scrape residue, not part of the source.)