tfidf_from_seg.py 文件源码-python代码片段

tfidf_from_seg.py 文件源码

python

阅读 23 收藏 0 点赞 0 评论 0

项目：Sohu-LuckData-Image-Text-Matching-Competition 作者: WeitaoVan 项目源码文件源码

def compute_word2vec(docs, DF, nDoc, model, vecDim=300):
    """Build one TF-IDF-weighted sum of word vectors per document.

    Each document is split on single spaces. For every distinct word the
    weight is ``(tf / nWord) * log2(nDoc / DF[word])``, where ``nWord`` is
    the number of space-separated tokens in the document, and the weighted
    vectors returned by ``word2vec(model, word)`` are summed.

    Args:
        docs: Sequence of strings (it must support ``len``), each a
            space-separated list of words.
        DF: Mapping indexed by word; its value divides ``nDoc`` in the
            weight (presumably the word's document frequency -- confirm
            against the caller).
        nDoc: Numerator of the IDF ratio (presumably the corpus size).
        model: Opaque object handed unchanged to the ``word2vec`` helper.
        vecDim: Length of the vectors produced by ``word2vec``; also the
            number of columns of the returned matrix.

    Returns:
        A ``(feat, nonExist_vocab)`` tuple. ``feat`` is a float32 array of
        shape ``(len(docs), vecDim)``. ``nonExist_vocab`` is the dict that
        ``update_vocab`` fills with the words that could not be weighted
        or embedded (presumably word -> count; see ``update_vocab``).
    """
    N = len(docs)
    nonExist_vocab = {}
    # Size the output from vecDim: a hard-coded width breaks the row
    # assignment below for any model whose vectors are not 300-d.
    feat = np.zeros((N, vecDim), dtype=np.float32)
    for idx, doc in enumerate(docs):
        nonExist_list = []
        TF = {}
        spt = doc.split(' ')
        nWord = len(spt)
        # Presumably accumulates per-word counts of this document into TF.
        update_vocab(TF, spt)
        vec = np.zeros(vecDim, dtype=np.float32)
        for word, tf in TF.items():
            try:
                tfidf = 1.0 * tf / nWord * np.log2(1.0 * nDoc / DF[word])
                vec += tfidf * word2vec(model, word)
            except Exception:
                # Best effort: a word missing from DF or from the model
                # (or otherwise unusable) is skipped and recorded.
                # Catching Exception instead of a bare except lets
                # KeyboardInterrupt / SystemExit propagate.
                nonExist_list.append(word)
        feat[idx, :] = vec
        update_vocab(nonExist_vocab, nonExist_list)
        # Progress report every 10000 documents. The single parenthesised
        # argument prints identically under Python 2 and Python 3.
        if idx % 10000 == 0:
            print('# %d' % idx)
            print('nonExist: %d' % len(nonExist_vocab))
    return feat, nonExist_vocab