def compute_word2vec(docs, DF, nDoc, model, vecDim=300):
    """Build one TF-IDF weighted word-vector feature per document.

    Each document is split on single spaces; every distinct token
    contributes ``tfidf * word2vec(model, token)`` to the document vector,
    where ``tfidf = (tf / nWord) * log2(nDoc / DF[token])``.

    Args:
        docs: Sequence of strings whose tokens are separated by single spaces.
        DF: Mapping from token to the value used as its document frequency
            in the IDF term.
        nDoc: Document count used as the numerator of the IDF term.
        model: Opaque embedding model, passed through to ``word2vec``.
        vecDim: Length of each embedding vector (and of each feature row).

    Returns:
        A ``(feat, nonExist_vocab)`` tuple. ``feat`` is a float32 array of
        shape ``(len(docs), vecDim)``. ``nonExist_vocab`` is the dict that
        ``update_vocab`` filled with the tokens that had to be skipped.
    """
    N = len(docs)
    nonExist_vocab = {}
    # Size the output from vecDim (it used to be hard-coded to 300), so that
    # models with a different embedding width fit the rows assigned below.
    feat = np.zeros((N, vecDim), dtype=np.float32)
    for idx, doc in enumerate(docs):
        nonExist_list = []
        TF = {}
        spt = doc.split(' ')
        nWord = len(spt)
        # NOTE(review): update_vocab is defined elsewhere; presumably it adds
        # per-token occurrence counts into the dict -- confirm.
        update_vocab(TF, spt)
        vec = np.zeros(vecDim, dtype=np.float32)
        for word, tf in TF.items():
            try:
                tfidf = float(tf) / nWord * np.log2(float(nDoc) / DF[word])
                vec += tfidf * word2vec(model, word)
            except Exception:
                # Best effort: a token missing from DF or rejected by the
                # embedding lookup is recorded and skipped. Unlike a bare
                # ``except``, this lets KeyboardInterrupt/SystemExit through.
                nonExist_list.append(word)
        feat[idx, :] = vec
        update_vocab(nonExist_vocab, nonExist_list)
        if idx % 10000 == 0:
            # Progress marker; single-argument print() emits the same text
            # on Python 2 and Python 3.
            print('# %d' % idx)
    print('nonExist: %d' % len(nonExist_vocab))
    return feat, nonExist_vocab
tfidf_from_seg.py 文件源码
python
阅读 23
收藏 0
点赞 0
评论 0
评论列表
文章目录