def tfidf_cluster_feature(data_txt_path, word2vec_distr_path, save_path, df_path, nDoc):
    """Compute a TF-IDF-weighted word2vec cluster feature for every document.

    Reads one whitespace-tokenized document per line from ``data_txt_path``,
    computes a fixed-size feature vector per document via
    ``compute_tfidf_cluster_feat``, and streams the (N, DIM) float32 matrix
    into the HDF5 dataset ``'feature'`` at ``save_path``.

    Parameters
    ----------
    data_txt_path : str
        Text file, one document per line; tokens separated by single spaces.
    word2vec_distr_path : str
        Pickled dict mapping word -> 1-D numpy vector (cluster distribution).
    save_path : str
        Output HDF5 file path (overwritten).
    df_path : str
        Pickled document-frequency table consumed by the per-doc feature fn.
    nDoc : int
        Corpus size used for the IDF term — may differ from the number of
        lines in ``data_txt_path`` (presumably the full-corpus count;
        TODO confirm against callers).
    """
    # Open pickles in binary mode and close every handle deterministically —
    # the original leaked all three file objects and text-mode pickle loading
    # breaks on Python 3.
    with open(word2vec_distr_path, 'rb') as f:
        word2vec_distr = pk.load(f)
    with open(df_path, 'rb') as f:
        DF = pk.load(f)
    with open(data_txt_path) as f:
        docs = f.readlines()

    N = len(docs)
    # Feature dimensionality = length of any one distribution vector.
    # next(iter(...)) works on Python 2 and 3; dict.values()[0] raises on 3.
    DIM = next(iter(word2vec_distr.values())).shape[0]

    t0 = time.time()
    # Context manager guarantees the HDF5 file is closed even if a row fails.
    with h5py.File(save_path, 'w') as h5file:
        feat = h5file.create_dataset('feature', shape=(N, DIM), dtype=np.float32)
        for idx, doc in enumerate(docs):
            words = doc.strip().split(' ')
            feat[idx, :] = compute_tfidf_cluster_feat(words, DF, nDoc, word2vec_distr)
            # Periodic progress report every 10k documents.
            if idx % 10000 == 0:
                t = time.time() - t0
                print('#%d, t = %d mins' % (idx, t / 60))
    print('saved to %s' % save_path)
tfidf_weighted_word2vec.py 文件源码
python
阅读 39
收藏 0
点赞 0
评论 0
评论列表
文章目录