def build_w2v_matrix(vocab_processor, w2v_path, vector_path, dim_size):
    """Build an embedding matrix for a vocabulary and pickle it to disk.

    Reads a text file of pre-trained vectors (one ``word v1 v2 ...`` entry
    per line, space separated), then creates a float32 matrix with one row
    per entry of ``vocab_processor._reverse_mapping``.

    Row 0 is left as zeros. Every other row is the pre-trained vector for
    that vocabulary entry when one was read from ``vector_path``; otherwise
    it is whatever ``get_unknown_word_vec(dim_size)`` returns.

    Args:
        vocab_processor: object exposing ``_reverse_mapping``, a sequence
            whose position ``i`` holds the token with id ``i``.
        w2v_path: path the resulting matrix is pickled to (overwritten).
        vector_path: path of the text file containing the word vectors.
        dim_size: number of columns of the matrix.

    Returns:
        The ``(len(vocab), dim_size)`` float32 numpy array that was written
        to ``w2v_path``.

    Raises:
        ValueError: if a non-blank line holds a word with no vector part.
    """
    w2v_dict = {}
    # Stream the file instead of readlines(): embedding files can be large,
    # and the context manager guarantees the handle is closed.
    with open(vector_path, 'r') as vector_file:
        for line in vector_file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue
            word, vec = line.split(' ', 1)
            w2v_dict[word] = np.loadtxt([vec], dtype='float32')

    vocab_list = vocab_processor._reverse_mapping
    w2v_W = np.zeros(shape=(len(vocab_list), dim_size), dtype='float32')
    for i, vocab in enumerate(vocab_list):
        if i == 0:
            # Index 0 is the unknown-vocab slot; its row stays all zeros.
            continue
        if vocab in w2v_dict:
            w2v_W[i] = w2v_dict[vocab]
        else:
            # Only called for missing words, exactly as before, so any
            # state the helper consumes is used the same number of times.
            w2v_W[i] = get_unknown_word_vec(dim_size)

    # Close the output handle explicitly so the pickle is fully flushed.
    with open(w2v_path, 'wb') as out_file:
        cPickle.dump(w2v_W, out_file)
    return w2v_W
# (Web-page residue, not part of the program: "Comment list" / "Table of contents")