import numpy as np
from gensim.models import KeyedVectors


def process_word2vec(word2vec_dir, vocab, save_path, random_init=True):
    # read pre-trained word embeddings from the binary file
    print('Loading google word2vec...')
    word2vec_path = word2vec_dir + '/GoogleNews-vectors-negative300.bin.gz'
    word_vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    print('Word2vec loaded!')
    # initialize the embedding matrix: uniform noise for OOV tokens, or all zeros
    if random_init:
        word2vec = np.random.uniform(-0.25, 0.25, (len(vocab), 300))
    else:
        word2vec = np.zeros((len(vocab), 300))
    found = 0
    for idx, token in enumerate(vocab):
        try:
            vec = word_vectors[token]
        except KeyError:
            # token is not in the pre-trained vocabulary; keep its initial vector
            pass
        else:
            word2vec[idx, :] = vec
            found += 1
    del word_vectors  # free the full embedding table (~3.5 GB in memory)
    print("{}/{} of word vocab have corresponding vectors in {}".format(found, len(vocab), word2vec_path))
    np.savez_compressed(save_path, word2vec=word2vec)
    print("saved trimmed word2vec matrix at: {}".format(save_path))
# construct embedding vectors according to the GloVe word vectors and vocabulary