def load_word2vec_matrix(vec_file, word_index, config):
    """Build an embedding matrix from a binary word2vec file.

    Rows are aligned with the integer ids in ``word_index``; words missing
    from the word2vec vocabulary are left as zero vectors.  The finished
    matrix is cached on disk at ``DirConfig.W2V_CACHE`` and reloaded from
    there on subsequent calls.

    Args:
        vec_file: Path to a binary word2vec file readable by gensim.
        word_index: Dict mapping word -> integer id (1-based, e.g. as
            produced by a Keras tokenizer -- TODO confirm against caller).
        config: Object exposing ``MAX_NB_WORDS`` and ``WORD_EMBEDDING_DIM``.

    Returns:
        np.ndarray of shape (nb_words, config.WORD_EMBEDDING_DIM).
    """
    # Fast path: reuse a previously built matrix from the on-disk cache.
    if os.path.isfile(DirConfig.W2V_CACHE):
        print('---- Load word vectors from cache.')
        # 'with' ensures the handle is closed (original leaked it).
        with open(DirConfig.W2V_CACHE, 'rb') as f:
            return np.load(f)

    print('---- loading word2vec ...')
    word2vec = KeyedVectors.load_word2vec_format(
        vec_file, binary=True)
    print('Found %s word vectors of word2vec' % len(word2vec.vocab))

    # +1 because the tokenizer ids are assumed 1-based; row 0 stays all-zero
    # (padding row) -- TODO confirm indexing convention with the caller.
    nb_words = min(config.MAX_NB_WORDS, len(word_index)) + 1
    embedding_matrix = np.zeros((nb_words, config.WORD_EMBEDDING_DIM))

    not_found_words = []
    for word, i in word_index.items():
        # Guard: word_index may hold more words than MAX_NB_WORDS; without
        # this check embedding_matrix[i] raises IndexError for i >= nb_words.
        if i >= nb_words:
            continue
        if word in word2vec.vocab:
            embedding_matrix[i] = word2vec.word_vec(word)
        else:
            # Track in-matrix words that have no pretrained vector
            # (the original collected these in a second pass, then
            # silently discarded them).
            not_found_words.append(word)

    print('Null word embeddings: %d' %
          np.sum(np.sum(embedding_matrix, axis=1) == 0))
    print('Words without a pretrained vector: %d' % len(not_found_words))

    # Persist the matrix so the expensive word2vec load is skipped next time.
    with open(DirConfig.W2V_CACHE, 'wb') as f:
        np.save(f, embedding_matrix)
    return embedding_matrix
# NOTE(review): removed stray webpage footer text pasted from a blog
# ("评论列表" / "文章目录" — "comment list" / "article contents"); as bare
# module-level text it was a SyntaxError.