def load_word2vec_matrix(vec_file, word_index, config):
    """Build an embedding matrix from a binary word2vec file.

    Rows are aligned with the integer ids in ``word_index``; words missing
    from the word2vec vocabulary are left as zero vectors.  The finished
    matrix is cached on disk at ``DirConfig.W2V_CACHE`` and reloaded from
    there on subsequent calls.

    Args:
        vec_file: Path to a binary word2vec file readable by gensim.
        word_index: Dict mapping word -> integer id (1-based, e.g. as
            produced by a Keras tokenizer -- TODO confirm against caller).
        config: Object exposing ``MAX_NB_WORDS`` and ``WORD_EMBEDDING_DIM``.

    Returns:
        np.ndarray of shape (nb_words, config.WORD_EMBEDDING_DIM).
    """
    # Fast path: reuse a previously built matrix from the on-disk cache.
    if os.path.isfile(DirConfig.W2V_CACHE):
        print('---- Load word vectors from cache.')
        # 'with' ensures the handle is closed (original leaked it).
        with open(DirConfig.W2V_CACHE, 'rb') as f:
            return np.load(f)

    print('---- loading word2vec ...')
    word2vec = KeyedVectors.load_word2vec_format(
        vec_file, binary=True)
    print('Found %s word vectors of word2vec' % len(word2vec.vocab))

    # +1 because the tokenizer ids are assumed 1-based; row 0 stays all-zero
    # (padding row) -- TODO confirm indexing convention with the caller.
    nb_words = min(config.MAX_NB_WORDS, len(word_index)) + 1
    embedding_matrix = np.zeros((nb_words, config.WORD_EMBEDDING_DIM))

    not_found_words = []
    for word, i in word_index.items():
        # Guard: word_index may hold more words than MAX_NB_WORDS; without
        # this check embedding_matrix[i] raises IndexError for i >= nb_words.
        if i >= nb_words:
            continue
        if word in word2vec.vocab:
            embedding_matrix[i] = word2vec.word_vec(word)
        else:
            # Track in-matrix words that have no pretrained vector
            # (the original collected these in a second pass, then
            # silently discarded them).
            not_found_words.append(word)

    print('Null word embeddings: %d' %
          np.sum(np.sum(embedding_matrix, axis=1) == 0))
    print('Words without a pretrained vector: %d' % len(not_found_words))

    # Persist the matrix so the expensive word2vec load is skipped next time.
    with open(DirConfig.W2V_CACHE, 'wb') as f:
        np.save(f, embedding_matrix)
    return embedding_matrix
# NOTE(review): removed stray webpage footer text pasted from a blog
# ("评论列表" / "文章目录" — "comment list" / "article contents"); as bare
# module-level text it was a SyntaxError.