import numpy as np
from gensim.models import KeyedVectors


def process_word2vec(word2vec_dir, vocab, save_path, random_init=True):
    # read pre-trained word embeddings from the binary file
    print('Loading google word2vec...')
    word2vec_path = word2vec_dir + '/GoogleNews-vectors-negative300.bin.gz'
    word_vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    print('Word2vec loaded!')
    # initialize the embedding matrix: uniform noise for OOV tokens, or all zeros
    if random_init:
        word2vec = np.random.uniform(-0.25, 0.25, (len(vocab), 300))
    else:
        word2vec = np.zeros((len(vocab), 300))
    found = 0
    for idx, token in enumerate(vocab):
        try:
            vec = word_vectors[token]
        except KeyError:
            # token is not in the pre-trained vocabulary; keep its initial vector
            pass
        else:
            word2vec[idx, :] = vec
            found += 1
    del word_vectors  # free the full embedding table (~3.5 GB in memory)
    print("{}/{} of word vocab have corresponding vectors in {}".format(found, len(vocab), word2vec_path))
    np.savez_compressed(save_path, word2vec=word2vec)
    print("saved trimmed word2vec matrix at: {}".format(save_path))
# construct embedding vectors according to the GloVe word vectors and vocabulary