vocab_expansion.py 文件源码-python代码片段

def _expand_vocabulary(skip_thoughts_emb, skip_thoughts_vocab, word2vec):

    # Find words shared between the two vocabularies.
    print("Finding shared words")
    shared_words = [w for w in word2vec.vocab if w in skip_thoughts_vocab]

    # Select embedding vectors for shared words.
    print("Selecting embeddings for %d shared words" % len(shared_words))
    shared_st_emb = skip_thoughts_emb[[
        skip_thoughts_vocab[w] for w in shared_words]]
    shared_w2v_emb = word2vec[shared_words]

    # Train a linear regression model on the shared embedding vectors.
    print("Training linear regression model")
    model = sklearn.linear_model.LinearRegression()
    model.fit(shared_w2v_emb, shared_st_emb)

    # Create the expanded vocabulary.
    print("Creating embeddings for expanded vocabulary")
    embedding_map = collections.OrderedDict()
    print('Length of word2vec vocabulary: %d\n' % len(word2vec.vocab))
    for i, w in enumerate(word2vec.vocab):
        print('\rEmbedding %d' %(i+1), end='   ')
    # Ignore words with underscores (spaces).
        if "_" not in w:
            w_emb = model.predict(word2vec[w].reshape(1, -1))
            embedding_map[w] = w_emb.reshape(-1)

    for w in skip_thoughts_vocab:
        embedding_map[w] = skip_thoughts_emb[skip_thoughts_vocab[w]]

    print("Created expanded vocabulary of %d words", len(embedding_map))

    expanded_vocab = {}
    expanded_embeddings = np.zeros([len(embedding_map), paras.embedding_size])

    for i, w in enumerate(embedding_map.keys()):
        expanded_vocab[w] = i
        expanded_embeddings[i,:] = embedding_map[w]

    print('Saving expanded vocab and embeddings')
    with open(path + 'expanded_vocab.pkl', 'wb') as f:
        pkl.dump(expanded_vocab, f)

    embeddings_file = os.path.join(path, "expanded_embeddings.npy")
    np.save(embeddings_file, expanded_embeddings)

    return expanded_vocab, expanded_embeddings

# path = '../models/toronto_n5/'