import collections
import os
import pickle as pkl

import numpy as np
import sklearn.linear_model


def _expand_vocabulary(skip_thoughts_emb, skip_thoughts_vocab, word2vec):
    # Find words shared between the two vocabularies.
    print("Finding shared words")
    shared_words = [w for w in word2vec.vocab if w in skip_thoughts_vocab]
    # Select embedding vectors for shared words.
    print("Selecting embeddings for %d shared words" % len(shared_words))
    shared_st_emb = skip_thoughts_emb[[
        skip_thoughts_vocab[w] for w in shared_words]]
    shared_w2v_emb = word2vec[shared_words]
    # Train a linear regression model that maps the word2vec vectors of the
    # shared words onto their skip-thought embeddings (the vocabulary-expansion
    # trick from the skip-thoughts paper).
    print("Training linear regression model")
    model = sklearn.linear_model.LinearRegression()
    model.fit(shared_w2v_emb, shared_st_emb)
    # Create the expanded vocabulary.
    print("Creating embeddings for expanded vocabulary")
    embedding_map = collections.OrderedDict()
    print('Length of word2vec vocabulary: %d\n' % len(word2vec.vocab))
    for i, w in enumerate(word2vec.vocab):
        print('\rEmbedding %d' % (i + 1), end=' ')
        # Ignore words with underscores (spaces).
        if "_" not in w:
            w_emb = model.predict(word2vec[w].reshape(1, -1))
            embedding_map[w] = w_emb.reshape(-1)
    for w in skip_thoughts_vocab:
        embedding_map[w] = skip_thoughts_emb[skip_thoughts_vocab[w]]
print("Created expanded vocabulary of %d words", len(embedding_map))
    expanded_vocab = {}
    expanded_embeddings = np.zeros([len(embedding_map), paras.embedding_size])
    for i, w in enumerate(embedding_map.keys()):
        expanded_vocab[w] = i
        expanded_embeddings[i, :] = embedding_map[w]
    print('Saving expanded vocab and embeddings')
    with open(path + 'expanded_vocab.pkl', 'wb') as f:
        pkl.dump(expanded_vocab, f)
    embeddings_file = os.path.join(path, "expanded_embeddings.npy")
    np.save(embeddings_file, expanded_embeddings)
    return expanded_vocab, expanded_embeddings
# path = '../models/toronto_n5/'
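# A minimal usage sketch of _expand_vocabulary. The file names 'vocab.pkl',
# 'embeddings.npy' and 'GoogleNews-vectors-negative300.bin' are placeholder
# assumptions; substitute your own skip-thought checkpoint files and word2vec
# model. `paras.embedding_size` must also be defined to match the skip-thought
# embedding width, and `word2vec.vocab` (used inside the function) requires
# gensim < 4.0 (in gensim 4.x the attribute is `key_to_index`).
#
# import gensim
#
# path = '../models/toronto_n5/'
# with open(os.path.join(path, 'vocab.pkl'), 'rb') as f:
#     skip_thoughts_vocab = pkl.load(f)      # dict: word -> row index
# skip_thoughts_emb = np.load(os.path.join(path, 'embeddings.npy'))
# word2vec = gensim.models.KeyedVectors.load_word2vec_format(
#     'GoogleNews-vectors-negative300.bin', binary=True)
# expanded_vocab, expanded_embeddings = _expand_vocabulary(
#     skip_thoughts_emb, skip_thoughts_vocab, word2vec)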