# Requires: os, numpy as np, tensorflow as tf (1.x), tqdm
def loadEmbedding(self, sess):
    """ Initialize embeddings with pre-trained word2vec vectors.
    Modifies the embedding weights of the currently loaded model.
    Uses the GoogleNews pre-trained vectors (path hardcoded).
    """
    # Fetch the embedding variables from the model
    with tf.variable_scope("embedding_rnn_seq2seq/rnn/embedding_wrapper", reuse=True):
        em_in = tf.get_variable("embedding")
    with tf.variable_scope("embedding_rnn_seq2seq/embedding_rnn_decoder", reuse=True):
        em_out = tf.get_variable("embedding")

    # Disable training for the embeddings (remove them from the trainable collection)
    variables = tf.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)
    variables.remove(em_in)
    variables.remove(em_out)

    # If restoring a model, the embeddings were already saved: we can leave here
    if self.globStep != 0:
        return

    # New model: load the pre-trained word2vec data and initialize the embeddings
    with open(os.path.join(self.args.rootDir, 'data/word2vec/GoogleNews-vectors-negative300.bin'), "rb", 0) as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * vector_size  # Size of one vector, in bytes
        # Words absent from word2vec keep a small random initialization
        initW = np.random.uniform(-0.25, 0.25, (len(self.textData.word2id), vector_size))
        for _ in tqdm(range(vocab_size)):
            # Read one word (bytes up to the separating space)
            word = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    word = b''.join(word).decode('utf-8')
                    break
                if ch != b'\n':
                    word.append(ch)
            if word in self.textData.word2id:
                initW[self.textData.word2id[word]] = np.frombuffer(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)  # Skip the vector of out-of-vocabulary words
    # PCA-style decomposition (truncated SVD) to reduce the word2vec dimensionality
    if self.args.embeddingSize < vector_size:
        U, s, Vt = np.linalg.svd(initW, full_matrices=False)
        S = np.diag(s)  # (vector_size, vector_size); singular values are real, no need for a complex matrix
        # Keep only the first embeddingSize components: initW becomes (vocabSize, embeddingSize)
        initW = np.dot(U[:, :self.args.embeddingSize], S[:self.args.embeddingSize, :self.args.embeddingSize])

    # Initialize the input and output embeddings with the (possibly reduced) word2vec matrix
    sess.run(em_in.assign(initW))
    sess.run(em_out.assign(initW))
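
# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original project): the truncated-SVD step
# above keeps only the first `embeddingSize` singular directions of the 300-d
# word2vec matrix. The standalone snippet below reproduces that reduction on a
# random matrix so the resulting shapes are easy to verify; the names `vocab`,
# `dim` and `target_dim` are placeholders chosen for this example.
import numpy as np

vocab, dim, target_dim = 1000, 300, 64  # hypothetical sizes
W = np.random.uniform(-0.25, 0.25, (vocab, dim))

# Reduced SVD: W = U @ diag(s) @ Vt, with U (vocab, dim), s (dim,), Vt (dim, dim)
U, s, Vt = np.linalg.svd(W, full_matrices=False)

# Project onto the leading target_dim directions: equivalent to U_k @ diag(s_k)
W_reduced = U[:, :target_dim] * s[:target_dim]

assert W_reduced.shape == (vocab, target_dim)

# In the model itself, loadEmbedding would presumably be called once, right
# after the variables are created and initialized (e.g. after running
# tf.global_variables_initializer()), so that the assign ops overwrite the
# random embedding initialization before training starts.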