def model(self, input_label, output_word, batch_size, vocabulary_size=VOCABULARY_SIZE,
          embedding_size=EMBEDDINGS_SIZE, num_negative_samples=W2V_NEGATIVE_NUM_SAMPLES,
          learning_rate_initial=W2V_LEARNING_RATE_INITIAL,
          learning_rate_decay=W2V_LEARNING_RATE_DECAY,
          learning_rate_decay_steps=W2V_LEARNING_RATE_DECAY_STEPS):
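    """Build the word2vec training graph: the embeddings matrix, the NCE loss over
    negative samples, an SGD optimizer with exponential learning-rate decay,
    TensorBoard summaries and the embedding-projector metadata."""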
    self.global_step = training_util.get_or_create_global_step()
    # inputs/outputs
    input_label_reshaped = tf.reshape(input_label, [batch_size])
    output_word_reshaped = tf.reshape(output_word, [batch_size, 1])
    # embeddings
    matrix_dimension = [vocabulary_size, embedding_size]
    self.embeddings = tf.get_variable(shape=matrix_dimension,
                                      initializer=layers.xavier_initializer(),
                                      dtype=tf.float32, name='embeddings')
    embed = tf.nn.embedding_lookup(self.embeddings, input_label_reshaped)
    # NCE loss: approximate the full softmax over the vocabulary by contrasting each
    # true output word against num_negative_samples randomly sampled noise words
    stddev = 1.0 / math.sqrt(embedding_size)
    nce_weights = tf.Variable(tf.truncated_normal(matrix_dimension, stddev=stddev))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    nce_loss = tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                              labels=output_word_reshaped, inputs=embed,
                              num_sampled=num_negative_samples,
                              num_classes=vocabulary_size)
    self.loss = tf.reduce_mean(nce_loss)
    tf.summary.scalar('loss', self.loss)
# learning rate & optimizer
self.learning_rate = tf.train.exponential_decay(learning_rate_initial, self.global_step,
learning_rate_decay_steps,
learning_rate_decay, staircase=True,
name='learning_rate')
tf.summary.scalar('learning_rate', self.learning_rate)
sgd = tf.train.GradientDescentOptimizer(self.learning_rate)
self.optimizer = sgd.minimize(self.loss, global_step=self.global_step)
    # saver to save the model
    self.saver = tf.train.Saver()
    # wrap the loss so that fetching it raises an error if it ever becomes NaN or Inf
    self.loss = tf.check_numerics(self.loss, 'loss is nan')
    # embedding projector: point TensorBoard at the embeddings tensor and copy the
    # vocabulary metadata (.tsv) next to the event files so the vectors can be labeled
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = self.embeddings.name
    filename_tsv = '{}_{}.tsv'.format('word2vec_dataset', vocabulary_size)
    if not os.path.exists(self.log_dir):
        os.makedirs(self.log_dir)
    shutil.copy(os.path.join(DIR_DATA_WORD2VEC, filename_tsv), self.log_dir)
    embedding.metadata_path = filename_tsv
    summary_writer = tf.summary.FileWriter(self.log_dir)
    projector.visualize_embeddings(summary_writer, config)
    # normalize the embeddings (unit L2 norm per row) before saving them
    norm = tf.sqrt(tf.reduce_sum(tf.square(self.embeddings), 1, keep_dims=True))
    self.normalized_embeddings = self.embeddings / norm
    return None
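
# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original file). It shows one
# way model() could be driven in a TF 1.x training loop. The Word2VecTrainer
# class, the BATCH_SIZE constant and the next_batch() helper are hypothetical
# names introduced here for the example.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    trainer = Word2VecTrainer(log_dir='/tmp/word2vec_logs')    # hypothetical class owning model()
    input_label = tf.placeholder(tf.int64, shape=[BATCH_SIZE], name='input_label')
    output_word = tf.placeholder(tf.int64, shape=[BATCH_SIZE], name='output_word')
    trainer.model(input_label, output_word, batch_size=BATCH_SIZE)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(10000):
            labels, words = next_batch(BATCH_SIZE)              # hypothetical data feed
            _, loss_value = sess.run([trainer.optimizer, trainer.loss],
                                     feed_dict={input_label: labels, output_word: words})
            if step % 1000 == 0:
                print('step {}: loss {:.4f}'.format(step, loss_value))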
Source file: word2vec_train.py (Python)