import tensorflow as tf


def __init__(self, batch_size, vocab_size, encoding_size, embedding_size,
             num_glimpses=8,
             grad_norm_clip=5.,
             l2_reg_coef=1e-4,
             session=tf.Session(),
             name='AlternatingAttention'):
"""
Creates an iterative alternating attention network as described in https://arxiv.org/abs/1606.02245
"""
    self._batch_size = batch_size
    self._vocab_size = vocab_size
    self._encode_size = encoding_size
    self._infer_size = 4 * encoding_size
    self._embedding_size = embedding_size
    self._num_glimpses = num_glimpses
    self._sess = session
    self._name = name

    self._build_placeholders()
    self._build_variables()
    # L2 regularization on the embedding matrix; the returned penalty is
    # added to the training objective below.
    reg_loss = tf.contrib.layers.apply_regularization(
        tf.contrib.layers.l2_regularizer(l2_reg_coef), [self._embeddings])
    # Answer probability: P(a | d, q) is the total attention mass assigned to
    # every document position whose token id equals the answer id
    # (pointer-sum attention).
    doc_attentions = self._inference(self._docs, self._queries)
    nans = tf.reduce_sum(tf.to_float(tf.is_nan(doc_attentions)))  # diagnostic: count of NaN attention values
    self._doc_attentions = doc_attentions
    ans_mask = tf.to_float(tf.equal(tf.expand_dims(self._answers, -1), self._docs))
    P_a = tf.reduce_sum(ans_mask * doc_attentions, 1)
    # Negative log-likelihood of the correct answer, with a small epsilon for
    # numerical stability, plus the L2 penalty on the embeddings.
    loss_op = -tf.reduce_mean(tf.log(P_a + tf.constant(1e-5))) + reg_loss
    self._loss_op = loss_op
    # Optimizer and gradients: Adam with per-gradient norm clipping
    with tf.name_scope("optimizer"):
        self._opt = tf.train.AdamOptimizer(learning_rate=self._learning_rate)
        grads_and_vars = self._opt.compute_gradients(loss_op)
        capped_grads_and_vars = [(tf.clip_by_norm(g, grad_norm_clip), v)
                                 for g, v in grads_and_vars if g is not None]  # skip variables without gradients
        self._train_op = self._opt.apply_gradients(capped_grads_and_vars,
                                                   global_step=self._global_step)

    # Summaries for TensorBoard
    tf.summary.scalar('loss', self._loss_op)
    tf.summary.scalar('learning_rate', self._learning_rate)
    tf.summary.histogram('answer_probability', P_a)
    self._summary_op = tf.summary.merge_all()

    self._sess.run(tf.global_variables_initializer())
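
# A minimal usage sketch, not part of the original listing: it assumes the
# enclosing class is named AlternatingAttention and that _build_placeholders()
# creates int32 placeholders _docs [batch, doc_len], _queries [batch, query_len],
# _answers [batch] and a float _learning_rate tensor. The shapes, vocabulary
# size, and the learning-rate feed below are illustrative assumptions only.
if __name__ == '__main__':
    import numpy as np

    model = AlternatingAttention(batch_size=32, vocab_size=50000,
                                 encoding_size=128, embedding_size=100)
    feed = {
        model._docs: np.random.randint(0, 50000, size=(32, 400)),    # document token ids
        model._queries: np.random.randint(0, 50000, size=(32, 30)),  # query token ids
        model._answers: np.random.randint(0, 50000, size=(32,)),     # answer token ids
        model._learning_rate: 1e-3,
    }
    # One training step: run the train op and fetch the batch loss.
    _, loss = model._sess.run([model._train_op, model._loss_op], feed_dict=feed)
    print('batch loss:', loss)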