import tensorflow as tf


def __init__(self, batch_size, vocab_size, encoding_size, embedding_size,
             num_glimpses=8,
             grad_norm_clip=5.,
             l2_reg_coef=1e-4,
             session=tf.Session(),
             name='AlternatingAttention'):
"""
Creates an iterative alternating attention network as described in https://arxiv.org/abs/1606.02245
"""
    self._batch_size = batch_size
    self._vocab_size = vocab_size
    self._encode_size = encoding_size
    self._infer_size = 4 * encoding_size
    self._embedding_size = embedding_size
    self._num_glimpses = num_glimpses
    self._sess = session
    self._name = name

    self._build_placeholders()
    self._build_variables()
    # L2 regularization on the embedding matrix; the returned penalty is
    # added to the training objective below.
    reg_loss = tf.contrib.layers.apply_regularization(
        tf.contrib.layers.l2_regularizer(l2_reg_coef), [self._embeddings])
    # Answer probability: P(a | d, q) is the total attention mass assigned to
    # every document position whose token id equals the answer id
    # (pointer-sum attention).
    doc_attentions = self._inference(self._docs, self._queries)
    nans = tf.reduce_sum(tf.to_float(tf.is_nan(doc_attentions)))  # diagnostic: count of NaN attention values
    self._doc_attentions = doc_attentions
    ans_mask = tf.to_float(tf.equal(tf.expand_dims(self._answers, -1), self._docs))
    P_a = tf.reduce_sum(ans_mask * doc_attentions, 1)
    # Negative log-likelihood of the correct answer, with a small epsilon for
    # numerical stability, plus the L2 penalty on the embeddings.
    loss_op = -tf.reduce_mean(tf.log(P_a + tf.constant(1e-5))) + reg_loss
    self._loss_op = loss_op
    # Optimizer and gradients: Adam with per-gradient norm clipping
    with tf.name_scope("optimizer"):
        self._opt = tf.train.AdamOptimizer(learning_rate=self._learning_rate)
        grads_and_vars = self._opt.compute_gradients(loss_op)
        capped_grads_and_vars = [(tf.clip_by_norm(g, grad_norm_clip), v)
                                 for g, v in grads_and_vars if g is not None]  # skip variables without gradients
        self._train_op = self._opt.apply_gradients(capped_grads_and_vars,
                                                   global_step=self._global_step)

    # Summaries for TensorBoard
    tf.summary.scalar('loss', self._loss_op)
    tf.summary.scalar('learning_rate', self._learning_rate)
    tf.summary.histogram('answer_probability', P_a)
    self._summary_op = tf.summary.merge_all()

    self._sess.run(tf.global_variables_initializer())
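
# A minimal usage sketch, not part of the original listing: it assumes the
# enclosing class is named AlternatingAttention and that _build_placeholders()
# creates int32 placeholders _docs [batch, doc_len], _queries [batch, query_len],
# _answers [batch] and a float _learning_rate tensor. The shapes, vocabulary
# size, and the learning-rate feed below are illustrative assumptions only.
if __name__ == '__main__':
    import numpy as np

    model = AlternatingAttention(batch_size=32, vocab_size=50000,
                                 encoding_size=128, embedding_size=100)
    feed = {
        model._docs: np.random.randint(0, 50000, size=(32, 400)),    # document token ids
        model._queries: np.random.randint(0, 50000, size=(32, 30)),  # query token ids
        model._answers: np.random.randint(0, 50000, size=(32,)),     # answer token ids
        model._learning_rate: 1e-3,
    }
    # One training step: run the train op and fetch the batch loss.
    _, loss = model._sess.run([model._train_op, model._loss_op], feed_dict=feed)
    print('batch loss:', loss)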