def __init__(self, sess, ob_dim, ac_dim):
    super().__init__(sess, ob_dim, ac_dim)
    # Placeholders for our inputs.
    self.ob_no = tf.placeholder(shape=[None, ob_dim], name="obs", dtype=tf.float32)
    self.ac_n = tf.placeholder(shape=[None], name="act", dtype=tf.int32)
    self.adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
    self.oldlogits_na = tf.placeholder(shape=[None, ac_dim], name='oldlogits', dtype=tf.float32)
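    # Shape-suffix convention: n = batch size, o = observation dimension,
    # a = number of actions; e.g. ob_no is (n, ob_dim) and logits_na is (n, ac_dim).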
    # Form the policy network and the log probabilities.
    self.hidden1 = layers.fully_connected(self.ob_no,
        num_outputs=50,
        weights_initializer=layers.xavier_initializer(uniform=True),
        activation_fn=tf.nn.tanh)
    self.logits_na = layers.fully_connected(self.hidden1,
        num_outputs=ac_dim,
        weights_initializer=layers.xavier_initializer(uniform=True),
        activation_fn=None)
    self.logp_na = tf.nn.log_softmax(self.logits_na)
    # Log probabilities of the actions in the minibatch, plus sampled action.
    self.nbatch = tf.shape(self.ob_no)[0]
    self.logprob_n = utils.fancy_slice_2d(self.logp_na, tf.range(self.nbatch), self.ac_n)
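    # fancy_slice_2d(x, i0, i1) presumably returns x[i0[k], i1[k]] for each k,
    # i.e. the log probability of the action actually taken in each state;
    # equivalent to tf.gather_nd(x, tf.stack([i0, i1], axis=1)).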
    self.sampled_ac = utils.categorical_sample_logits(self.logits_na)[0]
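    # categorical_sample_logits presumably draws one action per row of logits
    # (e.g. via tf.multinomial or the Gumbel-max trick); the [0] extracts the
    # single action when sampling for one observation at a time.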
    # Policy gradient loss function and training step.
    self.surr_loss = - tf.reduce_mean(self.logprob_n * self.adv_n)
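    # This is the score-function (REINFORCE) estimator: minimizing
    # -E[log pi(a|s) * A(s,a)] ascends the policy gradient
    # E[grad log pi(a|s) * A(s,a)].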
    self.stepsize = tf.placeholder(shape=[], dtype=tf.float32)
    self.update_op = tf.train.AdamOptimizer(self.stepsize).minimize(self.surr_loss)
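    # Feeding the learning rate as a placeholder lets the caller adapt it
    # between updates, presumably based on the KL diagnostic below.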
    # KL divergence and entropy diagnostics, computed as averages of the
    # per-state KL/entropy over the minibatch.
    self.oldlogp_na = tf.nn.log_softmax(self.oldlogits_na)
    self.oldp_na = tf.exp(self.oldlogp_na)
    self.p_na = tf.exp(self.logp_na)
    self.kl_n = tf.reduce_sum(self.oldp_na * (self.oldlogp_na - self.logp_na), axis=1)
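    # Per-state KL(old || new) = sum_a p_old(a) * (log p_old(a) - log p_new(a)).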
    # KL divergence is mathematically non-negative for valid distributions,
    # but floating-point error in log_softmax and the summation can push it
    # slightly below zero, hence the small tolerance here.
    self.assert_op = tf.Assert(tf.reduce_all(self.kl_n >= -1e-4), [self.kl_n])
    with tf.control_dependencies([self.assert_op]):
        self.kl_n = tf.identity(self.kl_n)
    self.kl = tf.reduce_mean(self.kl_n)
    self.ent = tf.reduce_mean(tf.reduce_sum(-self.p_na * self.logp_na, axis=1))
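
A minimal usage sketch (not from the original code): assuming the class above is instantiated as `agent`, and `obs`, `acts`, `advs` are hypothetical NumPy arrays of observations, actions, and advantages collected from rollouts, one update plus its diagnostics might look like:

    # Logits under the pre-update policy, saved for the KL diagnostic.
    old_logits = agent.sess.run(agent.logits_na, feed_dict={agent.ob_no: obs})
    # One Adam step on the surrogate loss.
    agent.sess.run(agent.update_op, feed_dict={
        agent.ob_no: obs, agent.ac_n: acts,
        agent.adv_n: advs, agent.stepsize: 1e-3})
    # Average KL(old || new) and entropy over the minibatch states.
    kl, ent = agent.sess.run([agent.kl, agent.ent], feed_dict={
        agent.ob_no: obs, agent.oldlogits_na: old_logits})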