policies.py source code

python

Project: rl_algorithms    Author: DanielTakeshi
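Note: this is an excerpt of a class's __init__ method (the class and its base class are not shown); the full module presumably imports numpy as np, tensorflow as tf, tensorflow.contrib.layers as layers, and a local utils module providing gauss_log_prob and gauss_KL.
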
def __init__(self, sess, ob_dim, ac_dim):
        super().__init__(sess, ob_dim, ac_dim)

        # Placeholders for our inputs. Note that actions are floats.
        self.ob_no = tf.placeholder(shape=[None, ob_dim], name="obs", dtype=tf.float32)
        self.ac_na = tf.placeholder(shape=[None, ac_dim], name="act", dtype=tf.float32)
        self.adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
        self.n     = tf.shape(self.ob_no)[0]
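
        # Shape-suffix convention: "_no" = (batch n, obs dim o), "_na" = (n, action
        # dim a), "_n" = (n,), and "_a" = (a,).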

        # Specific to the continuous case: the log std vector, a trainable parameter.
        # Also, make batch versions so we get shape (n,a) (or (1,a)), not (a,).
        self.logstd_a     = tf.get_variable("logstd", [ac_dim], initializer=tf.zeros_initializer())
        self.oldlogstd_a  = tf.placeholder(name="oldlogstd", shape=[ac_dim], dtype=tf.float32)
        self.logstd_na    = tf.ones(shape=(self.n,ac_dim), dtype=tf.float32) * self.logstd_a
        self.oldlogstd_na = tf.ones(shape=(self.n,ac_dim), dtype=tf.float32) * self.oldlogstd_a
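        # Multiplying a (n,a) tensor of ones by the (a,) log std broadcasts one log
        # std row per example, matching the shape of the Gaussian mean defined below.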

        # The policy network; its output ("logits") is the mean of a Gaussian.
        # Then don't forget to make an "old" version of that for KL divergences.
        self.hidden1 = layers.fully_connected(self.ob_no, 
                num_outputs=32,
                weights_initializer=layers.xavier_initializer(uniform=True),
                activation_fn=tf.nn.relu)
        self.hidden2 = layers.fully_connected(self.hidden1, 
                num_outputs=32,
                weights_initializer=layers.xavier_initializer(uniform=True),
                activation_fn=tf.nn.relu)
        self.mean_na = layers.fully_connected(self.hidden2, 
                num_outputs=ac_dim,
                weights_initializer=layers.xavier_initializer(uniform=True),
                activation_fn=None)
        self.oldmean_na = tf.placeholder(shape=[None, ac_dim], name='oldmean', dtype=tf.float32)

        # Diagonal Gaussian distribution for sampling actions and log probabilities.
        self.logprob_n  = utils.gauss_log_prob(mu=self.mean_na, logstd=self.logstd_na, x=self.ac_na)
        self.sampled_ac = (tf.random_normal(tf.shape(self.mean_na)) * tf.exp(self.logstd_na) + self.mean_na)[0]
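        # The sample is mean + exp(logstd) * eps with eps ~ N(0, I); indexing with
        # [0] returns a single action when a single observation is fed in.
        # utils.gauss_log_prob presumably returns the diagonal-Gaussian log density
        #   log p(x) = -0.5 * sum_i[ (x_i - mu_i)^2 / exp(2*logstd_i)
        #                            + 2*logstd_i + log(2*pi) ].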

        # Loss function that we'll differentiate to get the policy gradient.
        self.surr_loss = -tf.reduce_mean(self.logprob_n * self.adv_n)
        self.stepsize  = tf.placeholder(shape=[], dtype=tf.float32) 
        self.update_op = tf.train.AdamOptimizer(self.stepsize).minimize(self.surr_loss)
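        # Minimizing -mean(logprob * adv) with Adam ascends E[log pi(a|s) * A], whose
        # gradient (with the advantages held fixed) is the score-function (REINFORCE)
        # policy-gradient estimate.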

        # KL divergence and entropy among Gaussian(s).
        self.kl  = tf.reduce_mean(utils.gauss_KL(self.mean_na, self.logstd_na, self.oldmean_na, self.oldlogstd_na))
        # Differential entropy of a diagonal Gaussian: 0.5*d*log(2*pi*e) + sum(logstd).
        self.ent = 0.5 * ac_dim * tf.log(2.*np.pi*np.e) + tf.reduce_sum(self.logstd_a)
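
The helpers utils.gauss_log_prob and utils.gauss_KL are not shown on this page. Below is a minimal NumPy sketch of what they presumably compute for diagonal Gaussians; the names and argument order mirror the calls above, but the actual utils module may differ.

import numpy as np

def gauss_log_prob(mu, logstd, x):
    """Log density of x under N(mu, diag(exp(2*logstd))), summed over the last axis."""
    var = np.exp(2.0 * logstd)
    return -0.5 * np.sum((x - mu) ** 2 / var + 2.0 * logstd + np.log(2.0 * np.pi), axis=-1)

def gauss_KL(mu1, logstd1, mu2, logstd2):
    """KL( N(mu1, diag(exp(2*logstd1))) || N(mu2, diag(exp(2*logstd2))) ), per example."""
    var1, var2 = np.exp(2.0 * logstd1), np.exp(2.0 * logstd2)
    return np.sum(logstd2 - logstd1 + (var1 + (mu1 - mu2) ** 2) / (2.0 * var2) - 0.5, axis=-1)

As a sanity check, gauss_KL(m, s, m, s) returns 0 for any mean m and log std s.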