def __init__(self, sess, ob_dim, ac_dim):
    super().__init__(sess, ob_dim, ac_dim)
    # Placeholders for our inputs.
    self.ob_no = tf.placeholder(shape=[None, ob_dim], name="obs", dtype=tf.float32)
    self.ac_n = tf.placeholder(shape=[None], name="act", dtype=tf.int32)
    self.adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
    self.oldlogits_na = tf.placeholder(shape=[None, ac_dim], name='oldlogits', dtype=tf.float32)
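    # Shape-suffix convention: n = batch size, o = observation dimension,
    # a = number of actions; e.g. ob_no is (n, ob_dim) and logits_na is (n, ac_dim).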
    # Form the policy network and the log probabilities.
    self.hidden1 = layers.fully_connected(self.ob_no,
        num_outputs=50,
        weights_initializer=layers.xavier_initializer(uniform=True),
        activation_fn=tf.nn.tanh)
    self.logits_na = layers.fully_connected(self.hidden1,
        num_outputs=ac_dim,
        weights_initializer=layers.xavier_initializer(uniform=True),
        activation_fn=None)
    self.logp_na = tf.nn.log_softmax(self.logits_na)
    # Log probabilities of the actions in the minibatch, plus sampled action.
    self.nbatch = tf.shape(self.ob_no)[0]
    self.logprob_n = utils.fancy_slice_2d(self.logp_na, tf.range(self.nbatch), self.ac_n)
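    # fancy_slice_2d(x, i0, i1) presumably returns x[i0[k], i1[k]] for each k,
    # i.e. the log probability of the action actually taken in each state;
    # equivalent to tf.gather_nd(x, tf.stack([i0, i1], axis=1)).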
    self.sampled_ac = utils.categorical_sample_logits(self.logits_na)[0]
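    # categorical_sample_logits presumably draws one action per row of logits
    # (e.g. via tf.multinomial or the Gumbel-max trick); the [0] extracts the
    # single action when sampling for one observation at a time.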
    # Policy gradient loss function and training step.
    self.surr_loss = - tf.reduce_mean(self.logprob_n * self.adv_n)
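    # This is the score-function (REINFORCE) estimator: minimizing
    # -E[log pi(a|s) * A(s,a)] ascends the policy gradient
    # E[grad log pi(a|s) * A(s,a)].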
    self.stepsize = tf.placeholder(shape=[], dtype=tf.float32)
    self.update_op = tf.train.AdamOptimizer(self.stepsize).minimize(self.surr_loss)
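    # Feeding the learning rate as a placeholder lets the caller adapt it
    # between updates, presumably based on the KL diagnostic below.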
    # KL divergence and entropy diagnostics, computed as averages of the
    # per-state KL/entropy over the minibatch.
    self.oldlogp_na = tf.nn.log_softmax(self.oldlogits_na)
    self.oldp_na = tf.exp(self.oldlogp_na)
    self.p_na = tf.exp(self.logp_na)
    self.kl_n = tf.reduce_sum(self.oldp_na * (self.oldlogp_na - self.logp_na), axis=1)
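    # Per-state KL(old || new) = sum_a p_old(a) * (log p_old(a) - log p_new(a)).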
    # KL divergence is mathematically non-negative for valid distributions,
    # but floating-point error in log_softmax and the summation can push it
    # slightly below zero, hence the small tolerance here.
    self.assert_op = tf.Assert(tf.reduce_all(self.kl_n >= -1e-4), [self.kl_n])
    with tf.control_dependencies([self.assert_op]):
        self.kl_n = tf.identity(self.kl_n)
    self.kl = tf.reduce_mean(self.kl_n)
    self.ent = tf.reduce_mean(tf.reduce_sum(-self.p_na * self.logp_na, axis=1))
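
A minimal usage sketch (not from the original code): assuming the class above is instantiated as `agent`, and `obs`, `acts`, `advs` are hypothetical NumPy arrays of observations, actions, and advantages collected from rollouts, one update plus its diagnostics might look like:

    # Logits under the pre-update policy, saved for the KL diagnostic.
    old_logits = agent.sess.run(agent.logits_na, feed_dict={agent.ob_no: obs})
    # One Adam step on the surrogate loss.
    agent.sess.run(agent.update_op, feed_dict={
        agent.ob_no: obs, agent.ac_n: acts,
        agent.adv_n: advs, agent.stepsize: 1e-3})
    # Average KL(old || new) and entropy over the minibatch states.
    kl, ent = agent.sess.run([agent.kl, agent.ent], feed_dict={
        agent.ob_no: obs, agent.oldlogits_na: old_logits})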