def __init__(self, sess, ob_dim, ac_dim):
super().__init__(sess, ob_dim, ac_dim)
# Placeholders for our inputs. Note that actions are floats.
self.ob_no = tf.placeholder(shape=[None, ob_dim], name="obs", dtype=tf.float32)
self.ac_na = tf.placeholder(shape=[None, ac_dim], name="act", dtype=tf.float32)
self.adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
self.n = tf.shape(self.ob_no)[0]
# Specific to the continuous case: the log std vector, which is a trainable parameter.
# Also, make batch versions so we get shape (n,a) (or (1,a)), not (a,).
self.logstd_a = tf.get_variable("logstd", [ac_dim], initializer=tf.zeros_initializer())
self.oldlogstd_a = tf.placeholder(name="oldlogstd", shape=[ac_dim], dtype=tf.float32)
self.logstd_na = tf.ones(shape=(self.n,ac_dim), dtype=tf.float32) * self.logstd_a
self.oldlogstd_na = tf.ones(shape=(self.n,ac_dim), dtype=tf.float32) * self.oldlogstd_a
# The policy network; its outputs (the "logits") are the mean of a Gaussian.
# We also need an "old" version of the mean for computing KL divergences.
self.hidden1 = layers.fully_connected(self.ob_no,
num_outputs=32,
weights_initializer=layers.xavier_initializer(uniform=True),
activation_fn=tf.nn.relu)
self.hidden2 = layers.fully_connected(self.hidden1,
num_outputs=32,
weights_initializer=layers.xavier_initializer(uniform=True),
activation_fn=tf.nn.relu)
self.mean_na = layers.fully_connected(self.hidden2,
num_outputs=ac_dim,
weights_initializer=layers.xavier_initializer(uniform=True),
activation_fn=None)
self.oldmean_na = tf.placeholder(shape=[None, ac_dim], name='oldmean', dtype=tf.float32)
# Diagonal Gaussian distribution for sampling actions and log probabilities.
self.logprob_n = utils.gauss_log_prob(mu=self.mean_na, logstd=self.logstd_na, x=self.ac_na)
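# Sample an action via the reparameterization a = mean + exp(logstd) * eps with eps ~ N(0, I);
# indexing with [0] returns a single action (typically one observation is fed in when sampling).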
self.sampled_ac = (tf.random_normal(tf.shape(self.mean_na)) * tf.exp(self.logstd_na) + self.mean_na)[0]
# Loss function that we'll differentiate to get the policy gradient
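# surr_loss = -(1/n) * sum_i [ log pi(a_i | o_i) * adv_i ], so minimizing it
# performs gradient ascent on the usual policy gradient objective.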
self.surr_loss = - tf.reduce_mean(self.logprob_n * self.adv_n)
self.stepsize = tf.placeholder(shape=[], dtype=tf.float32)
self.update_op = tf.train.AdamOptimizer(self.stepsize).minimize(self.surr_loss)
# KL divergence and entropy for the diagonal Gaussian policy.
self.kl = tf.reduce_mean(utils.gauss_KL(self.mean_na, self.logstd_na, self.oldmean_na, self.oldlogstd_na))
# Entropy of a diagonal Gaussian: 0.5 * d * log(2*pi*e) + sum_i logstd_i.
self.ent = 0.5 * ac_dim * tf.log(2.*np.pi*np.e) + tf.reduce_sum(self.logstd_a)
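The utils.gauss_log_prob and utils.gauss_KL helpers are defined elsewhere in the author's code and are not shown here. For reference, the following is a minimal sketch of what such diagonal-Gaussian helpers typically compute, assuming the signatures implied by the calls above; it is not the author's actual utils module, and the argument order for gauss_KL (new distribution first, old second) is inferred from the call site.

import numpy as np
import tensorflow as tf

def gauss_log_prob(mu, logstd, x):
    # log N(x | mu, diag(exp(logstd))^2), summed over the action dimension,
    # so the result has shape (n,).
    var = tf.exp(2.0 * logstd)
    lp = -tf.square(x - mu) / (2.0 * var) - 0.5 * np.log(2.0 * np.pi) - logstd
    return tf.reduce_sum(lp, axis=1)

def gauss_KL(mu1, logstd1, mu2, logstd2):
    # KL( N(mu1, exp(logstd1)^2) || N(mu2, exp(logstd2)^2) ) for diagonal
    # Gaussians, summed over the action dimension, shape (n,).
    var1 = tf.exp(2.0 * logstd1)
    var2 = tf.exp(2.0 * logstd2)
    kl = logstd2 - logstd1 + (var1 + tf.square(mu1 - mu2)) / (2.0 * var2) - 0.5
    return tf.reduce_sum(kl, axis=1)

With these forms, the tf.reduce_mean over the batch in the snippet above yields the mean KL. In a training loop one would feed ob_no, ac_na, adv_n, and a stepsize, run update_op to take a gradient step, and evaluate kl and ent as diagnostics.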