def create_variables(self):
  with tf.name_scope("model_inputs"):
    # raw state representation
    self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")

  # rollout action based on current policy
  with tf.name_scope("predict_actions"):
    # initialize policy network
    with tf.variable_scope("policy_network"):
      self.policy_outputs = self.policy_network(self.states)

    # predict actions from policy network
    self.action_scores = tf.identity(self.policy_outputs, name="action_scores")
    # Note 1: tf.multinomial is not good enough to use yet
    # so we don't use self.predicted_actions for now
    self.predicted_actions = tf.multinomial(self.action_scores, 1)
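    # (tf.multinomial treats action_scores as unnormalized log-probabilities,
    # i.e. it samples from the same softmax distribution the policy defines)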
  # regularization loss
  policy_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy_network")

  # compute loss and gradients
  with tf.name_scope("compute_pg_gradients"):
    # gradients for selecting action from policy network
    self.taken_actions = tf.placeholder(tf.int32, (None,), name="taken_actions")
    self.discounted_rewards = tf.placeholder(tf.float32, (None,), name="discounted_rewards")
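    # taken_actions: indices of the actions actually executed during the rollout;
    # discounted_rewards: the corresponding returns, used below to scale the gradients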
    with tf.variable_scope("policy_network", reuse=True):
      self.logprobs = self.policy_network(self.states)

    # compute policy loss and regularization loss
    self.cross_entropy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logprobs, labels=self.taken_actions)
    self.pg_loss = tf.reduce_mean(self.cross_entropy_loss)
    self.reg_loss = tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in policy_network_variables])
    self.loss = self.pg_loss + self.reg_param * self.reg_loss

    # compute gradients
    self.gradients = self.optimizer.compute_gradients(self.loss)

    # compute policy gradients
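    # the gradient of the cross-entropy above is -grad(log pi(a|s)); scaling it by the
    # discounted return R before the descent update below yields the REINFORCE
    # ascent step theta <- theta + lr * R * grad(log pi(a|s)).  Note the elementwise
    # multiply also scales the regularizer's gradient, and broadcasts cleanly when
    # discounted_rewards holds a single value per update (one transition at a time).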
    for i, (grad, var) in enumerate(self.gradients):
      if grad is not None:
        self.gradients[i] = (grad * self.discounted_rewards, var)

    for grad, var in self.gradients:
      tf.summary.histogram(var.name, var)
      if grad is not None:
        tf.summary.histogram(var.name + '/gradients', grad)

    # emit summaries
    tf.summary.scalar("policy_loss", self.pg_loss)
    tf.summary.scalar("reg_loss", self.reg_loss)
    tf.summary.scalar("total_loss", self.loss)

  # training update
  with tf.name_scope("train_policy_network"):
    # apply gradients to update policy network
    self.train_op = self.optimizer.apply_gradients(self.gradients)

  self.summarize = tf.summary.merge_all()
  self.no_op = tf.no_op()
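For reference, the sketch below (not part of the original code) shows one way a surrounding agent class might supply everything create_variables assumes — self.state_dim, self.reg_param, self.optimizer and self.policy_network — and how a single REINFORCE update would be fed. The class name, attribute names and hyperparameters are illustrative only.

# Minimal wrapper agent (illustrative); it reuses the module-level
# create_variables above as a class method and provides the attributes
# that function assumes.
import numpy as np
import tensorflow as tf

class PGReinforceAgent(object):
  def __init__(self, session, state_dim, num_actions, reg_param=0.001):
    self.session = session
    self.state_dim = state_dim
    self.num_actions = num_actions
    self.reg_param = reg_param
    self.optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-3)
    self.create_variables()
    self.session.run(tf.global_variables_initializer())

  # reuse the function defined above as the graph-building method
  create_variables = create_variables

  def policy_network(self, states):
    # single linear layer producing action logits; tf.get_variable lets the
    # second call (under reuse=True) share the same weights
    W = tf.get_variable("W", [self.state_dim, self.num_actions],
                        initializer=tf.random_normal_initializer(stddev=0.1))
    b = tf.get_variable("b", [self.num_actions],
                        initializer=tf.zeros_initializer())
    return tf.matmul(states, W) + b

  def sample_action(self, state):
    # evaluate logits and sample from the softmax distribution in numpy
    scores = self.session.run(self.action_scores,
                              feed_dict={self.states: state[np.newaxis, :]})[0]
    probs = np.exp(scores - scores.max())
    probs /= probs.sum()
    return int(np.random.choice(self.num_actions, p=probs))

  def update(self, state, action, discounted_return):
    # one REINFORCE step for a single (state, action, return) sample
    self.session.run(self.train_op, feed_dict={
        self.states: state[np.newaxis, :],
        self.taken_actions: np.array([action]),
        self.discounted_rewards: np.array([discounted_return], dtype=np.float32),
    })

A training loop would then call sample_action at every step, compute the discounted returns once the episode ends, and call update for each visited (state, action, return) triple.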