def setup_models(self, hidden_layer_size, summary_file):
    """Build the DQN graph: core/target Q-networks, TD loss, train op, summaries.

    Args:
        hidden_layer_size: hidden-layer width forwarded to build_model.
        summary_file: path/directory handed to tf.train.SummaryWriter.

    Side effects:
        Creates self.sess and runs global variable initialization; attaches
        placeholders, ops and the summary writer to self.
    """
    # set up the separate core and target networks
    self.core_state, self.core_q_values = build_model("core", self.state_size, self.num_actions, hidden_layer_size)
    self.target_state, self.target_q_values = build_model("target", self.state_size, self.num_actions, hidden_layer_size)
    # build the global copy op that will copy core network onto target
    self.clobber_target_net_op = copy_all_vars(from_namespace="core", to_namespace="target",
                                               affine_coefficient=self.target_network_update_coeff)
    # left hand side of the bellman update; Q(s1, a)
    self.core_action_mask = tf.placeholder(dtype=tf.float32, shape=[None, self.num_actions],
                                           name="core_action_mask")
    # BUG FIX: reduce over the action axis only (reduction_indices=1) so this is a
    # per-example Q(s, a) of shape [batch]. The original reduced over ALL axes,
    # which silently summed Q-values across the batch whenever batch size > 1.
    # (Unchanged behavior for batch size 1 after broadcasting.)
    self.core_q_value_for_action = tf.reduce_sum(self.core_q_values * self.core_action_mask,
                                                 reduction_indices=1)
    # right hand side of bellman update; reward + discount * max_a' Q(s2, a')
    self.reward = tf.placeholder(dtype=tf.float32, name="reward")
    self.discount_p = tf.placeholder(dtype=tf.float32, name="discount")
    # BUG FIX: per-example max over the action axis (the original took the max over
    # the whole batch, giving every example the same bootstrap target).
    # stop_gradient keeps the target network out of the backward pass.
    self.max_target_q_value_plus_reward = self.reward + (
        self.discount_p * tf.stop_gradient(tf.reduce_max(self.target_q_values,
                                                         reduction_indices=1)))
    # for loss just use squared TD error, averaged over the batch
    self.temporal_difference_loss = tf.reduce_mean(
        tf.square(self.max_target_q_value_plus_reward - self.core_q_value_for_action))
    self.learning_rate_p = tf.placeholder(dtype=tf.float32, name="learning_rate")
    optimizer = tf.train.GradientDescentOptimizer(self.learning_rate_p)
    # clip each gradient by norm; record histograms for vars and their gradients
    gradients = optimizer.compute_gradients(self.temporal_difference_loss)
    for i, (gradient, variable) in enumerate(gradients):
        if gradient is None:  # e.g. variables behind stop_gradient
            continue
        gradients[i] = (tf.clip_by_norm(gradient, self.gradient_clip), variable)
        tf.histogram_summary(variable.name, variable)
        tf.histogram_summary(variable.name + '/gradients', gradient)
    tf.scalar_summary("temporal_difference_loss", self.temporal_difference_loss)
    self.train_op = optimizer.apply_gradients(gradients)
    # build session, initialize variables, and wire up summaries
    self.sess = tf.Session()
    self.sess.run(tf.initialize_all_variables())
    self.summaries = tf.merge_all_summaries()
    self.summary_writer = tf.train.SummaryWriter(summary_file, self.sess.graph_def)
# NOTE: stray scraped-page artifacts, not program text —
# "评论列表" = "comment list", "文章目录" = "article table of contents".