def build_graph(self, graph):
    # Assumes TF 1.x (`import tensorflow as tf`) and the project's `capacities` helper module.
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        # Exploration bookkeeping: N0 and min_eps drive the epsilon-greedy schedule.
        self.N0_t = tf.constant(self.N0, tf.float32, name='N_0')
        self.N = tf.Variable(0., dtype=tf.float32, name='N', trainable=False)
        self.min_eps_t = tf.constant(self.min_eps, tf.float32, name='min_eps')

        self.inputs = tf.placeholder(tf.float32, shape=[None, self.q_params['nb_inputs']], name='inputs')

        # Q-value network; the scope is reused later to evaluate the next state with the same weights.
        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.q_values = tf.squeeze(capacities.value_f(self.q_params, self.inputs))

        # Epsilon-greedy action selection over the current Q-values.
        self.action_t = capacities.eps_greedy(
            self.inputs, self.q_values, self.env.action_space.n, self.N0, self.min_eps
        )
        self.q_t = self.q_values[self.action_t]
        with tf.variable_scope('Training'):
            self.reward = tf.placeholder(tf.float32, shape=[], name="reward")
            self.next_state = tf.placeholder(tf.float32, shape=[1, self.q_params['nb_inputs']], name="nextState")
            self.next_action = tf.placeholder(tf.int32, shape=[], name="nextAction")

            # Evaluate the Q-network on the next state, reusing the same weights.
            with tf.variable_scope(q_scope, reuse=True):
                next_q_values = tf.squeeze(capacities.value_f(self.q_params, self.next_state))

            # SARSA-style target: r + gamma * Q(s', a'), or r alone on terminal transitions.
            target_q1 = tf.stop_gradient(self.reward + self.discount * next_q_values[self.next_action])
            target_q2 = self.reward
            is_done = tf.cast(self.next_state[0, 4], tf.bool)  # the 5th input component carries the "done" flag
            target_q = tf.where(is_done, target_q2, target_q1)

            with tf.control_dependencies([target_q]):
                # 0.5 instead of 1/2 avoids integer division under Python 2.
                self.loss = 0.5 * tf.square(target_q - self.q_t)

            adam = tf.train.AdamOptimizer(self.lr)
            self.global_step = tf.Variable(0, trainable=False, name="global_step", collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
            self.train_op = adam.minimize(self.loss, global_step=self.global_step)
        # Summaries used for TensorBoard monitoring.
        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
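
For context, here is a minimal, self-contained sketch of the same target construction: the bootstrapped SARSA target is wrapped in stop_gradient, and tf.where falls back to the reward alone when the 5th state component marks a terminal transition. This is not the project's code; tf.layers.dense stands in for capacities.value_f, and all sizes are illustrative.

import numpy as np
import tensorflow as tf

nb_inputs, nb_actions, discount, lr = 5, 2, 0.99, 1e-2

graph = tf.Graph()
with graph.as_default():
    inputs = tf.placeholder(tf.float32, [1, nb_inputs])
    next_state = tf.placeholder(tf.float32, [1, nb_inputs])
    reward = tf.placeholder(tf.float32, [])
    action = tf.placeholder(tf.int32, [])
    next_action = tf.placeholder(tf.int32, [])

    # Same Q-network applied to the current and the next state (shared weights via reuse).
    q_values = tf.squeeze(tf.layers.dense(inputs, nb_actions, name='q'))
    next_q_values = tf.squeeze(tf.layers.dense(next_state, nb_actions, name='q', reuse=True))

    q_t = q_values[action]
    bootstrapped = tf.stop_gradient(reward + discount * next_q_values[next_action])
    is_done = tf.cast(next_state[0, 4], tf.bool)  # "done" flag stored in the state
    target_q = tf.where(is_done, reward, bootstrapped)
    loss = 0.5 * tf.square(target_q - q_t)
    train_op = tf.train.AdamOptimizer(lr).minimize(loss)

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    s = np.random.rand(1, nb_inputs).astype(np.float32)
    s2 = np.random.rand(1, nb_inputs).astype(np.float32)
    s2[0, 4] = 0.0  # non-terminal transition
    _, l = sess.run([train_op, loss], {inputs: s, next_state: s2,
                                       reward: 1.0, action: 0, next_action: 1})
    print('loss:', l)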