def build_graph(self, graph):
    # Assumes TF 1.x (`import tensorflow as tf`) and the project's `capacities` helper module.
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        # Exploration bookkeeping: N0 and min_eps drive the epsilon-greedy schedule.
        self.N0_t = tf.constant(self.N0, tf.float32, name='N_0')
        self.N = tf.Variable(0., dtype=tf.float32, name='N', trainable=False)
        self.min_eps_t = tf.constant(self.min_eps, tf.float32, name='min_eps')

        self.inputs = tf.placeholder(tf.float32, shape=[None, self.q_params['nb_inputs']], name='inputs')

        # Q-value network; the scope is reused later to evaluate the next state with the same weights.
        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.q_values = tf.squeeze(capacities.value_f(self.q_params, self.inputs))

        # Epsilon-greedy action selection over the current Q-values.
        self.action_t = capacities.eps_greedy(
            self.inputs, self.q_values, self.env.action_space.n, self.N0, self.min_eps
        )
        self.q_t = self.q_values[self.action_t]
        with tf.variable_scope('Training'):
            self.reward = tf.placeholder(tf.float32, shape=[], name="reward")
            self.next_state = tf.placeholder(tf.float32, shape=[1, self.q_params['nb_inputs']], name="nextState")
            self.next_action = tf.placeholder(tf.int32, shape=[], name="nextAction")

            # Evaluate the Q-network on the next state, reusing the same weights.
            with tf.variable_scope(q_scope, reuse=True):
                next_q_values = tf.squeeze(capacities.value_f(self.q_params, self.next_state))

            # SARSA-style target: r + gamma * Q(s', a'), or r alone on terminal transitions.
            target_q1 = tf.stop_gradient(self.reward + self.discount * next_q_values[self.next_action])
            target_q2 = self.reward
            is_done = tf.cast(self.next_state[0, 4], tf.bool)  # the 5th input component carries the "done" flag
            target_q = tf.where(is_done, target_q2, target_q1)

            with tf.control_dependencies([target_q]):
                # 0.5 instead of 1/2 avoids integer division under Python 2.
                self.loss = 0.5 * tf.square(target_q - self.q_t)

            adam = tf.train.AdamOptimizer(self.lr)
            self.global_step = tf.Variable(0, trainable=False, name="global_step", collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
            self.train_op = adam.minimize(self.loss, global_step=self.global_step)
        # Summaries used for TensorBoard monitoring.
        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
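
For context, here is a minimal, self-contained sketch of the same target construction: the bootstrapped SARSA target is wrapped in stop_gradient, and tf.where falls back to the reward alone when the 5th state component marks a terminal transition. This is not the project's code; tf.layers.dense stands in for capacities.value_f, and all sizes are illustrative.

import numpy as np
import tensorflow as tf

nb_inputs, nb_actions, discount, lr = 5, 2, 0.99, 1e-2

graph = tf.Graph()
with graph.as_default():
    inputs = tf.placeholder(tf.float32, [1, nb_inputs])
    next_state = tf.placeholder(tf.float32, [1, nb_inputs])
    reward = tf.placeholder(tf.float32, [])
    action = tf.placeholder(tf.int32, [])
    next_action = tf.placeholder(tf.int32, [])

    # Same Q-network applied to the current and the next state (shared weights via reuse).
    q_values = tf.squeeze(tf.layers.dense(inputs, nb_actions, name='q'))
    next_q_values = tf.squeeze(tf.layers.dense(next_state, nb_actions, name='q', reuse=True))

    q_t = q_values[action]
    bootstrapped = tf.stop_gradient(reward + discount * next_q_values[next_action])
    is_done = tf.cast(next_state[0, 4], tf.bool)  # "done" flag stored in the state
    target_q = tf.where(is_done, reward, bootstrapped)
    loss = 0.5 * tf.square(target_q - q_t)
    train_op = tf.train.AdamOptimizer(lr).minimize(loss)

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    s = np.random.rand(1, nb_inputs).astype(np.float32)
    s2 = np.random.rand(1, nb_inputs).astype(np.float32)
    s2[0, 4] = 0.0  # non-terminal transition
    _, l = sess.run([train_op, loss], {inputs: s, next_state: s2,
                                       reward: 1.0, action: 0, next_action: 1})
    print('loss:', l)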