deep_policy_agent.py source code

python

Project: openai-rl    Author: morgangiraud
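# Module-level context assumed from the openai-rl repo (not shown in this
# snippet): numpy as np, tensorflow as tf (1.x), and the repo's `capacities`
# helper module.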
def build_graph(self, graph):
        np.random.seed(self.random_seed)
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            # Dims: bs x num_steps x state_size
            self.inputs = tf.placeholder(tf.float32, shape=[None, None, self.policy_params['nb_inputs']], name='inputs')
            input_shape = tf.shape(self.inputs)
            dynamic_batch_size, dynamic_num_steps = input_shape[0], input_shape[1]

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
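                # Flatten to [bs * num_steps, nb_inputs] so the policy network
                # sees a plain 2-D batch, then restore the time dimension below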
                policy_inputs = tf.reshape(self.inputs, [-1, self.policy_params['nb_inputs']])
                probs, actions = capacities.policy(self.policy_params, policy_inputs)
                self.probs = tf.reshape(probs, [dynamic_batch_size, dynamic_num_steps, self.policy_params['nb_outputs']])
                self.actions = tf.reshape(actions, [dynamic_batch_size, dynamic_num_steps, 1])
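            # Convenience scalar: the sampled action for the first step of the
            # first sequence (used when acting on a single state)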
            self.action_t = self.actions[0, 0, 0]

            with tf.variable_scope('Training'):
                self.rewards = tf.placeholder(tf.float32, shape=[None, None, 1], name="reward")
                self.mask_plh = tf.placeholder(tf.float32, shape=[None, None, 1], name="mask_plh")

                # Scalar baseline: the mean reward over the whole batch,
                # which reduces the variance of the REINFORCE gradient estimate
                baseline = tf.reduce_mean(self.rewards)

                # Build (row, step, action) index triples so tf.gather_nd can
                # pick each taken action's probability out of the
                # [bs, num_steps, nb_outputs] probs tensor
                batch_size, num_steps = tf.shape(self.actions)[0], tf.shape(self.actions)[1]
                line_indices = tf.matmul( # Row indices
                    tf.reshape(tf.range(0, batch_size), [-1, 1])
                    , tf.ones([1, num_steps], dtype=tf.int32)
                )
                column_indices = tf.matmul( # Column (time-step) indices
                    tf.ones([batch_size, 1], dtype=tf.int32)
                    , tf.reshape(tf.range(0, num_steps), [1, -1])
                )
                depth_indices = tf.cast(tf.squeeze(self.actions, 2), tf.int32)
                stacked_actions = tf.stack(
                    [line_indices, column_indices, depth_indices], 2
                )
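                # stacked_actions shape: [bs, num_steps, 3]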

                log_probs = tf.expand_dims(tf.log(tf.gather_nd(self.probs, stacked_actions)), 2)
                # Sum the masked log-prob advantages over each sequence,
                # then average over the batch
                self.loss = tf.reduce_mean( - tf.reduce_sum((log_probs * (self.rewards - baseline)) * self.mask_plh, 1))

                adam = tf.train.AdamOptimizer(self.lr)
                self.global_step = tf.Variable(0, trainable=False, name="global_step", collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
                self.train_op = adam.minimize(self.loss, global_step=self.global_step)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('av_score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            # capacities.counter (repo helper) evidently returns the episode
            # counter variable together with an op that increments it
            self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

        return graph
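
For reference, here is a minimal sketch of how a graph built this way is typically driven. The `agent` object, the padded batch arrays, and the session setup are illustrative assumptions, not code from the openai-rl repo:

# Hypothetical driver; `agent`, `states_batch`, `returns_batch` and
# `mask_batch` are assumed to exist and are not part of the repo snippet.
import numpy as np
import tensorflow as tf

graph = agent.build_graph(tf.Graph())
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())

    # Act: feed one state as a [1, 1, nb_inputs] batch and read action_t
    state = np.zeros((1, 1, agent.policy_params['nb_inputs']), dtype=np.float32)
    action = sess.run(agent.action_t, feed_dict={agent.inputs: state})

    # Learn: after collecting a padded batch of episodes, run one update
    _, loss = sess.run(
        [agent.train_op, agent.loss],
        feed_dict={
            agent.inputs: states_batch,    # [bs, num_steps, nb_inputs]
            agent.rewards: returns_batch,  # [bs, num_steps, 1]
            agent.mask_plh: mask_batch,    # 1.0 for real steps, 0.0 for padding
        },
    )

The mask placeholder is what lets episodes of different lengths share one padded batch: padded steps contribute zero to the loss.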