def _build_target(self):
    """Build the target network, mirroring the online network's architecture."""
    activation_fn = tf.nn.relu
    with tf.variable_scope('target'):
        self.t_s_t = tf.placeholder('float32', [None, self.state_size], name='t_s_t')

        # MLP feature extraction (single hidden layer; deeper variants left commented out)
        l1, self.w['target']['l1_w'], self.w['target']['l1_b'] = linear(
            self.t_s_t, 96, activation_fn=activation_fn, name='l1')
        # l2, self.w['target']['l2_w'], self.w['target']['l2_b'] = linear(l1, 16, activation_fn=activation_fn, name='l2')
        # l3, self.w['target']['l3_w'], self.w['target']['l3_b'] = linear(l2, 16, activation_fn=activation_fn, name='l3')
        l3 = l1

        if self.dueling:
            # Value stream: V(s) is a scalar per state
            value_hid, self.w['target']['l4_val_w'], self.w['target']['l4_val_b'] = linear(
                l3, 32, activation_fn=activation_fn, name='value_hid')
            value, self.w['target']['val_w_out'], self.w['target']['val_w_b'] = linear(
                value_hid, 1, name='value_out')

            # Advantage stream: A(s, a) is a vector with one entry per action
            adv_hid, self.w['target']['l4_adv_w'], self.w['target']['l4_adv_b'] = linear(
                l3, 32, activation_fn=activation_fn, name='adv_hid')
            advantage, self.w['target']['adv_w_out'], self.w['target']['adv_w_b'] = linear(
                adv_hid, self.action_size, name='adv_out')

            # Average dueling: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
            q_target = value + (advantage - tf.reduce_mean(advantage, reduction_indices=1, keep_dims=True))
        else:
            l4, self.w['target']['l4_w'], self.w['target']['l4_b'] = linear(
                l3, 16, activation_fn=activation_fn, name='l4')
            q_target, self.w['target']['q_w'], self.w['target']['q_b'] = linear(
                l4, self.action_size, name='q')

        # Which actions are used to index the target Q-values depends on whether
        # double Q-learning is enabled; indices are fed as [batch_index, action] pairs.
        target_q_idx = tf.placeholder('int32', [None, None], name='q_id')
        # Gather the Q-values at the specified (state, action) indices
        target_q_with_idx = tf.gather_nd(q_target, target_q_idx)

    return q_target, target_q_idx, target_q_with_idx
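
# A minimal sketch of the `linear` helper assumed above: a fully connected layer
# returning (output, weights, bias), consistent with the call sites in _build_target.
# The actual helper in the original code may differ (initializers, dtype, etc.).
def linear(input_, output_size, activation_fn=None, name='linear'):
    shape = input_.get_shape().as_list()
    with tf.variable_scope(name):
        w = tf.get_variable('w', [shape[1], output_size], tf.float32,
                            initializer=tf.truncated_normal_initializer(stddev=0.02))
        b = tf.get_variable('b', [output_size],
                            initializer=tf.constant_initializer(0.0))
        out = tf.nn.bias_add(tf.matmul(input_, w), b)
        if activation_fn is not None:
            out = activation_fn(out)
        return out, w, b

# Usage sketch (assumption, hypothetical attribute names): with double Q-learning,
# the online network selects the greedy actions and this target network evaluates
# them, e.g. feeding `target_q_idx` with [[i, a_i] for i, a_i in enumerate(pred_actions)]
# so that `target_q_with_idx` yields Q_target(s'_i, argmax_a Q_online(s'_i, a)).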