def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter,
             a_ma, a2_ma, agent_id, agent_num):
    self.agent_id = agent_id
    self.agent_num = agent_num
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.lr = learning_rate
    self.gamma = gamma
    self.t_replace_iter = t_replace_iter  # hard-replace the target net every t_replace_iter learn steps
    self.t_replace_counter = 0

    with tf.variable_scope('Critic{}'.format(self.agent_id)):
        # Input (s, a) of all agents, output q
        local_a = a_ma[agent_id]             # this agent's own action tensor, kept for dq/da below
        self.a_ma = tf.concat(a_ma, axis=1)  # joint action of all agents
        self.q = self._build_critic_net(X_MA, self.a_ma, 'eval_net', trainable=True)
        # Input (s_, a_) of all agents, output q_ for the Q target;
        # q_ is based on a_ from the actors' target nets
        a2_ma = tf.concat(a2_ma, axis=1)
        self.q_ = self._build_critic_net(X2_MA, a2_ma, 'target_net', trainable=False)

    self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope='Critic{}/eval_net'.format(agent_id))
    self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope='Critic{}/target_net'.format(agent_id))

    with tf.variable_scope('target_q{}'.format(self.agent_id)):
        self.target_q = R + self.gamma * self.q_   # one-step TD target

    with tf.variable_scope('TD_error{}'.format(self.agent_id)):
        self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))  # MSE

    with tf.variable_scope('C_train{}'.format(self.agent_id)):
        self.train_ops = []
        # the critic step only updates the eval net's variables; the actor is untouched
        self.train_ops.append(tf.train.AdamOptimizer(self.lr).minimize(
            self.loss, var_list=self.e_params))
        self.train_ops.append(self.loss)   # expose the loss for tf.summary

    with tf.variable_scope('a_grad{}'.format(self.agent_id)):
        # per-sample gradient of q w.r.t. this agent's action, shape (None, a_dim);
        # only dq/da is needed here, dq/dw is discarded
        self.a_grads = tf.gradients(self.q, local_a)[0]
        self.train_ops.append(self.a_grads)
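
X_MA, X2_MA, and R are referenced above but not defined in this snippet; they are presumably module-level placeholders for the joint observation, the joint next observation, and this agent's reward. A minimal sketch of how they might be declared (the names STATE_DIM and AGENT_NUM and the shapes are assumptions for illustration, not taken from the source):

import tensorflow as tf

STATE_DIM, AGENT_NUM = 16, 3   # hypothetical sizes, only for illustration
# joint observation of all agents at the current and next step, plus this agent's reward
X_MA  = tf.placeholder(tf.float32, [None, STATE_DIM * AGENT_NUM], name='s_ma')
X2_MA = tf.placeholder(tf.float32, [None, STATE_DIM * AGENT_NUM], name='s2_ma')
R     = tf.placeholder(tf.float32, [None, 1], name='r')

The constructor also initializes t_replace_iter and t_replace_counter without using them, which suggests that a learn step elsewhere hard-copies the eval net into the target net every t_replace_iter updates. A possible sketch of that step, assuming a learn() method with this hypothetical signature on the same class:

def learn(self, feed_dict):
    # hypothetical: feed_dict carries X_MA, X2_MA, R and the per-agent action placeholders
    # hard replacement: copy eval-net weights into the target net every t_replace_iter calls
    if self.t_replace_counter % self.t_replace_iter == 0:
        self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
    self.t_replace_counter += 1
    # run the critic update, the loss value for tf.summary, and dq/da built in __init__
    return self.sess.run(self.train_ops, feed_dict=feed_dict)

Building the assign ops once in __init__ and reusing them would avoid adding new nodes to the graph on every replacement; the inline version above only mirrors the common tutorial style.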