def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter,
             a_ma, a2_ma, agent_id, agent_num):
    self.agent_id = agent_id
    self.agent_num = agent_num
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.lr = learning_rate
    self.gamma = gamma
    self.t_replace_iter = t_replace_iter  # hard-replace the target net every t_replace_iter learn steps
    self.t_replace_counter = 0

    with tf.variable_scope('Critic{}'.format(self.agent_id)):
        # Input (s, a) of all agents, output q
        local_a = a_ma[agent_id]             # this agent's own action tensor, kept for dq/da below
        self.a_ma = tf.concat(a_ma, axis=1)  # joint action of all agents
        self.q = self._build_critic_net(X_MA, self.a_ma, 'eval_net', trainable=True)
        # Input (s_, a_) of all agents, output q_ for the Q target;
        # q_ is based on a_ from the actors' target nets
        a2_ma = tf.concat(a2_ma, axis=1)
        self.q_ = self._build_critic_net(X2_MA, a2_ma, 'target_net', trainable=False)

    self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope='Critic{}/eval_net'.format(agent_id))
    self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope='Critic{}/target_net'.format(agent_id))

    with tf.variable_scope('target_q{}'.format(self.agent_id)):
        self.target_q = R + self.gamma * self.q_   # one-step TD target

    with tf.variable_scope('TD_error{}'.format(self.agent_id)):
        self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))  # MSE

    with tf.variable_scope('C_train{}'.format(self.agent_id)):
        self.train_ops = []
        # the critic step only updates the eval net's variables; the actor is untouched
        self.train_ops.append(tf.train.AdamOptimizer(self.lr).minimize(
            self.loss, var_list=self.e_params))
        self.train_ops.append(self.loss)   # expose the loss for tf.summary

    with tf.variable_scope('a_grad{}'.format(self.agent_id)):
        # per-sample gradient of q w.r.t. this agent's action, shape (None, a_dim);
        # only dq/da is needed here, dq/dw is discarded
        self.a_grads = tf.gradients(self.q, local_a)[0]
        self.train_ops.append(self.a_grads)
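
X_MA, X2_MA, and R are referenced above but not defined in this snippet; they are presumably module-level placeholders for the joint observation, the joint next observation, and this agent's reward. A minimal sketch of how they might be declared (the names STATE_DIM and AGENT_NUM and the shapes are assumptions for illustration, not taken from the source):

import tensorflow as tf

STATE_DIM, AGENT_NUM = 16, 3   # hypothetical sizes, only for illustration
# joint observation of all agents at the current and next step, plus this agent's reward
X_MA  = tf.placeholder(tf.float32, [None, STATE_DIM * AGENT_NUM], name='s_ma')
X2_MA = tf.placeholder(tf.float32, [None, STATE_DIM * AGENT_NUM], name='s2_ma')
R     = tf.placeholder(tf.float32, [None, 1], name='r')

The constructor also initializes t_replace_iter and t_replace_counter without using them, which suggests that a learn step elsewhere hard-copies the eval net into the target net every t_replace_iter updates. A possible sketch of that step, assuming a learn() method with this hypothetical signature on the same class:

def learn(self, feed_dict):
    # hypothetical: feed_dict carries X_MA, X2_MA, R and the per-agent action placeholders
    # hard replacement: copy eval-net weights into the target net every t_replace_iter calls
    if self.t_replace_counter % self.t_replace_iter == 0:
        self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
    self.t_replace_counter += 1
    # run the critic update, the loss value for tf.summary, and dq/da built in __init__
    return self.sess.run(self.train_ops, feed_dict=feed_dict)

Building the assign ops once in __init__ and reusing them would avoid adding new nodes to the graph on every replacement; the inline version above only mirrors the common tutorial style.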