def learn_critic(self, x_ma, a_ma, r, x2_ma, s, a, s2, epoch=0):
    """Run one critic training step, log the loss, and periodically
    hard-copy the evaluation-network params into the target network.

    KEY TRICK: the critic must take its gradient w.r.t. the *stored*
    joint actions, so we override the tensor ``self.a_ma`` in the
    feed_dict with the constant batch ``a_ma``. The actor, by contrast,
    differentiates through its own network output tensor. Feeding the
    wrong one breaks the whole network's gradients.
    """
    feeds = {
        X_MA: x_ma,
        self.a_ma: a_ma,  # override the actor's action tensor with constants
        R: r,
        X2_MA: x2_ma,
        S: s,
        S2: s2,
    }
    _grad_c, _loss_c, _grads_a = self.sess.run(self.train_ops, feed_dict=feeds)

    # Emit this agent's critic loss to TensorBoard.
    summ = tf.Summary()
    # summ.value.add(tag='info/c_gradient{}'.format(self.agent_id),
    #                simple_value=float(_grad_c))
    summ.value.add(tag='info/c_loss{}'.format(self.agent_id),
                   simple_value=float(_loss_c))
    writer.add_summary(summ, epoch)
    writer.flush()

    # A per-step soft update, target = (1 - tau) * target + tau * eval,
    # is computationally expensive here; instead do a periodic hard copy
    # of eval params into target params every `t_replace_iter` calls.
    if self.t_replace_counter % self.t_replace_iter == 0:
        self.sess.run([tf.assign(t, e)
                       for t, e in zip(self.t_params, self.e_params)])
    self.t_replace_counter += 1
# ------------------- Memory -------------------
# (removed stray blog-page text "评论列表" / "文章目录" — copy-paste
#  residue from the web page this snippet was taken from; it is not
#  valid Python and was never part of the program)