def learn_critic(self, x_ma, a_ma, r, x2_ma, s, a, s2, epoch=0):
    """Run one critic training step, log the loss, and periodically
    hard-copy the evaluation-network params into the target network.

    KEY TRICK: the critic must take its gradient w.r.t. the *stored*
    joint actions, so we override the tensor ``self.a_ma`` in the
    feed_dict with the constant batch ``a_ma``. The actor, by contrast,
    differentiates through its own network output tensor. Feeding the
    wrong one breaks the whole network's gradients.
    """
    feeds = {
        X_MA: x_ma,
        self.a_ma: a_ma,  # override the actor's action tensor with constants
        R: r,
        X2_MA: x2_ma,
        S: s,
        S2: s2,
    }
    _grad_c, _loss_c, _grads_a = self.sess.run(self.train_ops, feed_dict=feeds)

    # Emit this agent's critic loss to TensorBoard.
    summ = tf.Summary()
    # summ.value.add(tag='info/c_gradient{}'.format(self.agent_id),
    #                simple_value=float(_grad_c))
    summ.value.add(tag='info/c_loss{}'.format(self.agent_id),
                   simple_value=float(_loss_c))
    writer.add_summary(summ, epoch)
    writer.flush()

    # A per-step soft update, target = (1 - tau) * target + tau * eval,
    # is computationally expensive here; instead do a periodic hard copy
    # of eval params into target params every `t_replace_iter` calls.
    if self.t_replace_counter % self.t_replace_iter == 0:
        self.sess.run([tf.assign(t, e)
                       for t, e in zip(self.t_params, self.e_params)])
    self.t_replace_counter += 1
# ------------------- Memory -------------------
# (removed stray blog-page text "评论列表" / "文章目录" — copy-paste
#  residue from the web page this snippet was taken from; it is not
#  valid Python and was never part of the program)