def update_weights(self, f):
"""
Gradient-based update of current Critic parameters. Also return the
action gradients for the Actor update later. This is the dQ/da in the
paper, and Q is the current Q network, not the target Q network.
"""
feed = {
self.obs_t_BO: f['obs_t_BO'],
self.act_t_BA: f['act_t_BA'],
self.rew_t_B: f['rew_t_B'],
self.obs_tp1_BO: f['obs_tp1_BO'],
self.done_mask_B: f['done_mask_B']
}
    action_grads_BA, _, l2_error = self.sess.run(
        [self.act_grads_BA, self.optimize_c, self.l2_error], feed)
    # self.act_grads_BA evaluates to a list; we differentiated w.r.t. a
    # single tensor (the actions), so its only item is the gradient we want.
assert len(action_grads_BA) == 1
return action_grads_BA[0], l2_error
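
# A minimal usage sketch (not from the original code): assuming a replay
# buffer `replay` whose sample() returns a dict with the keys fed above, and
# an Actor exposing update_weights(obs_t_BO, action_grads_BA), one training
# step might look like:
#
#   batch = replay.sample(batch_size)
#   dq_da_BA, l2_error = critic.update_weights(batch)
#   actor.update_weights(batch['obs_t_BO'], dq_da_BA)
#
# The returned dQ/da term is what lets the Actor apply the deterministic
# policy gradient: it chains the Critic's gradient w.r.t. the actions into
# the gradient w.r.t. the Actor's own parameters.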