def _build_target(self):
    """Build the target network, mirroring the online network's architecture."""
    activation_fn = tf.nn.relu
    with tf.variable_scope('target'):
        self.t_s_t = tf.placeholder('float32', [None, self.state_size], name='t_s_t')

        # MLP feature extraction (single hidden layer; deeper variants left commented out)
        l1, self.w['target']['l1_w'], self.w['target']['l1_b'] = linear(
            self.t_s_t, 96, activation_fn=activation_fn, name='l1')
        # l2, self.w['target']['l2_w'], self.w['target']['l2_b'] = linear(l1, 16, activation_fn=activation_fn, name='l2')
        # l3, self.w['target']['l3_w'], self.w['target']['l3_b'] = linear(l2, 16, activation_fn=activation_fn, name='l3')
        l3 = l1

        if self.dueling:
            # Value stream: V(s) is a scalar per state
            value_hid, self.w['target']['l4_val_w'], self.w['target']['l4_val_b'] = linear(
                l3, 32, activation_fn=activation_fn, name='value_hid')
            value, self.w['target']['val_w_out'], self.w['target']['val_w_b'] = linear(
                value_hid, 1, name='value_out')

            # Advantage stream: A(s, a) is a vector with one entry per action
            adv_hid, self.w['target']['l4_adv_w'], self.w['target']['l4_adv_b'] = linear(
                l3, 32, activation_fn=activation_fn, name='adv_hid')
            advantage, self.w['target']['adv_w_out'], self.w['target']['adv_w_b'] = linear(
                adv_hid, self.action_size, name='adv_out')

            # Average dueling: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
            q_target = value + (advantage - tf.reduce_mean(advantage, reduction_indices=1, keep_dims=True))
        else:
            l4, self.w['target']['l4_w'], self.w['target']['l4_b'] = linear(
                l3, 16, activation_fn=activation_fn, name='l4')
            q_target, self.w['target']['q_w'], self.w['target']['q_b'] = linear(
                l4, self.action_size, name='q')

        # Which actions are used to index the target Q-values depends on whether
        # double Q-learning is enabled; indices are fed as [batch_index, action] pairs.
        target_q_idx = tf.placeholder('int32', [None, None], name='q_id')
        # Gather the Q-values at the specified (state, action) indices
        target_q_with_idx = tf.gather_nd(q_target, target_q_idx)

    return q_target, target_q_idx, target_q_with_idx
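
# A minimal sketch of the `linear` helper assumed above: a fully connected layer
# returning (output, weights, bias), consistent with the call sites in _build_target.
# The actual helper in the original code may differ (initializers, dtype, etc.).
def linear(input_, output_size, activation_fn=None, name='linear'):
    shape = input_.get_shape().as_list()
    with tf.variable_scope(name):
        w = tf.get_variable('w', [shape[1], output_size], tf.float32,
                            initializer=tf.truncated_normal_initializer(stddev=0.02))
        b = tf.get_variable('b', [output_size],
                            initializer=tf.constant_initializer(0.0))
        out = tf.nn.bias_add(tf.matmul(input_, w), b)
        if activation_fn is not None:
            out = activation_fn(out)
        return out, w, b

# Usage sketch (assumption, hypothetical attribute names): with double Q-learning,
# the online network selects the greedy actions and this target network evaluates
# them, e.g. feeding `target_q_idx` with [[i, a_i] for i, a_i in enumerate(pred_actions)]
# so that `target_q_with_idx` yields Q_target(s'_i, argmax_a Q_online(s'_i, a)).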