def _build_train(self):
    activation_fn = tf.nn.relu
    with tf.variable_scope('train'):
        # Batched s_t in, batched Q values and greedy actions out.
        self.s_t = tf.placeholder('float32', [None, self.state_size], name='s_t')

        # MLP feature extraction (s_t -> l3).
        l1, self.w['train']['l1_w'], self.w['train']['l1_b'] = linear(self.s_t, 96, activation_fn=activation_fn, name='l1')
        #l2, self.w['train']['l2_w'], self.w['train']['l2_b'] = linear(l1, 16, activation_fn=activation_fn, name='l2')
        #l3, self.w['train']['l3_w'], self.w['train']['l3_b'] = linear(l2, 16, activation_fn=activation_fn, name='l3')
        l3 = l1

        if self.dueling:
            # Value net: V(s) is a scalar per state (l3 -> value).
            value_hid, self.w['train']['l4_val_w'], self.w['train']['l4_val_b'] = linear(l3, 32, activation_fn=activation_fn, name='value_hid')
            value, self.w['train']['val_w_out'], self.w['train']['val_w_b'] = linear(value_hid, 1, name='value_out')

            # Advantage net: A(s, a) is a vector with one advantage per action (l3 -> advantage).
            adv_hid, self.w['train']['l4_adv_w'], self.w['train']['l4_adv_b'] = linear(l3, 32, activation_fn=activation_fn, name='adv_hid')
            advantage, self.w['train']['adv_w_out'], self.w['train']['adv_w_b'] = linear(adv_hid, self.action_size, name='adv_out')

            # Average dueling aggregation: Q = V + (A - mean(A)).
            # (reduction_indices / keep_dims are the legacy TF1 spellings of axis / keepdims.)
            q_train = value + (advantage - tf.reduce_mean(advantage, reduction_indices=1, keep_dims=True))
        else:
            l4, self.w['train']['l4_w'], self.w['train']['l4_b'] = linear(l3, 16, activation_fn=activation_fn, name='l4')
            q_train, self.w['train']['q_w'], self.w['train']['q_b'] = linear(l4, self.action_size, name='q')

        # Greedy policy: pick the action with the highest Q value.
        q_action = tf.argmax(q_train, dimension=1)
    return q_train, q_action
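
# The average-dueling aggregation used above, Q = V + (A - mean(A)), can be
# checked with plain NumPy. This is a minimal standalone sketch, not part of
# the original class; the batch of values and advantages is made-up data
# purely for illustration.
import numpy as np

value = np.array([[1.0], [0.5]])            # V(s), shape (batch, 1)
advantage = np.array([[2.0, 0.0, 1.0],
                      [0.3, 0.6, 0.0]])     # A(s, a), shape (batch, actions)

# Same aggregation as q_train: subtract the per-state mean advantage, add V.
q = value + (advantage - advantage.mean(axis=1, keepdims=True))
greedy = q.argmax(axis=1)                   # same greedy policy as q_action

print(q)       # [[2.  0.  1. ]
               #  [0.5 0.8 0.2]]
print(greedy)  # [0 1]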