def __init__(self, learning_rate=0.001):
    # Weight initializer shared by all layers of the policy network
    self.initializer = tf.contrib.layers.xavier_initializer()
    with tf.variable_scope('supervised_policy'):
        # Inputs: a batch of state vectors and the expert actions taken in them
        self.state = tf.placeholder(tf.float32, [None, state_dim], name='state')
        self.action = tf.placeholder(tf.int32, [None], name='action')
        # Action distribution predicted by the policy network
        self.action_prob = policy_nn(self.state, state_dim, action_space, self.initializer)

        # Pick out the probability assigned to the expert action in each row
        action_mask = tf.cast(tf.one_hot(self.action, depth=action_space), tf.bool)
        self.picked_action_prob = tf.boolean_mask(self.action_prob, action_mask)

        # Negative log-likelihood of the expert actions plus L2 regularization losses
        self.loss = tf.reduce_sum(-tf.log(self.picked_action_prob)) + sum(
            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='supervised_policy'))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)
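A minimal training sketch for this graph might look as follows. It assumes the method above belongs to a class named SupervisedPolicy and that state_dim, action_space, and policy_nn are defined at module level, as implied by the snippet; the batch data here is random placeholder data for illustration only.

import numpy as np
import tensorflow as tf

# Build the graph and run one supervised update step (TF1-style session API)
policy = SupervisedPolicy(learning_rate=0.001)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_states = np.random.rand(32, state_dim).astype(np.float32)   # dummy states
    batch_actions = np.random.randint(0, action_space, size=32)       # dummy expert actions
    _, loss_value = sess.run(
        [policy.train_op, policy.loss],
        feed_dict={policy.state: batch_states, policy.action: batch_actions})
    print('loss:', loss_value)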