def __init__(self, scope='policy_network', learning_rate=0.001):
    self.initializer = tf.contrib.layers.xavier_initializer()
    with tf.variable_scope(scope):
        self.state = tf.placeholder(tf.float32, [None, state_dim], name='state')
        self.action = tf.placeholder(tf.int32, [None], name='action')
        self.target = tf.placeholder(tf.float32, name='target')
        # Forward pass: action probabilities for each state in the batch
        self.action_prob = policy_nn(self.state, state_dim, action_space, self.initializer)
        # Pick out the probability of the action that was actually taken
        action_mask = tf.cast(tf.one_hot(self.action, depth=action_space), tf.bool)
        self.picked_action_prob = tf.boolean_mask(self.action_prob, action_mask)
        # REINFORCE loss: -log pi(a|s) * target, plus any regularization losses collected in this scope
        self.loss = tf.reduce_sum(-tf.log(self.picked_action_prob) * self.target) \
                    + sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=scope))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)
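
A minimal usage sketch (not part of the original code), assuming this constructor belongs to a class such as PolicyNetwork and that state_dim, action_space, and policy_nn are module-level definitions as elsewhere in the code; the dummy batch and the variable names below are purely illustrative:

import numpy as np
import tensorflow as tf

policy = PolicyNetwork(scope='policy_network', learning_rate=0.001)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # One REINFORCE-style update: states, the actions taken, and their returns (targets)
    states = np.random.rand(4, state_dim).astype(np.float32)      # dummy batch of states
    actions = np.random.randint(0, action_space, size=4)          # dummy chosen actions
    returns = np.ones(4, dtype=np.float32)                        # dummy returns / advantages
    _, loss = sess.run([policy.train_op, policy.loss],
                       feed_dict={policy.state: states,
                                  policy.action: actions,
                                  policy.target: returns})

Because self.target is declared without a shape, it can be fed either a scalar return or a per-sample vector, as in the sketch above.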