import tensorflow as tf
import tensorflow.contrib.slim as slim

class agent():
    def __init__(self, lr, s_size, a_size):
        # These lines establish the feed-forward part of the network.
        # The agent takes a state and produces an action.
        self.state_in = tf.placeholder(shape=[1], dtype=tf.int32)
        state_in_OH = slim.one_hot_encoding(self.state_in, s_size)
        output = slim.fully_connected(state_in_OH, a_size,
                                      biases_initializer=None,
                                      activation_fn=tf.nn.sigmoid,
                                      weights_initializer=tf.ones_initializer())
        self.output = tf.reshape(output, [-1])
        self.chosen_action = tf.argmax(self.output, 0)
        # The next six lines establish the training procedure. We feed the reward
        # and chosen action into the network to compute the loss,
        # -log(output[action]) * reward, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
        self.responsible_weight = tf.slice(self.output, self.action_holder, [1])
        self.loss = -(tf.log(self.responsible_weight) * self.reward_holder)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)
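
To make the training procedure concrete, here is a minimal sketch of a loop that drives this graph with the TF1 session API the placeholders imply. The get_reward environment stub, the epsilon-greedy exploration, and the lr/s_size/a_size values are all illustrative assumptions, not part of the agent code above.

import numpy as np

def get_reward(state, action):
    # Hypothetical environment stub: pays off when the action matches the state.
    return 1.0 if action == state else -1.0

myAgent = agent(lr=0.001, s_size=3, a_size=4)
epsilon = 0.1  # exploration rate for epsilon-greedy action selection

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for episode in range(1000):
        state = np.random.randint(0, 3)  # sample a context at random
        if np.random.rand() < epsilon:
            action = np.random.randint(0, 4)  # explore
        else:
            # Take the network's preferred action for this state.
            action = sess.run(myAgent.chosen_action,
                              feed_dict={myAgent.state_in: [state]})
        reward = get_reward(state, action)
        # One gradient step on -log(output[action]) * reward.
        sess.run(myAgent.update,
                 feed_dict={myAgent.reward_holder: [reward],
                            myAgent.action_holder: [action],
                            myAgent.state_in: [state]})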