import tensorflow as tf

def policy_gradient(learning_rate=0.01):
    # Builds the policy network for CartPole: 4-dim state -> 2 action probabilities.
    with tf.variable_scope("policy"):
        params = tf.get_variable("policy_parameters", [4, 2])
        state = tf.placeholder("float", [None, 4])
        actions = tf.placeholder("float", [None, 2])      # one-hot encoding of the action taken
        advantages = tf.placeholder("float", [None, 1])   # advantage estimate for each step
        reward_input = tf.placeholder("float")            # total reward of the finished episode
        episode_reward = tf.get_variable("episode_reward", initializer=tf.constant(0.))
        # Store the fed episode reward in the graph variable so the summary below tracks it.
        episode_reward = tf.assign(episode_reward, reward_input)
        linear = tf.matmul(state, params)
        probabilities = tf.nn.softmax(linear)
        # Probability the policy assigned to the action that was actually taken.
        good_probabilities = tf.reduce_sum(tf.multiply(probabilities, actions), axis=1)
        # REINFORCE objective: log pi(a|s) weighted by the advantage.
        eligibility = tf.log(good_probabilities) * advantages
        loss = -tf.reduce_sum(eligibility)
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("episode_reward", episode_reward)
        return probabilities, state, actions, advantages, optimizer, reward_input, episode_reward
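
Below is a minimal usage sketch, not part of the original file: it assumes a TensorFlow 1.x session and uses dummy NumPy data in place of real CartPole rollouts, just to show how the returned handles (probabilities, placeholders, and optimizer op) might be wired together for sampling an action and running one policy-gradient update.

import numpy as np
import tensorflow as tf

pl_probabilities, pl_state, pl_actions, pl_advantages, pl_optimizer, pl_reward, pl_episode_reward = policy_gradient()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Sample an action from the current softmax policy for a single observation.
    obs = np.random.randn(1, 4)
    probs = sess.run(pl_probabilities, feed_dict={pl_state: obs})
    action = np.random.choice(2, p=probs[0])

    # One REINFORCE update on a dummy batch: states, one-hot actions, advantages.
    states = np.random.randn(5, 4)
    actions_onehot = np.eye(2)[np.random.randint(2, size=5)]
    advantages = np.random.randn(5, 1)
    sess.run(pl_optimizer, feed_dict={pl_state: states,
                                      pl_actions: actions_onehot,
                                      pl_advantages: advantages})

In the actual actor-critic training loop, the advantages fed here would come from the critic (value network) rather than random numbers, and pl_reward/pl_episode_reward would be used when writing summaries at the end of each episode.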
Source file: policy_gradient_actor_critic.py (Python)