def surrogate_objective(policy_out):
    """Build the surrogate objective used for policy-gradient training.

    Two float32 placeholders are created: one of shape ``[None, 2]`` for
    the actions and one of shape ``[None, 1]`` for the rewards.  The
    objective is the ``axes=2`` tensor contraction of ``log(policy_out)``
    with the element-wise product ``actions * rewards``.

    NOTE(review): presumably ``policy_out`` is a rank-2 ``[batch, 2]``
    tensor of action probabilities, which would make the objective a
    scalar -- confirm against the caller.

    Args:
        policy_out: Tensor produced by the policy; its log is taken, so
            its entries are expected to be positive.

    Returns:
        A tuple ``(actions, rewards, objective)``: the two placeholders
        followed by the objective tensor.
    """
    # Placeholders are created in the same order as before (actions,
    # then rewards) so auto-generated graph op names are unchanged.
    action_ph = tf.placeholder(tf.float32, shape=[None, 2])
    reward_ph = tf.placeholder(tf.float32, shape=[None, 1])

    log_policy = tf.log(policy_out)
    # The [None, 1] rewards broadcast across the two action columns.
    weighted_actions = action_ph * reward_ph

    surrogate = tf.tensordot(log_policy, weighted_actions, axes=2)
    return action_ph, reward_ph, surrogate
评论列表
文章目录