import tensorflow as tf

import util  # project-local helper that provides activation_summary


def preturn_network(rewards, discounts, values):
  """Computes the k-step preturns along axis 1: the accumulated discounted
  rewards plus the discounted value estimate at each step."""
  # The first reward must be zero and the first discount must be one,
  # so that the 0-step preturn reduces to the raw value estimate.
  first_reward = tf.Assert(
      tf.reduce_all(tf.equal(rewards[:, 0, :], 0.0)), [rewards[:, 0, :]])
  first_discount = tf.Assert(
      tf.reduce_all(tf.equal(discounts[:, 0, :], 1.0)), [discounts[:, 0, :]])

  with tf.control_dependencies([first_reward, first_discount]):
    with tf.variable_scope('preturn'):
      # Inclusive cumprod discounts the value at step k by gamma_0 * ... * gamma_k;
      # the exclusive cumprod discounts the reward at step k by gamma_0 * ... * gamma_{k-1}.
      accum_value_discounts = tf.cumprod(discounts, axis=1, exclusive=False)
      accum_reward_discounts = tf.cumprod(discounts, axis=1, exclusive=True)
      discounted_values = values * accum_value_discounts
      discounted_rewards = rewards * accum_reward_discounts
      # The running sum of discounted rewards is the reward part of each
      # k-step preturn; adding the discounted value completes it.
      cumulative_rewards = tf.cumsum(discounted_rewards, axis=1)
      preturns = cumulative_rewards + discounted_values
      util.activation_summary(preturns)
      return preturns
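
Per step k along axis 1, the cumprod pair computes g_k = Σ_{i≤k} (γ_0⋯γ_{i−1}) r_i + (γ_0⋯γ_k) v_k, with r_0 = 0 and γ_0 = 1 enforced by the asserts. For context, here is a minimal sketch of how the op might be exercised; the shapes, constant inputs, and TF1 session setup below are illustrative assumptions, not part of the original code, and it presumes util is importable from the surrounding project:

import numpy as np
import tensorflow as tf

# Assumed shapes for illustration: batch of 2, four planning steps, scalar values.
batch, steps, dim = 2, 4, 1
rewards_np = np.random.rand(batch, steps, dim).astype(np.float32)
discounts_np = np.full((batch, steps, dim), 0.9, dtype=np.float32)
values_np = np.random.rand(batch, steps, dim).astype(np.float32)
rewards_np[:, 0, :] = 0.0    # satisfy the first assert
discounts_np[:, 0, :] = 1.0  # satisfy the second assert

preturns = preturn_network(
    tf.constant(rewards_np), tf.constant(discounts_np), tf.constant(values_np))

with tf.Session() as sess:
  print(sess.run(preturns).shape)  # (2, 4, 1): one preturn per step

No variables are created under the 'preturn' scope, so no initializer is needed; the scope only namespaces the summary op.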