def tf_discounted_cumulative_reward(self, terminal, reward, discount, final_reward=0.0):
    """
    Builds the TensorFlow operations that turn a sequence of rewards into
    the matching sequence of discounted cumulative rewards.

    The accumulation runs from the last element of the sequence back to the
    first (along axis 0). At every step the running value is multiplied by
    `discount` and added to that step's reward; wherever `terminal` is true
    the running value is dropped and the result is just that step's reward.

    Args:
        terminal: Terminal boolean tensor, aligned with `reward` along axis 0.
        reward: Reward tensor.
        discount: Discount factor applied to the running value at each step.
        final_reward: Value that seeds the accumulation, i.e. the running
            value assumed to follow the last element of the sequence.

    Returns:
        Discounted cumulative reward tensor, in the same order as `reward`.
    """
    # TODO: n-step cumulative reward (particularly for envs without terminal)

    def accumulate_step(running_return, step_inputs):
        step_reward, is_terminal = step_inputs
        discounted_return = step_reward + running_return * discount
        # A terminal step cuts the chain: nothing after it contributes.
        return tf.where(condition=is_terminal, x=step_reward, y=discounted_return)

    # tf.scan walks its inputs front-to-back, while the accumulation has to go
    # back-to-front, so both inputs are flipped along axis 0 beforehand.
    flipped_inputs = (
        tf.reverse(tensor=reward, axis=(0,)),
        tf.reverse(tensor=terminal, axis=(0,)),
    )
    flipped_returns = tf.scan(
        fn=accumulate_step,
        elems=flipped_inputs,
        initializer=final_reward
    )

    # Flip the result back so it lines up with the original reward order.
    return tf.reverse(tensor=flipped_returns, axis=(0,))
评论列表
文章目录