def _build_discounts_matrix(self, T, gamma):
    """Return the T x T lower-triangular matrix of discount factors.

    Entry (i, j) equals gamma ** (i - j) when j <= i and 0 otherwise.
    For example, with T = 3:

        D = [[1,       0,     0],
             [gamma,   1,     0],
             [gamma^2, gamma, 1]]

    With R an N x T matrix of incremental rewards, the discounted sums are
    given by R * D, read as a matrix product: column j of the result is
    the sum over i >= j of gamma^(i - j) * R[:, i].
    """
    # Row i of the mask holds ones in columns 0..i, i.e. a lower-triangular
    # matrix of ones.
    lower_tri_mask = tf.sequence_mask(tf.range(T) + 1, T, dtype=tf.float32)
    # An exclusive cumulative sum down the rows counts the ones strictly
    # above each entry: i - j on and below the diagonal, 0 above it.
    exponents = tf.cumsum(lower_tri_mask, axis=0, exclusive=True)
    # gamma ** 0 == 1 above the diagonal, so those entries are masked back
    # to zero.
    return tf.pow(gamma, exponents) * lower_tri_mask
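# (web-page residue, not part of the code) Comment list
# (web-page residue, not part of the code) Table of contents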