def generalized_advantage_estimations(rewards, values, terminal=None, gamma=0.99, tau=0.95):
gae = 0.0
advantages = []
values = th.cat([values, V(T([0.0077]))])
for i in reversed(range(len(rewards))):
nonterminal = 1.0 - terminal[i]
delta = rewards[i] + gamma * values[i+1] * nonterminal - values[i]
gae = delta + gamma * tau * gae * nonterminal
advantages.insert(0, gae + values[i])
return th.cat(advantages)
评论列表
文章目录