import tensorflow as tf


def eligibility_dutch_traces(Qs_t, states_t, actions_t, lr, discount, lambda_value):
    # Beware: dutch traces must be paired with the matching (true-online)
    # learning rule, not the standard accumulating-trace TD update.
    et = tf.get_variable(
        "eligibilitytraces",
        shape=Qs_t.get_shape(),
        dtype=tf.float32,
        trainable=False,
        initializer=tf.zeros_initializer(),
    )
    tf.summary.histogram('eligibilitytraces', et)

    # Look up the current trace values for the visited (state, action) pairs.
    state_action_pairs = tf.stack([states_t, actions_t], 1)
    current_trace = tf.gather_nd(et, state_action_pairs)
    # Dutch-trace increment: 1 - alpha * gamma * lambda * e(s, a).
    updates = 1 - lr * discount * lambda_value * current_trace

    # Ensure the old trace is read before the whole table is decayed.
    with tf.control_dependencies([updates]):
        dec_et_op = tf.assign(et, discount * lambda_value * et)
    # After decaying, add the increment at the visited entries.
    with tf.control_dependencies([dec_et_op]):
        update_et_op = tf.scatter_nd_add(et, indices=state_action_pairs, updates=updates)

    reset_et_op = et.assign(tf.zeros_like(et, dtype=tf.float32))

    return (et, update_et_op, reset_et_op)
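For reference, the op graph above implements the tabular dutch-trace update e(s, a) <- gamma * lambda * e(s, a) + (1 - alpha * gamma * lambda * e(s, a)) at each visited pair, with a plain gamma * lambda decay everywhere else. Below is a minimal usage sketch, not part of the original code: the Q-table shape, the placeholder names, and the hyperparameter values are all illustrative assumptions.

# Hypothetical usage sketch (assumed setup, not from the original post).
import tensorflow as tf

n_states, n_actions = 16, 4
Qs_t = tf.get_variable("Q", shape=[n_states, n_actions],
                       initializer=tf.zeros_initializer())
states_t = tf.placeholder(tf.int32, [None], name="states")    # visited states
actions_t = tf.placeholder(tf.int32, [None], name="actions")  # visited actions

et, update_et_op, reset_et_op = eligibility_dutch_traces(
    Qs_t, states_t, actions_t, lr=0.1, discount=0.99, lambda_value=0.9)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(reset_et_op)                      # clear traces at episode start
    sess.run(update_et_op,                     # decay all traces, bump the visited pair
             feed_dict={states_t: [3], actions_t: [1]})
    print(sess.run(et)[3, 1])                  # 1.0 on a first visit (zero prior trace)

Because the decay and the scatter-add are chained with control dependencies, a single run of update_et_op performs the read, decay, and add steps in the right order.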