def _update_quantile(self):
    """Shift ``self.quantile_value`` by a percentile of the action-value gap.

    For the states stored in ``self._quantile_states``, the gap is the value
    of ``self._limited_action`` minus the element-wise maximum of the values
    of every other action, all obtained from ``self.action_values``.  The
    ``self._quantile``-th percentile of those gaps (``np.percentile``, so a
    0-100 scale) is multiplied by ``self._quantile_update_rate`` and added to
    ``self.quantile_value``.

    When no states have been recorded the method returns immediately and
    leaves ``self.quantile_value`` unchanged.

    Side effects: updates ``self.quantile_value`` in place and prints the
    percentile target and the new quantile value to stdout.
    """
    states = np.array(self._quantile_states, dtype=np.float32)
    if states.size == 0:
        # A percentile of zero samples is undefined: depending on the NumPy
        # version it raises or yields NaN, and a NaN added here would stay in
        # quantile_value for every later update.
        return

    limited_action_values = self.action_values(states, self._limited_action)
    # Best value achievable per state without using the limited action.
    # NOTE(review): assumes action_values returns one value per state so the
    # stacked array reduces cleanly along axis 0 - confirm against its
    # definition.
    base_action_values = np.max(
        np.array(
            [
                self.action_values(states, action)
                for action in range(self.num_actions)
                if action != self._limited_action
            ]
        ),
        axis=0,
    )
    target = np.percentile(
        limited_action_values - base_action_values, self._quantile
    )
    print("REWARD PENALTY TARGET:", target)
    self.quantile_value += self._quantile_update_rate * target
    print("QUANTILE:", self.quantile_value)
limited_discrete_action_trainer.py 文件源码
python
阅读 27
收藏 0
点赞 0
评论 0
评论列表
文章目录