import chainer
import numpy as np


def compute_policy_gradient_sample_correction(
        action_distrib, action_distrib_mu, action_value, v,
        truncation_threshold):
    """Compute off-policy bias correction term wrt a sampled action."""
    assert np.isscalar(v)
    assert truncation_threshold is not None
    with chainer.no_backprop_mode():
        # Draw a fresh action a ~ pi and compute 1/rho' = mu(a) / pi(a).
        sample_action = action_distrib.sample().data
        rho_dash_inv = compute_importance(
            action_distrib_mu, action_distrib, sample_action)
        if (truncation_threshold > 0 and
                rho_dash_inv >= 1 / truncation_threshold):
            # rho' <= c, so the truncated weight [1 - c / rho']_+ is zero.
            return chainer.Variable(np.asarray([0], dtype=np.float32))
        correction_weight = max(0, 1 - truncation_threshold * rho_dash_inv)
        assert correction_weight <= 1
        q = float(action_value.evaluate_actions(sample_action).data[0])
        correction_advantage = q - v
    # log_prob is evaluated outside no_backprop_mode so gradients flow
    # through the policy; the weight and advantage are treated as constants.
    return -(correction_weight *
             action_distrib.log_prob(sample_action) *
             correction_advantage)
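

# --- Usage sketch (illustrative, not part of the original module) ---
# Assumes this function lives alongside ChainerRL's ACER helpers (in
# particular the module-level compute_importance used above) and that
# chainerrl is importable; all numbers below are made-up toy values.
if __name__ == '__main__':
    from chainerrl.action_value import DiscreteActionValue
    from chainerrl.distribution import SoftmaxDistribution

    # One-state batch: current policy pi, stale behaviour policy mu,
    # per-action Q estimates, a scalar state-value estimate v, and a
    # truncation threshold c.
    logits_pi = chainer.Variable(
        np.array([[3.0, 0.0, 0.0]], dtype=np.float32))
    logits_mu = chainer.Variable(
        np.array([[0.0, 0.0, 0.0]], dtype=np.float32))
    q_values = chainer.Variable(
        np.array([[1.0, 2.0, 0.5]], dtype=np.float32))

    loss = compute_policy_gradient_sample_correction(
        action_distrib=SoftmaxDistribution(logits_pi),
        action_distrib_mu=SoftmaxDistribution(logits_mu),
        action_value=DiscreteActionValue(q_values),
        v=0.8,
        truncation_threshold=2.0)
    # loss is a chainer.Variable: the weighted correction term when the
    # sampled action's ratio pi(a)/mu(a) exceeds the threshold, otherwise
    # a constant zero.
    print(loss)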