def reward(self, history_id, rewards):
    """Reward the previous action with reward.

    Loads the context stored for ``history_id`` and the current model,
    rescales every weight ``w[i]`` once per rewarded action, saves the
    model back and finally records the rewards in the history storage.

    For every key ``i`` of the context and every ``(action_id, reward)``
    pair in ``rewards`` the update is::

        y_hat = context[i][action_id] * reward / action_probs[action_id]
        v_hat = sum(context[i][k] / action_probs[k] for k in action_ids)
        w[i]  = w[i] * exp(p_min / 2
                           * (y_hat
                              + v_hat * sqrt(log(N / delta) / (K * T))))

    with ``N = len(context)``, ``K = len(action_ids)`` and
    ``T = self.max_rounds``.
    NOTE(review): this looks like the Exp4.P weight update -- confirm
    against the enclosing class.

    Parameters
    ----------
    history_id : int
        The history id of the action to reward.

    rewards : dictionary
        The dictionary {action_id: reward}, where reward is a float.

    Notes
    -----
    ``w`` is the mapping held by the loaded model and is updated in
    place before the model is saved.  When ``rewards`` is empty no weight
    is touched, but the model is still saved and the (empty) rewards are
    still recorded.
    """
    context = (self._historystorage
               .get_unrewarded_history(history_id)
               .context)

    model = self._modelstorage.get_model()
    w = model['w']
    action_probs = model['action_probs']
    # The action ids are read from the first context entry; every entry is
    # indexed with all of them below, so they are assumed to share the same
    # action set (TODO confirm against the caller).
    action_ids = list(next(iter(context.values())))

    # Update the model
    if rewards:
        # Neither the confidence term nor v_hat depends on the rewarded
        # action, so compute them once instead of once per
        # (action, context key) pair.
        confidence = np.sqrt(np.log(len(context) / self.delta)
                             / (len(action_ids) * self.max_rounds))
        v_hat = {
            i: sum(advice[k] / action_probs[k] for k in action_ids)
            for i, advice in context.items()
        }

        for action_id, value in rewards.items():
            for i, advice in context.items():
                y_hat = advice[action_id] * value / action_probs[action_id]
                w[i] = w[i] * np.exp(
                    self.p_min / 2 * (y_hat + v_hat[i] * confidence))

    self._modelstorage.save_model({
        'action_probs': action_probs, 'w': w})

    # Update the history
    self._historystorage.add_reward(history_id, rewards)
# Comment list
# Table of contents