exp4p.py 文件源码-python代码片段

def reward(self, history_id, rewards):
        """Reward the previous action with reward.

        Looks up the context stored for ``history_id``, multiplies the
        weight ``w[i]`` of every advisor ``i`` found in that context by an
        exponential factor for each entry of ``rewards``, saves the updated
        model and finally records the rewards in the history storage.

        Parameters
        ----------
        history_id : int
            The history id of the action to reward.

        rewards : dictionary
            The dictionary {action_id: reward}, where reward is a float.
        """
        context = (self._historystorage
                   .get_unrewarded_history(history_id)
                   .context)

        model = self._modelstorage.get_model()
        w = model['w']
        action_probs = model['action_probs']
        # The action ids are taken from the first advisor's entry only
        # (presumably every advisor covers the same actions -- confirm).
        action_ids = list(next(iter(context.values())).keys())

        # Update the model
        if rewards:
            # Neither quantity below depends on the rewarded action or on
            # the reward value, so compute them once up front instead of
            # once per (rewarded action, advisor) pair.
            bonus_scale = np.sqrt(
                np.log(len(context) / self.delta)
                / (len(action_ids) * self.max_rounds))
            v_hat = {
                i: sum(advice[k] / action_probs[k] for k in action_ids)
                for i, advice in context.items()
            }

            # The whole factor (including the v_hat term) is applied once
            # per entry in ``rewards``, exactly as before.
            for action_id, reward_value in rewards.items():
                for i, advice in context.items():
                    y_hat = (advice[action_id] * reward_value
                             / action_probs[action_id])
                    w[i] = w[i] * np.exp(
                        self.p_min / 2
                        * (y_hat + v_hat[i] * bonus_scale))

        self._modelstorage.save_model({
            'action_probs': action_probs, 'w': w})

        # Update the history
        self._historystorage.add_reward(history_id, rewards)