def reward_from_reward_rnn_scores(self, action, reward_scores):
  """Computes a data-derived reward: the log-softmax score of the action.

  Interprets the reward network's raw output scores as unnormalized
  log-probabilities and returns the log-probability it assigns to the
  chosen note. Using these values as rewards encourages the model to
  retain what the reward network learned from data.

  Args:
    action: A one-hot encoding of the chosen action.
    reward_scores: The value for each note output by the reward_rnn.

  Returns:
    Float reward value.
  """
  log_partition = logsumexp(reward_scores)
  chosen_note = np.argmax(action)
  return reward_scores[chosen_note] - log_partition