def _compute_max_qval_action_pair(self, state, horizon=None):
    '''
    Find the action in self.actions with the largest Q-value in @state.

    Args:
        state (State)
        horizon (int): Indicates the level of recursion depth for computing Q.
            When None, self.horizon is used instead.

    Returns:
        (tuple) --> (float, str): where the float is the Qval, str is the action.

    Notes:
        The search is seeded with a randomly chosen action and only a
        strictly larger Q-value replaces the current best. If the seed
        action ties for the maximum it is the one returned; otherwise the
        first maximal action in self.actions order wins.

        Each action is evaluated at most once per call. This assumes
        self.get_q_value returns the same value for identical arguments
        within a single call -- TODO confirm against get_q_value.
    '''
    # `is None` (not falsiness) so an explicit horizon of 0 is respected.
    if horizon is None:
        horizon = self.horizon

    # Grab a random initial action so ties are not always resolved toward
    # the first entry of self.actions.
    seed_action = random.choice(self.actions)
    best_action = seed_action
    max_q_val = self.get_q_value(state, seed_action, horizon)

    # Find best action (action w/ current max predicted Q value).
    for action in self.actions:
        if action == seed_action:
            # Already evaluated above; re-evaluating could never win the
            # strict comparison below, so skip the redundant call.
            continue
        q_s_a = self.get_q_value(state, action, horizon)
        if q_s_a > max_q_val:
            max_q_val = q_s_a
            best_action = action

    return max_q_val, best_action
# Comment list
# Table of contents