def eps_greedy(self, state, exploration_rate):
    """Choose an action epsilon-greedily, limiting consecutive no-op picks.

    A single ``np.random.uniform()`` draw is compared with
    ``exploration_rate``. If the draw is smaller, a uniformly random index
    into ``config.ale_actions`` is used; otherwise the index of the largest
    entry of ``self.compute_q_variable(state, test=True).data`` is used.
    The index is turned into an action by ``self.get_action_with_index``.

    ``self.no_op_count`` is updated on every call: it is incremented when
    the chosen action equals ``0`` and set back to ``0`` otherwise. Once it
    exceeds ``config.rl_no_op_max`` the chosen action is replaced by a
    random entry of ``config.ale_actions`` other than the smallest one, and
    a message is printed.

    Args:
        state: Network input. It is wrapped in ``Variable`` (and moved to
            the GPU when ``config.use_gpu`` is set) on the greedy branch
            only; the random branch never touches it.
        exploration_rate: Threshold for the uniform draw; larger values
            make the random branch more likely.

    Returns:
        A tuple ``(action, q_max, q_min)``. ``q_max`` and ``q_min`` are the
        largest and smallest entries of the Q output on the greedy branch
        and ``None`` when the action index was drawn at random.
    """
    q_max = None
    q_min = None

    if np.random.uniform() < exploration_rate:
        # Explore: any index into the action table is equally likely.
        action_index = np.random.randint(0, len(config.ale_actions))
    else:
        # Exploit: evaluate the Q function and take the best-scoring index.
        state = Variable(state)
        if config.use_gpu:
            state.to_gpu()
        q = self.compute_q_variable(state, test=True)
        if config.use_gpu:
            # Reduce on the device, then copy only the results to the host.
            action_index = cuda.to_cpu(cuda.cupy.argmax(q.data))
            q_max = cuda.to_cpu(cuda.cupy.max(q.data))
            q_min = cuda.to_cpu(cuda.cupy.min(q.data))
        else:
            action_index = np.argmax(q.data)
            q_max = np.max(q.data)
            q_min = np.min(q.data)

    action = self.get_action_with_index(action_index)

    # Track the current streak of no-op choices (action value 0).
    self.no_op_count = self.no_op_count + 1 if action == 0 else 0
    if self.no_op_count > config.rl_no_op_max:
        # The entry with the smallest value is the one treated as no-op
        # here; every other entry is a candidate replacement.
        no_op_index = np.argmin(np.asarray(config.ale_actions))
        actions_without_no_op = [
            candidate
            for index, candidate in enumerate(config.ale_actions)
            if index != no_op_index
        ]
        action = actions_without_no_op[
            np.random.randint(0, len(actions_without_no_op))
        ]
        # The counter is not reset here, so a further no-op choice on the
        # next call is overridden again until a non-zero action is chosen.
        # Single pre-formatted string: prints the same text whether the
        # file runs with the print statement or the print function.
        print("Reached no_op_max. New action: %s" % (action,))

    return action, q_max, q_min
评论列表
文章目录