def sampleAction(self, states):
    """Choose an action for ``states`` using epsilon-greedy stochastic sampling.

    With probability ``self.exploration`` a uniformly random action index in
    ``[0, self.num_actions - 1]`` is returned. Otherwise ``self.action_scores``
    is evaluated through ``self.session`` with ``states`` fed to
    ``self.states``; the first row of the result is converted to
    probabilities with a softmax and a single action is drawn from them.

    Args:
        states: value fed to ``self.states`` when evaluating the scores.

    Returns:
        Index of the selected action.
    """
    # TODO: use this code piece when tf.multinomial gets better
    # sample action from current policy
    # actions = self.session.run(self.predicted_actions, {self.states: states})[0]
    # return actions[0]

    # temporary workaround
    def softmax(y):
        """Turn unnormalized log-probabilities into a probability vector."""
        # Work in float64: np.random.multinomial sums pvals in double
        # precision, and a lower-precision softmax can add up to slightly
        # more than 1, which it rejects.
        y = np.asarray(y, dtype=np.float64)
        # Subtract the max before exponentiating so exp() cannot overflow.
        e = np.exp(y - np.amax(y))
        return e / np.sum(e)

    # epsilon-greedy exploration strategy
    if random.random() < self.exploration:
        return random.randint(0, self.num_actions - 1)

    action_scores = self.session.run(
        self.action_scores, {self.states: states}
    )[0]
    # No constant is subtracted from the probabilities: doing so drives
    # entries smaller than the constant negative (multinomial raises
    # ValueError) and shifts the removed mass onto the last action.
    action_probs = softmax(action_scores)
    return np.argmax(np.random.multinomial(1, action_probs))
# (web page residue: "Comment list")
# (web page residue: "Article table of contents")