def process_do(self, sub_batch_T, Thetas, cost_queue=None, queue=None, max_action_value_queue=None):
    """Compute gradients, regularized cost, and max next-action value for one mini-batch.

    sub_batch_T is a DataFrame-like batch of transitions with columns:
    'S_A_features', 'A', 'R', "S'_As_features", 'isTerminate'.

    If the queues are omitted, returns [Gradients, cost, max_action_value];
    otherwise puts each result on its respective queue (multiprocess worker mode)
    and returns None.
    """
    # all_S_A_features: shape (sample num, feature num)
    all_S_A_features = np.array([A for A in sub_batch_T['S_A_features'].values]).squeeze(2).T
    all_Q = self.get_Q(all_S_A_features, Thetas)
    # all_y_predict: shape (sample num, 1) — current Q estimates for the taken actions
    all_y_predict = all_Q
    # all_next_S_As_features: shape (sample num, (feature num, num actions))
    all_next_S_As_features = sub_batch_T["S'_As_features"].values
    all_next_Q_max = self.get_next_Q_max(all_next_S_As_features)
    # all_isTerminate: shape (sample num, 1) — boolean mask of terminal transitions
    all_isTerminate = sub_batch_T['isTerminate'][:, np.newaxis]
    # Terminal states have no future return: zero out their max next-Q in place.
    np.place(all_next_Q_max, all_isTerminate, 0)
    all_reward = sub_batch_T['R'][:, np.newaxis]
    # Bellman targets y = r + gamma * max_a' Q(s', a'): shape (sample num, 1)
    all_y = all_reward + self.gamma * all_next_Q_max
    Gradients = self.get_gradients_back_propagate(all_y, all_y_predict, Thetas)
    # L2 regularization term; bias row thetas[0, :] is excluded, per convention.
    # NOTE(review): regularization reads self.Thetas while the gradients use the
    # Thetas argument — confirm these are intended to be the same parameters.
    thetas_sum = 0
    for thetas in self.Thetas:
        thetas_sum += np.square(thetas[1:, :]).sum(0).sum()
    # Mean squared error (halved) plus L2 penalty, averaged over the batch.
    cost = 1 / (2 * len(sub_batch_T)) * \
        (np.square(all_y - all_y_predict).sum(0).sum() + self.lambda_reg * thetas_sum)
    max_action_value = np.max(all_next_Q_max)
    print('Max action value: ', max_action_value)
    # Fix: compare against None with `is`, not `==` (PEP 8; `==` may be
    # overridden by the object, or broadcast if a NumPy array sneaks in).
    if queue is None and cost_queue is None:
        return [Gradients, cost, max_action_value]
    else:
        queue.put(Gradients)
        cost_queue.put(cost)
        max_action_value_queue.put(max_action_value)
# (removed: stray page-navigation text from the source webpage — "评论列表" /
#  "文章目录", i.e. "comment list" / "table of contents" — which was a syntax error)