def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        # Batch of tabular state ids
        self.inputs_plh = tf.placeholder(tf.int32, shape=[None], name="inputs_plh")

        # Q-value table: one row per state, one column per action
        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.Qs = tf.get_variable(
                'Qs',
                shape=[self.nb_state, self.action_space.n],
                initializer=tf.constant_initializer(self.initial_q_value),
                dtype=tf.float32,
            )
            tf.summary.histogram('Qarray', self.Qs)
            # Q-values of the states fed through inputs_plh
            self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

        # Action selection: UCB when enabled in the config, epsilon-greedy otherwise
        policy_scope = tf.VariableScope(reuse=False, name='Policy')
        with tf.variable_scope(policy_scope):
            if 'UCB' in self.config and self.config['UCB']:
                self.actions_t, self.probs_t = capacities.tabular_UCB(
                    self.Qs, self.inputs_plh
                )
            else:
                self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                    self.inputs_plh, self.q_preds_t, self.nb_state,
                    self.env.action_space.n, self.N0, self.min_eps
                )
            self.action_t = self.actions_t[0]
            self.q_value_t = self.q_preds_t[0, self.action_t]

        # Tabular update of Qs towards the supplied TD targets, with a decaying learning rate
        learning_scope = tf.VariableScope(reuse=False, name='Learning')
        with tf.variable_scope(learning_scope):
            self.targets_t = tf.placeholder(tf.float32, shape=[None], name="targets_t")
            self.loss, self.train_op = capacities.tabular_learning_with_lr(
                self.lr, self.lr_decay_steps, self.Qs, self.inputs_plh, self.actions_t, self.targets_t
            )

        # Training summaries
        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        # Episode counter and its increment op
        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
Source: tabular_td_0_nstep_agent.py (Python)
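For context, a minimal driving-loop sketch follows. It is not part of the source file: the construction of the agent, the example state and the TD target value are assumptions; only the graph attributes (inputs_plh, action_t, targets_t, train_op, loss) come from the method above.

import tensorflow as tf

# Assumes `agent` is an already constructed instance exposing the attributes used
# in build_graph (random_seed, nb_state, config, lr, ...); its construction is hypothetical.
graph = agent.build_graph(tf.Graph())

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())

    state = 0  # hypothetical tabular state id
    # Select an action for the current state (epsilon-greedy or UCB, per config)
    action = sess.run(agent.action_t, feed_dict={agent.inputs_plh: [state]})

    # After computing an n-step TD target externally, apply the tabular update
    td_target = 1.0  # hypothetical target value
    _, loss = sess.run(
        [agent.train_op, agent.loss],
        feed_dict={agent.inputs_plh: [state], agent.targets_t: [td_target]},
    )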