import numpy as np
from chainer import cuda, optimizers

# DDQN and config are provided by the surrounding project modules.

def backprop_check():
	xp = cuda.cupy if config.use_gpu else np
	duel = DDQN()

	# Build a fixed random mini-batch of two transitions.
	state = xp.random.uniform(-1.0, 1.0, (2, config.rl_agent_history_length * config.ale_screen_channels, config.ale_scaled_screen_size[1], config.ale_scaled_screen_size[0])).astype(xp.float32)
	reward = [1, 0]
	action = [3, 4]
	episode_ends = [0, 0]
	next_state = xp.random.uniform(-1.0, 1.0, (2, config.rl_agent_history_length * config.ale_screen_channels, config.ale_scaled_screen_size[1], config.ale_scaled_screen_size[0])).astype(xp.float32)

	# Separate Adam optimizers for the convolutional and fully connected parts.
	optimizer_conv = optimizers.Adam(alpha=config.rl_learning_rate, beta1=config.rl_gradient_momentum)
	optimizer_conv.setup(duel.conv)
	optimizer_fc = optimizers.Adam(alpha=config.rl_learning_rate, beta1=config.rl_gradient_momentum)
	optimizer_fc.setup(duel.fc)

	for i in xrange(10000):
		optimizer_conv.zero_grads()
		optimizer_fc.zero_grads()
		loss, _ = duel.forward_one_step(state, action, reward, next_state, episode_ends)
		loss.backward()
		optimizer_conv.update()
		optimizer_fc.update()
		# The loss should decrease and the sampled weights should keep changing.
		print loss.data, duel.conv.layer_2.W.data[0, 0, 0, 0], duel.fc.layer_2.W.data[0, 0]
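To run this sanity check standalone, a minimal sketch is shown below; it assumes the project's config module has already been initialized and, if config.use_gpu is set, that a CUDA device is available:

if __name__ == "__main__":
	backprop_check()

If backpropagation is wired correctly, repeatedly updating on the same fixed mini-batch should drive the printed loss toward zero while the sampled conv and fc weights keep changing.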