train_agent_chainer.py source code

python

Project: gym-malware    Author: endgameinc
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
from chainerrl import links
from chainerrl.action_value import DiscreteActionValue
from chainerrl.agents import acer
from chainerrl.distribution import SoftmaxDistribution
from chainerrl.initializers import LeCunNormal
from chainerrl.optimizers import rmsprop_async
from chainerrl.replay_buffer import EpisodicReplayBuffer


def create_acer_agent(env):
    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.n

    # Separate policy (pi) and Q-value heads, each a small fully connected network
    model = acer.ACERSeparateModel(
        pi=links.Sequence(
            L.Linear(obs_dim, 1024, initialW=LeCunNormal(1e-3)),
            F.relu,
            L.Linear(1024, 512, initialW=LeCunNormal(1e-3)),
            F.relu,
            L.Linear(512, n_actions, initialW=LeCunNormal(1e-3)),
            SoftmaxDistribution),
        q=links.Sequence(
            L.Linear(obs_dim, 1024, initialW=LeCunNormal(1e-3)),
            F.relu,
            L.Linear(1024, 512, initialW=LeCunNormal(1e-3)),
            F.relu,
            L.Linear(512, n_actions, initialW=LeCunNormal(1e-3)),
            DiscreteActionValue),
    )

    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-2, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(128)
    agent = acer.ACER(
        model, opt,
        gamma=0.95,                    # reward discount factor
        t_max=32,                      # update the model after this many local steps
        replay_buffer=replay_buffer,
        n_times_replay=4,              # number of times experience replay is repeated per update
        replay_start_size=64,          # don't start replay until the buffer holds this many experiences
        disable_online_update=True,    # rely only on the experience replay buffer
        use_trust_region=True,         # enable trust-region policy optimization
        trust_region_delta=0.1,        # delta constraint for the trust-region update
        truncation_threshold=5.0,      # truncate large importance weights
        beta=1e-2,                     # entropy regularization coefficient
        phi=lambda obs: obs.astype(np.float32, copy=False))  # cast observations to float32

    return agent
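
Below is a minimal usage sketch of the factory function. The CartPole-v0 environment id is only a hypothetical stand-in for the project's own malware environment; any Gym environment with a flat Box observation space and a Discrete action space fits the network shapes above. The training loop uses ChainerRL's standard act_and_train / stop_episode_and_train interface.

import gym

env = gym.make('CartPole-v0')   # hypothetical stand-in environment
agent = create_acer_agent(env)

# Train for a handful of episodes with the agent's online training interface
for episode in range(10):
    obs = env.reset()
    reward = 0.0
    done = False
    while not done:
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
    agent.stop_episode_and_train(obs, reward, done)
    print('episode', episode, 'statistics:', agent.get_statistics())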