from keras import backend as K
from keras.layers import Dense, Input, TimeDistributed
from keras.models import Model
from keras.objectives import categorical_crossentropy


def create_model(env, args):
    # observation sequence is the main input
    h = x = Input(shape=(None,) + env.observation_space.shape, name="x")
    # policy network
    for i in range(args.layers):
        h = TimeDistributed(Dense(args.hidden_size, activation=args.activation), name="h%d" % (i + 1))(h)
    p = TimeDistributed(Dense(env.action_space.n, activation='softmax'), name="p")(h)
    # baseline network shares the hidden layers with the policy
    h = TimeDistributed(Dense(args.hidden_size, activation=args.activation), name="hb")(h)
    b = TimeDistributed(Dense(1), name="b")(h)
    # advantage is an additional input, used only inside the policy loss
    A = Input(shape=(None,))

    # policy gradient loss with entropy bonus:
    # advantage-weighted cross-entropy of the sampled actions,
    # minus beta times the entropy of the predicted distribution
    def policy_gradient_loss(l_sampled, l_predicted):
        return K.mean(A * categorical_crossentropy(l_sampled, l_predicted), axis=1) \
            - args.beta * K.mean(categorical_crossentropy(l_predicted, l_predicted), axis=1)

    # inputs to the model are observations and advantages,
    # outputs are action probabilities and baseline
    model = Model(input=[x, A], output=[p, b])
    # policy head uses the custom loss, baseline is optimized with MSE
    model.compile(optimizer=args.optimizer, loss=[policy_gradient_loss, 'mse'])
    # override the optimizer's default learning rate
    K.set_value(model.optimizer.lr, args.optimizer_lr)
    return model
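A minimal usage sketch is shown below, assuming OpenAI Gym and the Keras 1.x API used above. The CartPole environment and the concrete args values are illustrative assumptions; the args fields simply mirror the attributes that create_model reads. The dummy all-zero arrays only demonstrate the expected shapes: observations and advantages are the inputs, while one-hot sampled actions (policy head) and discounted returns (baseline head) are the targets.

import gym
import numpy as np
from argparse import Namespace

env = gym.make("CartPole-v0")
args = Namespace(layers=2, hidden_size=64, activation="tanh",
                 beta=0.01, optimizer="adam", optimizer_lr=0.001)
model = create_model(env, args)

# one dummy update over an episode of T timesteps, shapes are (batch, timesteps, ...)
T = 5
obs = np.zeros((1, T) + env.observation_space.shape)
adv = np.zeros((1, T))
actions = np.zeros((1, T, env.action_space.n))   # one-hot sampled actions
returns = np.zeros((1, T, 1))                    # discounted returns for the baseline
model.train_on_batch([obs, adv], [actions, returns])

# at rollout time only the outputs are needed, so a dummy advantage
# tensor can be passed alongside the observations
probs, baseline = model.predict_on_batch([obs, adv])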