def run(self,
        epochs,
        steps,
        api_key,
        rollouts_per_epoch=20,
        updateTargetNetwork=defaultRunSettings['updateTargetNetwork'],
        explorationRate=defaultRunSettings['explorationRate'],
        miniBatchSize=defaultRunSettings['miniBatchSize'],
        learnStart=defaultRunSettings['learnStart'],
        renderPerXEpochs=defaultRunSettings['renderPerXEpochs'],
        shouldRender=defaultRunSettings['shouldRender'],
        experimentId=defaultRunSettings['experimentId'],
        force=defaultRunSettings['force'],
        upload=defaultRunSettings['upload']):
    # Bookkeeping: rolling window of the last 100 episode scores and a global step counter.
    last100Scores = [0] * 100
    last100ScoresIndex = 0
    last100Filled = False
    stepCounter = 0
    # Start the (legacy) Gym monitor so the experiment is recorded and can be uploaded below.
    if experimentId is not None:
        self.env.monitor.start('/tmp/' + experimentId, force=force)
    for epoch in xrange(epochs):
        # I is presumably the per-episode discount weight used elsewhere in the
        # actor-critic update; it is reset to 1 at the start of every episode.
        I = 1
        observation = self.env.reset()
        for t in xrange(steps):
            # Evaluate the policy network and sample an action from its output distribution.
            policyValues = self.runModel(self.policyModel, observation)
            action = self.selectActionByProbability(policyValues)
            newObservation, reward, done, info = self.env.step(action)
            # Policy cost and its gradients w.r.t. the policy parameters; the print
            # below dumps the symbolic expression of one gradient for debugging.
            cost, grads = self.get_cost_grads(self.policyModel)
            print(theano.pp(grads[1][0]))
            # One-step TD error: delta = r + gamma * V(s') - V(s),
            # with V(s') = 0 when the episode has terminated.
            if done:
                delta = reward - self.runModel(self.valueModel, observation)
            else:
                delta = reward + self.discountFactor * self.runModel(self.valueModel, newObservation) - self.runModel(self.valueModel, observation)
    # After all epochs, close the monitor and optionally upload the recorded results.
    self.env.monitor.close()
    if upload:
        gym.upload('/tmp/' + experimentId, api_key=api_key)
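A minimal usage sketch, assuming this run method belongs to an agent class (here called ActorCriticAgent, an assumed name) that is constructed with a Gym environment and the policy/value models used above; the environment id and all argument values are illustrative, only run()'s signature comes from the code itself.

import gym

# Assumed construction; only the run(...) call reflects the method defined above.
agent = ActorCriticAgent(gym.make('CartPole-v0'))
agent.run(epochs=1000,
          steps=200,
          api_key='YOUR_GYM_API_KEY',            # placeholder API key
          experimentId='actor-critic-cartpole',  # monitor output goes to /tmp/<experimentId>
          force=True,                            # overwrite any previous monitor output
          upload=False)                          # set True to upload results with the API key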