def test(args, env, agent):
    if args.record:
        if 'env' in vars(args):
            env = wrappers.Monitor(env, './videos/' + args.env + str(time()) + '/')
        else:
            env = wrappers.Monitor(env, './videos/' + str(time()) + '/')
    test_rewards = []
    test_start = time()
    test_steps = 0
    for iteration in range(1, 1 + args.n_test_iter):
        state = env.reset()
        iter_rewards = 0.0
        done = False
        while not done:
            test_steps += 1
            action, _ = agent.forward(state)
            state, reward, done, _ = env.step(action)
            iter_rewards += reward
        test_rewards.append(iter_rewards)
    print_stats('Test', test_rewards, args.n_test_iter,
                time() - test_start, test_steps, 0, agent)
    return test_rewards
Example source code using the Python Monitor() class
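All of the snippets below share the same basic pattern: wrap an environment in gym.wrappers.Monitor (or the Monitor class imported directly), run episodes, and close the environment so that the statistics, manifest and any video files are flushed to disk. A minimal sketch of that pattern, using an arbitrary output directory:

import gym
from gym import wrappers

env = gym.make('CartPole-v0')
# force=True overwrites any previous monitor files in the directory;
# video_callable=False disables video recording entirely.
env = wrappers.Monitor(env, '/tmp/monitor-demo', force=True, video_callable=False)

for episode in range(5):
    observation = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()  # random policy, just to exercise the wrapper
        observation, reward, done, info = env.step(action)

env.close()  # writes the manifest, statistics and any videos to /tmp/monitor-demo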
def __init__(self, env_name, state_builder=ALEStateBuilder(), repeat_action=4, no_op=30, monitoring_path=None):
    assert isinstance(state_builder, StateBuilder), 'state_builder should inherit from StateBuilder'
    assert isinstance(repeat_action, (int, tuple)), 'repeat_action should be int or tuple'
    if isinstance(repeat_action, int):
        assert repeat_action >= 1, "repeat_action should be >= 1"
    elif isinstance(repeat_action, tuple):
        assert len(repeat_action) == 2, 'repeat_action should be a length-2 tuple: (min frameskip, max frameskip)'
        assert repeat_action[0] < repeat_action[1], 'repeat_action[0] should be < repeat_action[1]'

    super(GymEnvironment, self).__init__()

    self._state_builder = state_builder
    self._env = gym.make(env_name)
    self._env.env.frameskip = repeat_action
    self._no_op = max(0, no_op)
    self._done = True

    if monitoring_path is not None:
        self._env = Monitor(self._env, monitoring_path, video_callable=need_record)
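A hypothetical way to construct the GymEnvironment wrapper defined above. The environment id and monitoring path are made up for illustration, and need_record is whatever video-selection callable the surrounding module provides.

# Sketch only: assumes GymEnvironment and ALEStateBuilder from the snippet above are importable.
env = GymEnvironment('Breakout-v0',
                     state_builder=ALEStateBuilder(),
                     repeat_action=(2, 5),                     # random frameskip between 2 and 5
                     no_op=30,                                 # up to 30 no-op steps at episode start
                     monitoring_path='/tmp/breakout-monitor')  # enables the Monitor wrapper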
def __init__(self, master, thread_id, clip_gradients=True):
    super(A3CThread, self).__init__(name=thread_id)
    self.thread_id = thread_id
    self.clip_gradients = clip_gradients
    self.env = make_environment(master.env_name)
    self.master = master
    self.config = master.config
    if thread_id == 0 and self.master.monitor:
        self.env = wrappers.Monitor(self.env, master.monitor_path, force=True, video_callable=(None if self.master.video else False))

    # Only used (and overwritten) by agents that use an RNN
    self.initial_features = None

    # Build actor and critic networks
    with tf.variable_scope("t{}_net".format(self.thread_id)):
        self.action, self.value, self.actor_states, self.critic_states, self.actions_taken, self.losses, self.adv, self.r, self.n_steps = self.build_networks()
        self.sync_net = self.create_sync_net_op()
        inc_step = self.master.global_step.assign_add(self.n_steps)
        self.train_op = tf.group(self.make_trainer(), inc_step)

    # Write the summary of each thread in a different directory
    self.writer = tf.summary.FileWriter(os.path.join(self.master.monitor_path, "thread" + str(self.thread_id)), self.master.session.graph)

    self.runner = RunnerThread(self.env, self, 20, thread_id == 0 and self.master.video)
def __init__(self, env, monitor_path, video=True, **usercfg):
    super(SarsaFA, self).__init__()
    self.env = env
    self.env = wrappers.Monitor(self.env, monitor_path, force=True, video_callable=(None if video else False))
    m = usercfg.get("m", 10)  # Number of tilings
    self.config = dict(
        m=m,
        n_x_tiles=9,
        n_y_tiles=9,
        Lambda=0.9,
        epsilon=0,  # fully greedy in this case
        alpha=(0.05 * (0.5 / m)),
        gamma=1,
        n_iter=1000,
        steps_per_episode=env.spec.tags.get("wrapper_config.TimeLimit.max_episode_steps")  # Maximum number of allowed steps per episode, as determined (for this environment) by the gym library
    )
    self.config.update(usercfg)
    O = env.observation_space
    self.x_low, self.y_low = O.low
    self.x_high, self.y_high = O.high
    self.nA = env.action_space.n
    self.policy = EGreedy(self.config["epsilon"])
    self.function_approximation = TileCoding(self.x_low, self.x_high, self.y_low, self.y_high, m, self.config["n_x_tiles"], self.config["n_y_tiles"], self.nA)
def __init__(self, env, monitor_path, video=True, **usercfg):
    super(Karpathy, self).__init__(**usercfg)
    self.env = wrappers.Monitor(env, monitor_path, force=True, video_callable=(None if video else False))
    self.nA = self.env.action_space.n
    # Default configuration. Can be overwritten using keyword arguments.
    self.config.update(dict(
        # timesteps_per_batch=10000,
        # n_iter=100,
        episode_max_length=env.spec.tags.get("wrapper_config.TimeLimit.max_episode_steps"),
        gamma=0.99,
        learning_rate=0.05,
        batch_size=10,  # Number of episodes after which to adapt gradients
        decay_rate=0.99,  # Used for RMSProp
        n_hidden_units=20,
        draw_frequency=50,  # Draw a plot every 50 episodes
        repeat_n_actions=1
    ))
    self.config.update(usercfg)
    self.build_network()
def __init__(self, env, monitor_path, video=True, **usercfg):
    super(KarpathyCNN, self).__init__(**usercfg)
    self.env = wrappers.Monitor(env, monitor_path, force=True, video_callable=(None if video else False))
    self.nA = env.action_space.n
    self.monitor_path = monitor_path
    # Default configuration. Can be overwritten using keyword arguments.
    self.config.update(
        dict(
            # timesteps_per_batch=10000,
            # n_iter=100,
            n_hidden_units=200,
            learning_rate=1e-3,
            batch_size=10,  # Number of episodes after which to adapt gradients
            gamma=0.99,  # Discount past rewards by a percentage
            decay=0.99,  # Decay of RMSProp optimizer
            epsilon=1e-9,  # Epsilon of RMSProp optimizer
            draw_frequency=50  # Draw a plot every 50 episodes
        )
    )
    self.config.update(usercfg)
    self.build_network()
    if self.config["save_model"]:
        tf.add_to_collection("action", self.action)
        tf.add_to_collection("states", self.states)
        self.saver = tf.train.Saver()
def test_env_reuse():
    with helpers.tempdir() as temp:
        env = gym.make('Autoreset-v0')
        env = Monitor(env, temp)
        env.reset()

        _, _, done, _ = env.step(None)
        assert not done
        _, _, done, _ = env.step(None)
        assert done

        _, _, done, _ = env.step(None)
        assert not done
        _, _, done, _ = env.step(None)
        assert done

        env.close()
def test_steps_limit_restart():
    with helpers.tempdir() as temp:
        env = gym.make('test.StepsLimitCartpole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()

        # Episode has started
        _, _, done, info = env.step(env.action_space.sample())
        assert done == False

        # Limit reached, now we get a done signal and the env resets itself
        _, _, done, info = env.step(env.action_space.sample())
        assert done == True
        assert env.episode_id == 1

        env.close()
def main():
    episodeCount = 20
    stepsPerEpisode = 100

    env = gym.make("CartPole-v0")
    env = wrappers.Monitor(env, "/tmp/cartpole-experiment-1")

    for episode in range(episodeCount):
        observation = env.reset()
        for t in range(stepsPerEpisode):
            env.render()
            print(observation)
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break

    env.close()  # flush the monitor's statistics and video files to disk
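Once the monitor has been closed, the recorded episode statistics can be read back with gym's monitoring helpers, the same load_results call used in the benchmark test further down. A small sketch, assuming the CartPole run above has finished:

from gym import monitoring

results = monitoring.load_results("/tmp/cartpole-experiment-1")
print(results['episode_rewards'])   # total reward of each recorded episode
print(results['episode_lengths'])   # number of steps in each recorded episode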
def main(env_id, policy_file, record, stochastic, extra_kwargs):
    import gym
    from gym import wrappers
    import tensorflow as tf
    from es_distributed.policies import MujocoPolicy
    import numpy as np

    env = gym.make(env_id)
    if record:
        import uuid
        env = wrappers.Monitor(env, '/tmp/' + str(uuid.uuid4()), force=True)

    if extra_kwargs:
        import json
        extra_kwargs = json.loads(extra_kwargs)

    with tf.Session():
        pi = MujocoPolicy.Load(policy_file, extra_kwargs=extra_kwargs)
        while True:
            rews, t = pi.rollout(env, render=True, random_stream=np.random if stochastic else None)
            print('return={:.4f} len={}'.format(rews.sum(), t))
            if record:
                env.close()
                return
def _create_env(self, monitor_dir, record_freq=None, max_episode_steps=None,
                **kwargs):
    monitor_path = os.path.join(self.log_dir, monitor_dir)
    env = gym.make(self.env_name)

    if max_episode_steps is not None:
        env._max_episode_steps = max_episode_steps

    monitored_env = wrappers.Monitor(
        env=env,
        directory=monitor_path,
        resume=True,
        video_callable=lambda x: record_freq is not None and x % record_freq == 0)

    if self.env_wrapper is not None:
        env = self.env_wrapper.wrap_env(monitored_env)
    else:
        env = monitored_env

    return monitored_env, env
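The helper above returns both the monitored environment and the (possibly further wrapped) environment the agent actually interacts with. A hypothetical call site, assuming the owning class exposes log_dir, env_name and env_wrapper as in the snippet:

# Hypothetical usage: record a video every 10th episode and cap episodes at 500 steps.
monitored_env, env = self._create_env('train_monitor', record_freq=10, max_episode_steps=500)

obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())

# Episode statistics (reward, length) are tracked on the inner monitored_env,
# even though the agent only ever sees the outer wrapped env.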
def __init__(self, name,
             log_dir,
             obs_f=None,
             reward_f=None,
             clamp_actions=False,
             monitor=False):
    self._env = gym.make(name)
    self.log_dir = log_dir
    if monitor:
        self._env = wrappers.Monitor(self._env, log_dir, force=True)
    self.obs_f = obs_f
    self.reward_f = reward_f
    self.clamp_actions = clamp_actions
    self.monitor = monitor
def get_env(seed):
    env = gym.make('Pong-ram-v0')
    set_global_seeds(seed)
    env.seed(seed)
    expt_dir = '/tmp/hw3_vid_dir/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
    env = wrap_deepmind_ram(env)
    return env
def get_env(env_id, seed):
    env = gym.make(env_id)
    set_global_seeds(seed)
    env.seed(seed)
    expt_dir = './tmp/hw3_vid_dir2/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
    env = wrap_deepmind(env)
    return env
def get_custom_env(env_id, seed):
    env = gym.make(env_id)
    set_global_seeds(seed)
    env.seed(seed)
    expt_dir = './tmp/hw3_vid_dir2/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
    env = wrap_custom(env)
    return env
def get_env():
    env = gym.make('nesgym/NekketsuSoccerPK-v0')
    env = nesgym.wrap_nes_env(env)
    expt_dir = '/tmp/soccer/'
    env = wrappers.Monitor(env, os.path.join(expt_dir, "gym"), force=True)
    return env
def get_env(task, seed):
    env_id = task.env_id
    env = gym.make(env_id)
    set_global_seeds(seed)
    env.seed(seed)
    expt_dir = '/tmp/hw3_vid_dir2/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
    env = wrap_deepmind(env)
    return env
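The get_env variants above all follow the same recipe: seed the global RNGs and the environment, attach a Monitor under a "gym" subdirectory, then apply the preprocessing wrappers. A hedged, generic sketch of that recipe; set_global_seeds and wrap_deepmind stand in for the helpers the snippets assume, and the directory is arbitrary:

import os.path as osp

import gym
from gym import wrappers

def make_monitored_env(env_id, seed, expt_dir='/tmp/experiment'):
    env = gym.make(env_id)
    set_global_seeds(seed)          # assumed helper: seeds numpy / random / the ML framework
    env.seed(seed)
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
    return wrap_deepmind(env)       # assumed helper: Atari preprocessing wrappers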
def test(self, just_one=True):
    """ This is for test-time evaluation. No training is done here. By
    default, iterate through every snapshot. If `just_one` is true, this
    only runs one set of weights, to ensure that we record right away since
    OpenAI will only record subsets and less frequently. Changing the loop
    over snapshots is also needed.
    """
    os.makedirs(self.args.directory + '/videos')
    self.env = wrappers.Monitor(self.env, self.args.directory + '/videos', force=True)
    headdir = self.args.directory + '/snapshots/'
    snapshots = os.listdir(headdir)
    snapshots.sort()
    num_rollouts = 10
    if just_one:
        num_rollouts = 1

    for sn in snapshots:
        print("\n***** Currently on snapshot {} *****".format(sn))

        ### Add your own criteria here.
        # if "800" not in sn:
        #     continue
        ###

        with open(headdir + sn, 'rb') as f:
            weights = pickle.load(f)
        self.sess.run(self.set_params_op,
                      feed_dict={self.new_weights_v: weights})

        returns = []
        for i in range(num_rollouts):
            returns.append(self._compute_return(test=True))
        print("mean: \t{}".format(np.mean(returns)))
        print("std: \t{}".format(np.std(returns)))
        print("max: \t{}".format(np.max(returns)))
        print("min: \t{}".format(np.min(returns)))
        print("returns:\n{}".format(returns))
def main():
    args = parser.parse_args()
    env = make_environment(args.environment)
    runner = ModelRunner(env, args.model_directory, args.save_directory, n_iter=args.iterations)
    try:
        runner.env = wrappers.Monitor(runner.env, args.save_directory, video_callable=False, force=True)
        runner.run()
    except KeyboardInterrupt:
        pass
def __init__(self, env, monitor_path, video=True, **usercfg):
    super(A2C, self).__init__(**usercfg)
    self.monitor_path = monitor_path
    self.env = wrappers.Monitor(env, monitor_path, force=True, video_callable=(None if video else False))
    self.env_runner = EnvRunner(self.env, self, usercfg)
    self.config.update(dict(
        timesteps_per_batch=10000,
        trajectories_per_batch=10,
        batch_update="timesteps",
        n_iter=100,
        gamma=0.99,
        actor_learning_rate=0.01,
        critic_learning_rate=0.05,
        actor_n_hidden=20,
        critic_n_hidden=20,
        repeat_n_actions=1,
        save_model=False
    ))
    self.config.update(usercfg)

    self.build_networks()
    init = tf.global_variables_initializer()

    # Launch the graph.
    self.session = tf.Session()
    self.session.run(init)
    if self.config["save_model"]:
        tf.add_to_collection("action", self.action)
        tf.add_to_collection("states", self.states)
        self.saver = tf.train.Saver()
    self.rewards = tf.placeholder("float", name="Rewards")
    self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
    summary_actor_loss = tf.summary.scalar("Actor_loss", self.summary_actor_loss)
    summary_critic_loss = tf.summary.scalar("Critic_loss", self.summary_critic_loss)
    summary_rewards = tf.summary.scalar("Rewards", self.rewards)
    summary_episode_lengths = tf.summary.scalar("Episode_lengths", self.episode_lengths)
    self.summary_op = tf.summary.merge([summary_actor_loss, summary_critic_loss, summary_rewards, summary_episode_lengths])
    self.writer = tf.summary.FileWriter(os.path.join(self.monitor_path, "summaries"), self.session.graph)
    return
def __init__(self, env, monitor_path, video=True, **usercfg):
    super(REINFORCE, self).__init__(**usercfg)
    self.env = wrappers.Monitor(env, monitor_path, force=True, video_callable=(None if video else False))
    self.env_runner = EnvRunner(self.env, self, usercfg)
    self.monitor_path = monitor_path
    # Default configuration. Can be overwritten using keyword arguments.
    self.config.update(dict(
        batch_update="timesteps",
        timesteps_per_batch=1000,
        n_iter=100,
        gamma=0.99,  # Discount past rewards by a percentage
        decay=0.9,  # Decay of RMSProp optimizer
        epsilon=1e-9,  # Epsilon of RMSProp optimizer
        learning_rate=0.05,
        n_hidden_units=20,
        repeat_n_actions=1,
        save_model=False
    ))
    self.config.update(usercfg)

    self.build_network()
    self.make_trainer()
    init = tf.global_variables_initializer()

    # Launch the graph.
    self.session = tf.Session()
    self.session.run(init)
    if self.config["save_model"]:
        tf.add_to_collection("action", self.action)
        tf.add_to_collection("states", self.states)
        self.saver = tf.train.Saver()
    self.rewards = tf.placeholder("float", name="Rewards")
    self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
    summary_loss = tf.summary.scalar("Loss", self.summary_loss)
    summary_rewards = tf.summary.scalar("Rewards", self.rewards)
    summary_episode_lengths = tf.summary.scalar("Episode_lengths", self.episode_lengths)
    self.summary_op = tf.summary.merge([summary_loss, summary_rewards, summary_episode_lengths])
    self.writer = tf.summary.FileWriter(os.path.join(self.monitor_path, "task0"), self.session.graph)
def test_no_double_wrapping():
    temp = tempfile.mkdtemp()
    try:
        env = gym.make("FrozenLake-v0")
        env = wrappers.Monitor(env, temp)
        try:
            env = wrappers.Monitor(env, temp)
        except error.DoubleWrapperError:
            pass
        else:
            assert False, "Should not allow double wrapping"
        env.close()
    finally:
        shutil.rmtree(temp)
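Monitor refuses to wrap an environment that already carries a monitor, which is what the test above exercises. If wrapping code cannot be sure whether a Monitor is already attached, one defensive option is to check first. A sketch; note that the isinstance check only detects a Monitor that is the outermost wrapper, and the directory is arbitrary:

import gym
from gym import wrappers

env = gym.make("FrozenLake-v0")
if not isinstance(env, wrappers.Monitor):
    env = wrappers.Monitor(env, "/tmp/frozenlake-monitor", force=True)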
def test():
    benchmark = registration.Benchmark(
        id='MyBenchmark-v0',
        scorer=scoring.ClipTo01ThenAverage(),
        tasks=[
            {'env_id': 'CartPole-v0',
             'trials': 1,
             'max_timesteps': 5
             },
            {'env_id': 'CartPole-v0',
             'trials': 1,
             'max_timesteps': 100,
             }])

    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = wrappers.Monitor(env, directory=temp, video_callable=False)
        env.seed(0)

        env.set_monitor_mode('evaluation')
        rollout(env)

        env.set_monitor_mode('training')
        for i in range(2):
            rollout(env)

        env.set_monitor_mode('evaluation')
        rollout(env, good=True)

        env.close()
        results = monitoring.load_results(temp)
        evaluation_score = benchmark.score_evaluation('CartPole-v0', results['data_sources'], results['initial_reset_timestamps'], results['episode_lengths'], results['episode_rewards'], results['episode_types'], results['timestamps'])
        benchmark_score = benchmark.score_benchmark({
            'CartPole-v0': evaluation_score['scores'],
        })

        assert np.all(np.isclose(evaluation_score['scores'], [0.00089999999999999998, 0.0054000000000000003])), "evaluation_score={}".format(evaluation_score)
        assert np.isclose(benchmark_score, 0.00315), "benchmark_score={}".format(benchmark_score)
def test_monitor_filename():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, directory=temp)
        env.close()

        manifests = glob.glob(os.path.join(temp, '*.manifest.*'))
        assert len(manifests) == 1
def test_write_upon_reset_false():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, directory=temp, video_callable=False, write_upon_reset=False)
        env.reset()

        files = glob.glob(os.path.join(temp, '*'))
        assert not files, "Files: {}".format(files)

        env.close()
        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0
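With write_upon_reset=False, as in the test above, nothing is written to the monitor directory until close() is called. Passing write_upon_reset=True instead flushes the statistics files on every reset, at the cost of extra disk writes per episode. A small sketch of the counterpart behaviour, with an arbitrary directory:

env = gym.make('CartPole-v0')
env = Monitor(env, directory='/tmp/cartpole-wur', video_callable=False, write_upon_reset=True)
env.reset()   # with write_upon_reset=True the stats files already appear on disk here
env.close()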