def main(argv=None):
    """Train the GAN on zero-padded MNIST digits, profiling every 100th step.

    Each iteration feeds one MNIST batch (reshaped to 28x28, zero-padded to
    32x32 and given a trailing channel axis) together with Gaussian noise,
    then runs one generator update followed by one discriminator update.

    On iterations where ``i % 100 == 99`` both updates run under
    ``FULL_TRACE``; their merged summaries and run metadata are written to
    ``FLAGS.log_dir``.  Whenever ``i % 1000 == 999`` a checkpoint is saved
    under ``FLAGS.ckpt_dir``.

    Args:
        argv: Not used by this function.
    """
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    with tf.device('/gpu:2'):
        real_data, z, opt_g, opt_d = build_graph()
    summary_op = tf.merge_all_summaries()
    saver = tf.train.Saver()
    # Leave the batch axis alone; pad height and width by 2 on each side.
    npad = ((0, 0), (2, 2), (2, 2))
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        summary_writer = tf.train.SummaryWriter(FLAGS.log_dir, sess.graph)
        for i in xrange(FLAGS.max_iter_step):
            train_img = mnist.train.next_batch(FLAGS.batch_size)[0]
            train_img = np.reshape(train_img, (-1, 28, 28))
            train_img = np.pad(train_img, pad_width=npad,
                               mode='constant', constant_values=0)
            train_img = np.expand_dims(train_img, -1)
            batch_z = np.random.normal(
                0, 1.0, [FLAGS.batch_size, FLAGS.z_dim]).astype(np.float32)
            feed_dict = {real_data: train_img, z: batch_z}
            if i % 100 == 99:
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                # Exactly one generator step, as in the untraced branch.
                # Running it twice here would apply an extra update on
                # profiled iterations and throw away the first trace.
                _, merged = sess.run([opt_g, summary_op], feed_dict=feed_dict,
                                     options=run_options,
                                     run_metadata=run_metadata)
                summary_writer.add_summary(merged, i)
                summary_writer.add_run_metadata(
                    run_metadata, 'generator_metadata{}'.format(i), i)
                _, merged = sess.run([opt_d, summary_op], feed_dict=feed_dict,
                                     options=run_options,
                                     run_metadata=run_metadata)
                summary_writer.add_summary(merged, i)
                summary_writer.add_run_metadata(
                    run_metadata, 'discriminator_metadata{}'.format(i), i)
            else:
                sess.run(opt_g, feed_dict=feed_dict)
                sess.run(opt_d, feed_dict=feed_dict)
            if i % 1000 == 999:
                saver.save(sess, os.path.join(
                    FLAGS.ckpt_dir, "model.ckpt"), global_step=i)
# Example source code for the Python class RunMetadata()
def train(session):
    """Train for 10 epochs under full tracing, dump a timeline, print accuracy.

    Every training step is run with ``FULL_TRACE`` options and a single shared
    ``RunMetadata`` object.  After training, the collected step stats are
    written to ``./prof_ctf.json`` in Chrome trace format and the accuracy on
    ``mnist.test`` is printed.

    Uses names that are not defined in this function and must exist at module
    level: ``mnist``, ``train_op``, ``loss``, ``pred``, ``x`` and ``y``.

    Args:
        session: Session used to initialise variables and run training steps.
    """
    batch_size = 200
    session.run(tf.global_variables_initializer())

    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)  # (*)
    # NOTE(review): the same object is passed to every step, so the dump below
    # presumably reflects only the most recent step -- confirm.
    run_metadata = tf.RunMetadata()

    # Training cycle
    for epoch in range(10):
        epoch_loss = 0.0
        # Floor division keeps the step count an int on Python 3 as well;
        # with true division range() would raise a TypeError there.
        batch_steps = mnist.train.num_examples // batch_size
        for step in range(batch_steps):
            batch_x, batch_y = mnist.train.next_batch(batch_size)
            _, c = session.run(
                [train_op, loss],
                feed_dict={x: batch_x, y: batch_y},
                options=run_options, run_metadata=run_metadata  # (*)
            )
            epoch_loss += c / batch_steps
        print("[%s] Epoch %02d, Loss = %.6f"
              % (datetime.now(), epoch, epoch_loss))

    # Dump profiling data (*)
    prof_timeline = tf.python.client.timeline.Timeline(run_metadata.step_stats)
    prof_ctf = prof_timeline.generate_chrome_trace_format()
    with open('./prof_ctf.json', 'w') as fp:
        fp.write(prof_ctf)
        print('Dumped to prof_ctf.json')

    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    test_accuracy = accuracy.eval({x: mnist.test.images, y: mnist.test.labels})
    # A single pre-formatted string prints the same text on Python 2 and 3.
    print("Accuracy: %s" % (test_accuracy,))
def run_training_batch(self, session, batch):
    """
    Run one training step on a single batch and return its loss.

    `batch` is turned into a feed dict by `self.batch_to_feed`; per the
    original notes it holds the input tensors for words, pos, lemmas, preds,
    preds_idx and labels, in that order.  The dropout switch placeholder is
    always fed 1.0, and the loss is fetched together with `self.train_op`.
    """
    feeds = self.batch_to_feed(batch)
    feeds[self.use_dropout_placeholder] = 1.0

    # Profiling is disabled.  To trace this step, pass
    #     options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    #     run_metadata=tf.RunMetadata()
    # to session.run, then write
    #     timeline.Timeline(run_metadata.step_stats)
    #         .generate_chrome_trace_format()
    # to 'timeline.json'.
    batch_loss, _ = session.run([self.loss, self.train_op], feed_dict=feeds)
    return batch_loss
def run_training_batch(self, session, batch):
    """
    Execute a single optimisation step for `batch` and report the loss.

    The batch (input tensors for words, pos, lemmas, preds, preds_idx and
    labels, in that order, according to the original notes) is converted to a
    feed dict with `self.batch_to_feed`.  The dropout switch placeholder is
    set to 1.0 before `self.loss` and `self.train_op` are run together.

    Returns the fetched loss value.
    """
    feed = self.batch_to_feed(batch)
    feed[self.use_dropout_placeholder] = 1.0
    to_fetch = [self.loss, self.train_op]

    # Tracing is switched off here.  To enable it, create
    #     options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    #     run_metadata = tf.RunMetadata()
    # pass both to session.run, and save
    #     timeline.Timeline(run_metadata.step_stats)
    #         .generate_chrome_trace_format()
    # into 'timeline.json'.
    step_result = session.run(to_fetch, feed_dict=feed)
    step_loss, _ = step_result
    return step_loss
def run_step(self):
    """Run `self.train_op` once in `self.sess`; nothing is returned."""
    session, step_op = self.sess, self.train_op
    session.run(step_op)
    # Disabled one-shot profiling recipe:
    #   run_metadata = tf.RunMetadata()
    #   self.sess.run([self.train_op],
    #                 options=tf.RunOptions(
    #                     trace_level=tf.RunOptions.FULL_TRACE),
    #                 run_metadata=run_metadata)
    #   from tensorflow.python.client import timeline
    #   trace = timeline.Timeline(step_stats=run_metadata.step_stats)
    #   trace_file = open('timeline.ctf.json', 'w')
    #   trace_file.write(trace.generate_chrome_trace_format())
    #   import sys; sys.exit()
def trace(config, sess, model, train_data):
    """Fit one random batch under FULL_TRACE and export the run metadata.

    The metadata is attached to ``train_writer`` under the tag
    ``'step%d' % step`` and is also written to ``timeline.json`` in Chrome
    trace format.

    ``learning_rate``, ``train_writer`` and ``step`` are neither parameters
    nor locals of this function, so they have to be supplied by the enclosing
    module.  ``sess`` is accepted but never used.

    Args:
        config: Object whose ``batch_size`` is forwarded to ``random_batch``.
        sess: Unused.
        model: Object exposing ``batch_fit``.
        train_data: Sequence unpacked as the leading ``random_batch`` args.
    """
    metadata = tf.RunMetadata()
    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

    batch_x, batch_q, batch_y = random_batch(*train_data, config.batch_size)
    model.batch_fit(batch_x, batch_q, batch_y, learning_rate, options, metadata)
    train_writer.add_run_metadata(metadata, 'step%d' % step)

    # The timeline module is only needed once a trace has been collected.
    from tensorflow.python.client import timeline
    chrome_trace = timeline.Timeline(
        metadata.step_stats).generate_chrome_trace_format()
    with open('timeline.json', 'w') as out:
        out.write(chrome_trace)
def benchmark_one_step(sess,
                       fetches,
                       step,
                       batch_size,
                       step_train_times,
                       trace_filename,
                       image_producer,
                       params,
                       summary_op=None):
    """Run one benchmark step, record its duration, and optionally log/trace.

    Args:
        sess: Session-like object whose ``run`` accepts ``options`` and
            ``run_metadata`` keywords.
        fetches: What to run; the result is indexed by name
            (``'total_loss'``, ``'top_1_accuracy'``, ...).
        step: Step number.  ``-1`` together with a truthy ``trace_filename``
            turns on a full trace; negative steps are never logged.
        batch_size: Forwarded to ``get_perf_timing_str`` when logging.
        step_train_times: List that receives this step's wall-clock seconds.
        trace_filename: Destination for the Chrome trace, or a falsy value.
        image_producer: Notified via ``notify_image_consumption`` after the
            step has run.
        params: Provides ``forward_only`` and ``display_every``.
        summary_op: Optional extra op fetched alongside ``fetches``.

    Returns:
        The value fetched for ``summary_op``, or ``None`` when it was not
        supplied.
    """
    tracing = bool(trace_filename and step == -1)
    run_kwargs = {
        'options': (tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    if tracing else None),
        'run_metadata': tf.RunMetadata() if tracing else None,
    }

    summary_str = None
    start_time = time.time()
    if summary_op is not None:
        results, summary_str = sess.run([fetches, summary_op], **run_kwargs)
    else:
        results = sess.run(fetches, **run_kwargs)

    # Forward-only runs have no loss to report.
    lossval = 0. if params.forward_only else results['total_loss']
    image_producer.notify_image_consumption()
    step_train_times.append(time.time() - start_time)

    # Log the first step and then every `display_every`-th one.
    if step >= 0 and (step == 0 or (step + 1) % params.display_every == 0):
        timing = get_perf_timing_str(batch_size, step_train_times)
        log_str = '%i\t%s\t%.3f' % (step + 1, timing, lossval)
        if 'top_1_accuracy' in results:
            accuracies = (results['top_1_accuracy'], results['top_5_accuracy'])
            log_str += '\t%.3f\t%.3f' % accuracies
        log_fn(log_str)

    if tracing:
        log_fn('Dumping trace to %s' % trace_filename)
        trace = timeline.Timeline(
            step_stats=run_kwargs['run_metadata'].step_stats)
        with gfile.Open(trace_filename, 'w') as trace_file:
            trace_file.write(
                trace.generate_chrome_trace_format(show_memory=True))
    return summary_str
def train(self, nIter, machine=None, summary_op=None):
    """Run the training loop inside a ``tf.train.Supervisor`` session.

    For each of ``self.arch['training']['max_iter']`` steps the op
    ``self.opt['d']`` is run ``self.arch['training']['nIterD']`` times and
    then ``self.opt['g']`` once.  The loop ends early when the supervisor
    asks to stop.  ``self._refresh_status`` is scheduled through ``sv.loop``
    with a 60-second timer interval.

    ``nIter``, ``machine`` and ``summary_op`` are not used in this variant,
    which also keeps validation and image output disabled.
    """
    # Validation output is switched off in this variant:
    #   Xh = self._validate(machine=machine, n=10)

    # Created but not passed to any run call below.
    run_metadata = tf.RunMetadata()

    # Supervisor options left at their defaults: summary_writer, summary_op,
    # is_chief, save_model_secs.
    supervisor = tf.train.Supervisor(
        logdir=self.dirs['logdir'],
        global_step=self.opt['global_step'])

    gpu_options = tf.GPUOptions(allow_growth=True)
    session_config = tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=gpu_options)

    with supervisor.managed_session(config=session_config) as sess:
        supervisor.loop(60, self._refresh_status, (sess,))
        for step in range(self.arch['training']['max_iter']):
            if supervisor.should_stop():
                break

            # Several 'd' updates, then a single 'g' update.
            for _ in range(self.arch['training']['nIterD']):
                sess.run(self.opt['d'])
            sess.run(self.opt['g'])

            # Image output (disabled): every 1000 steps, run Xh and write the
            # result to 'img-anime-{:03d}k.png'.format(step // 1000) inside
            # self.dirs['logdir'] through tf.gfile.GFile(..., mode='wb').
def train(self, nIter, machine=None, summary_op=None):
    """Run the training loop and periodically write a validation image.

    For each of ``self.arch['training']['max_iter']`` steps the op
    ``self.opt['d']`` is run ``self.arch['training']['nIterD']`` times and
    then ``self.opt['g']`` once.  The loop ends early when the supervisor
    asks to stop.  Whenever ``step % 1000 == 0`` the tensor returned by
    ``self._validate(machine=machine, n=10)`` is evaluated and its value is
    written, in binary mode, to ``img-anime-XXXk.png`` in the log directory.

    ``nIter`` and ``summary_op`` are not used.
    """
    validation_fetch = self._validate(machine=machine, n=10)

    # Created but not passed to any run call below.
    run_metadata = tf.RunMetadata()

    # Supervisor options left at their defaults: summary_writer, summary_op,
    # is_chief, save_model_secs.
    supervisor = tf.train.Supervisor(
        logdir=self.dirs['logdir'],
        global_step=self.opt['global_step'])

    gpu_options = tf.GPUOptions(allow_growth=True)
    session_config = tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=gpu_options)

    with supervisor.managed_session(config=session_config) as sess:
        supervisor.loop(60, self._refresh_status, (sess,))
        for step in range(self.arch['training']['max_iter']):
            if supervisor.should_stop():
                break

            # Several 'd' updates, then a single 'g' update.
            for _ in range(self.arch['training']['nIterD']):
                sess.run(self.opt['d'])
            sess.run(self.opt['g'])

            if step % 1000 != 0:
                continue

            # Output image for this thousand-step mark.
            rendered = sess.run(validation_fetch)
            image_path = os.path.join(
                self.dirs['logdir'],
                'img-anime-{:03d}k.png'.format(step // 1000),
            )
            with tf.gfile.GFile(image_path, mode='wb') as fp:
                fp.write(rendered)
def load_data(self, sess, inputs, full_trace=False):
    """Bulk loads the specified inputs into device memory.

    The shape of the inputs must conform to the shapes of the input
    placeholders this optimizer was constructed with.

    The data is split equally across all the devices. If the data is not
    evenly divisible by the batch size, excess data will be discarded.

    When `full_trace` is set, the load is run under FULL_TRACE and the
    resulting Chrome trace is written to `timeline-load.json` inside
    `self.logdir`.

    Args:
        sess: TensorFlow session.
        inputs: List of Tensors matching the input placeholders specified
            at construction time of this optimizer.
        full_trace: Whether to profile data loading.

    Returns:
        The number of tuples loaded per device.
    """
    feed_dict = {}
    assert len(self.input_placeholders) == len(inputs)
    for ph, arr in zip(self.input_placeholders, inputs):
        truncated_arr = make_divisible_by(arr, self.batch_size)
        feed_dict[ph] = truncated_arr
        # Only the length of the last input processed is kept.
        truncated_len = len(truncated_arr)

    if full_trace:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    else:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.NO_TRACE)
    run_metadata = tf.RunMetadata()

    sess.run(
        [t.init_op for t in self._towers],
        feed_dict=feed_dict,
        options=run_options,
        run_metadata=run_metadata)
    if full_trace:
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        trace_path = os.path.join(self.logdir, "timeline-load.json")
        # Context manager guarantees the trace is flushed and the handle
        # released even if serialisation fails part-way.
        with open(trace_path, "w") as trace_file:
            trace_file.write(trace.generate_chrome_trace_format())

    tuples_per_device = truncated_len / len(self.devices)
    assert tuples_per_device > 0, \
        "Too few tuples per batch, trying increasing the training " \
        "batch size or decreasing the sgd batch size. Tried to split up " \
        "{} rows {}-ways in batches of {} (total across devices).".format(
            len(arr), len(self.devices), self.batch_size)
    assert tuples_per_device % self.per_device_batch_size == 0
    return tuples_per_device
def optimize(self, sess, batch_index, extra_ops=[], extra_feed_dict={},
             file_writer=None):
    """Run a single step of SGD.

    Runs a SGD step over a slice of the preloaded batch with size given by
    self.per_device_batch_size and offset given by the batch_index
    argument.

    Updates shared model weights based on the averaged per-device
    gradients.

    When `file_writer` is given, the step is run under FULL_TRACE, the
    Chrome trace is written to `timeline-sgd.json` inside `self.logdir`,
    and the run metadata is added to the writer under the tag
    `sgd_train_<batch_index>`.

    Args:
        sess: TensorFlow session.
        batch_index: Offset into the preloaded data. This value must be
            between `0` and `tuples_per_device`. The amount of data to
            process is always fixed to `per_device_batch_size`.
        extra_ops: Extra ops to run with this step (e.g. for metrics).
        extra_feed_dict: Extra args to feed into this session run.
        file_writer: If specified, tf metrics will be written out using
            this.

    Returns:
        The outputs of extra_ops evaluated over the batch.
    """
    # The mutable defaults in the signature are only read, never modified:
    # a fresh list and a fresh dict are built below.
    if file_writer:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    else:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.NO_TRACE)
    run_metadata = tf.RunMetadata()

    feed_dict = {self._batch_index: batch_index}
    feed_dict.update(extra_feed_dict)
    outs = sess.run(
        [self._train_op] + extra_ops,
        feed_dict=feed_dict,
        options=run_options,
        run_metadata=run_metadata)

    if file_writer:
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        trace_path = os.path.join(self.logdir, "timeline-sgd.json")
        # Context manager guarantees the trace is flushed and the handle
        # released; this runs on every traced step, so a leak would add up.
        with open(trace_path, "w") as trace_file:
            trace_file.write(trace.generate_chrome_trace_format())
        file_writer.add_run_metadata(
            run_metadata, "sgd_train_{}".format(batch_index))

    # Drop the train op's own output; callers only get the extra ops.
    return outs[1:]
def main():
    """Train the label-conditioned GAN on padded MNIST, tracing periodically.

    The graph is built on ``/gpu:1`` and run in a session with soft placement
    enabled.  Each iteration feeds images (zero-padded from 28x28 to 32x32,
    with a trailing channel axis), their labels and uniform noise, then runs
    the generator op twice and the discriminator op once.

    When ``i % 100 == 99`` those three passes run under ``FULL_TRACE`` and
    each writes its summary and run metadata to ``FLAGS.log_dir``.  When
    ``i % 1000 == 999`` a checkpoint is saved under ``FLAGS.ckpt_dir``.
    """
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    with tf.device('/gpu:1'):
        (g_loss_sum, d_loss_sum, img_sum,
         opt_g, opt_d, z, real_data) = build_graph()
    summary_g = tf.merge_summary([g_loss_sum, img_sum])
    summary_d = tf.merge_summary([d_loss_sum, img_sum])
    saver = tf.train.Saver()

    # Batch axis untouched; height and width padded by 2 on each side.
    spatial_pad = ((0, 0), (2, 2), (2, 2))
    # (train op, summary op, run-metadata tag template) per traced pass.
    traced_passes = (
        (opt_g, summary_g, 'generator_metadata {}'),
        (opt_g, summary_g, 'second_generator_metadata {}'),
        (opt_d, summary_d, 'discriminator_metadata {}'),
    )
    session_config = tf.ConfigProto(allow_soft_placement=True)

    with tf.Session(config=session_config) as sess:
        sess.run(tf.initialize_all_variables())
        summary_writer = tf.train.SummaryWriter(FLAGS.log_dir, sess.graph)
        for i in xrange(FLAGS.max_iter_step):
            batch = mnist.train.next_batch(FLAGS.batch_size)
            padded = np.pad(np.reshape(batch[0], (-1, 28, 28)),
                            pad_width=spatial_pad,
                            mode='constant', constant_values=0)
            train_img = np.expand_dims(padded, -1)
            batch_z = np.random.uniform(
                -1, 1, [FLAGS.batch_size, FLAGS.z_dim]).astype(np.float32)
            feed_dict = {real_data[0]: train_img, z: batch_z,
                         real_data[1]: batch[1]}

            if i % 100 == 99:
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                for train_op, summary, tag in traced_passes:
                    _, merged = sess.run([train_op, summary],
                                         feed_dict=feed_dict,
                                         options=run_options,
                                         run_metadata=run_metadata)
                    summary_writer.add_summary(merged, i)
                    summary_writer.add_run_metadata(
                        run_metadata, tag.format(i), i)
            else:
                for train_op in (opt_g, opt_g, opt_d):
                    sess.run(train_op, feed_dict=feed_dict)

            if i % 1000 == 999:
                checkpoint_prefix = os.path.join(FLAGS.ckpt_dir, "model.ckpt")
                saver.save(sess, checkpoint_prefix, global_step=i)
def train():
    """Train the ABC-CNN model, tracing and evaluating on every iteration.

    Loads the question/answer data from ``'data'``, builds the model and the
    training/validation batch generators, then loops for
    ``(len(vqa['training']) // batch_size) * 10 - 1`` iterations.  Each
    iteration runs the optimizer on a training batch, re-evaluates summary,
    loss and accuracy on that batch under ``FULL_TRACE``, evaluates loss and
    accuracy on a validation batch, writes the run metadata and summary, and
    prints one progress line.

    NOTE(review): ``sub_vqa`` (a 10-example subset "for over-fitting") is
    built but both loaders are given the full ``vqa`` -- confirm intent.
    """
    batch_size = 10
    print("Starting ABC-CNN training")
    vqa = dl.load_questions_answers('data')

    # Create subset of data for over-fitting
    sub_vqa = {
        'training': vqa['training'][:10],
        'validation': vqa['validation'][:10],
        'answer_vocab': vqa['answer_vocab'],
        'question_vocab': vqa['question_vocab'],
        'max_question_length': vqa['max_question_length'],
    }

    train_size = len(vqa['training'])
    max_itr = (train_size // batch_size) * 10

    with tf.Session() as sess:
        image, ques, ans, optimizer, loss, accuracy = abc.model(
            sess, batch_size)
        print("Defined ABC model")

        train_loader = util.get_batch(sess, vqa, batch_size, 'training')
        print("Created train dataset generator")

        valid_loader = util.get_batch(sess, vqa, batch_size, 'validation')
        print("Created validation dataset generator")

        writer = abc.write_tensorboard(sess)
        init = tf.global_variables_initializer()
        merged = tf.summary.merge_all()
        sess.run(init)
        print("Initialized Tensor variables")

        progress_format = ("Iteration:%d\tTraining Loss:%f\t"
                           "Training Accuracy:%f\tValidation Loss:%f\t"
                           "Validation Accuracy:%f")
        itr = 1
        while itr < max_itr:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()

            _, vgg_batch, ques_batch, answer_batch = train_loader.next()
            (_, valid_vgg_batch,
             valid_ques_batch, valid_answer_batch) = valid_loader.next()

            train_feed = {image: vgg_batch,
                          ques: ques_batch,
                          ans: answer_batch}
            valid_feed = {image: valid_vgg_batch,
                          ques: valid_ques_batch,
                          ans: valid_answer_batch}

            # Update step first, then a traced evaluation on the same batch.
            sess.run(optimizer, feed_dict=train_feed)
            train_summary, train_loss, train_accuracy = sess.run(
                [merged, loss, accuracy],
                feed_dict=train_feed,
                options=run_options,
                run_metadata=run_metadata)
            valid_loss, valid_accuracy = sess.run(
                [loss, accuracy], feed_dict=valid_feed)

            writer.add_run_metadata(run_metadata, 'step%03d' % itr)
            writer.add_summary(train_summary, itr)
            writer.flush()

            print(progress_format % (
                itr, train_loss, 100.*train_accuracy,
                valid_loss, 100.*valid_accuracy))
            itr += 1
def train_step(sess, train_op, global_step, train_step_kwargs):
    """Take one gradient step and report whether training should stop.

    With ``FLAGS.iter_size == 1`` a single ``train_op`` is run, optionally
    under ``FULL_TRACE`` (when ``FLAGS.profile_iterations`` is set), in which
    case a Chrome trace named ``timeline_<global step>.json`` is written to
    ``FLAGS.train_dir``.  Otherwise ``train_op`` is indexed: entries
    ``0 .. iter_size - 2`` are run on their own and the last entry is run
    together with ``global_step`` to obtain the loss.

    Args:
      sess: The current session.
      train_op: The op to run, or an indexable collection of ops when
        ``FLAGS.iter_size > 1``.
      global_step: A `Tensor` representing the global training step.
      train_step_kwargs: A dictionary of keyword arguments; the optional
        keys 'should_log' and 'should_stop' are evaluated in the session.

    Returns:
      The total loss and a boolean indicating whether or not to stop training.
    """
    start_time = time.time()

    if FLAGS.iter_size != 1:
        for j in range(FLAGS.iter_size-1):
            sess.run([train_op[j]])
        total_loss, np_global_step = sess.run(
            [train_op[FLAGS.iter_size-1], global_step])
    elif FLAGS.profile_iterations:
        # Debugging hint: to inspect specific endpoint values, point the
        # train file at a single image and break here with
        # `import pdb; pdb.set_trace()`.
        trace_options = tf.RunOptions(
            trace_level=tf.RunOptions.FULL_TRACE)
        trace_metadata = tf.RunMetadata()
        total_loss, np_global_step = sess.run([train_op, global_step],
                                              options=trace_options,
                                              run_metadata=trace_metadata)
        chrome_trace = timeline.Timeline(
            trace_metadata.step_stats).generate_chrome_trace_format()
        trace_path = os.path.join(FLAGS.train_dir,
                                  'timeline_%08d.json' % np_global_step)
        with open(trace_path, 'w') as f:
            f.write(chrome_trace)
    else:
        total_loss, np_global_step = sess.run([train_op, global_step])

    time_elapsed = time.time() - start_time

    if ('should_log' in train_step_kwargs
            and sess.run(train_step_kwargs['should_log'])):
        logging.info('%s: global step %d: loss = %.4f (%.2f sec)',
                     datetime.now(), np_global_step, total_loss, time_elapsed)

    should_stop = (sess.run(train_step_kwargs['should_stop'])
                   if 'should_stop' in train_step_kwargs else False)
    return total_loss, should_stop