def add_training_op(self, loss):
    """Build the op that applies one optimizer step for ``loss``.

    The optimizer class is looked up on ``tf.train`` under the name
    ``self.config.optimizer + 'Optimizer'`` and constructed with
    ``self.config.learning_rate``. When ``self.config.gradient_clip`` is
    positive, gradients are clipped by global norm to that value before
    being applied; otherwise they are applied unchanged.

    Side effect: sets ``self.grad_norm`` to the global norm of the
    gradients that are actually applied (i.e. after clipping, if enabled).

    Args:
        loss: Value handed to ``optimizer.compute_gradients``.

    Returns:
        The result of ``optimizer.apply_gradients``.

    Raises:
        AttributeError: If ``tf.train`` has no attribute with that name.
        TypeError: If the attribute is not a ``tf.train.Optimizer`` subclass.
    """
    optimizer_name = self.config.optimizer + 'Optimizer'
    optimizer_class = getattr(tf.train, optimizer_name)
    # Explicit check instead of `assert`, which disappears under `python -O`.
    if not issubclass(optimizer_class, tf.train.Optimizer):
        raise TypeError(
            "tf.train.%s is not a tf.train.Optimizer subclass" % optimizer_name)

    optimizer = optimizer_class(self.config.learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    gradients = [pair[0] for pair in grads_and_vars]
    # Named `variables` so the builtin `vars` is not shadowed.
    variables = [pair[1] for pair in grads_and_vars]

    if self.config.gradient_clip > 0:
        clipped, _ = tf.clip_by_global_norm(gradients,
                                            self.config.gradient_clip)
    else:
        clipped = gradients

    self.grad_norm = tf.global_norm(clipped)
    train_op = optimizer.apply_gradients(zip(clipped, variables))
    return train_op
python类train()的实例源码
base_aligner.py 文件源码
项目:almond-nnparser
作者: Stanford-Mobisocial-IoT-Lab
项目源码
文件源码
阅读 30
收藏 0
点赞 0
评论 0
def start_server_if_distributed(self):
    """Start a server when a cluster is configured.

    Returns:
        A ``(target, device_fn)`` tuple. With a truthy ``self.cluster`` these
        are the new server's ``target`` and the value returned by
        ``tf.train.replica_device_setter`` for this task; otherwise both
        elements are empty strings.
    """
    if not self.cluster:
        # Non-distributed execution: there is nothing to start.
        return ("", "")

    logging.info("%s: Starting trainer within cluster %s.",
                 task_as_string(self.task), self.cluster.as_dict())
    server = start_server(self.cluster, self.task)
    target = server.target
    worker_device = "/job:%s/task:%d" % (self.task.type, self.task.index)
    device_fn = tf.train.replica_device_setter(
        ps_device="/job:ps",
        worker_device=worker_device,
        cluster=self.cluster)
    return (target, device_fn)
def get_meta_filename(self, start_new_model, train_dir):
    """Return the meta graph file of the latest checkpoint, if usable.

    Args:
        start_new_model: When truthy, no checkpoint lookup is attempted.
        train_dir: Directory passed to ``tf.train.latest_checkpoint``.

    Returns:
        ``<latest checkpoint> + ".meta"`` when that file exists. ``None`` when
        ``start_new_model`` is set, no checkpoint is found, or the meta file
        does not exist; each of those cases is logged.
    """
    if start_new_model:
        logging.info("%s: Flag 'start_new_model' is set. Building a new model.",
                     task_as_string(self.task))
        return None

    checkpoint_path = tf.train.latest_checkpoint(train_dir)
    if not checkpoint_path:
        logging.info("%s: No checkpoint file found. Building a new model.",
                     task_as_string(self.task))
        return None

    meta_path = checkpoint_path + ".meta"
    if gfile.Exists(meta_path):
        return meta_path

    logging.info("%s: No meta graph file found. Building a new model.",
                 task_as_string(self.task))
    return None
def start_server(cluster, task):
    """Create a ``tf.train.Server`` for the given task.

    Args:
        cluster: Cluster description; it is wrapped in
            ``tf.train.ClusterSpec`` before being handed to the server.
        task: Object whose ``type`` and ``index`` attributes supply the job
            name and the task index.

    Returns:
        The ``tf.train.Server`` constructed with the "grpc" protocol.

    Raises:
        ValueError: If ``task.type`` is falsy or ``task.index`` is None.
    """
    job_name = task.type
    if not job_name:
        raise ValueError("%s: The task type must be specified." %
                         task_as_string(task))

    task_index = task.index
    if task_index is None:
        raise ValueError("%s: The task index must be specified." %
                         task_as_string(task))

    cluster_spec = tf.train.ClusterSpec(cluster)
    return tf.train.Server(
        cluster_spec,
        protocol="grpc",
        job_name=job_name,
        task_index=task_index)
def start_server_if_distributed(self):
    """Start a server when a cluster is configured.

    Returns:
        A ``(target, device_fn)`` tuple. With a truthy ``self.cluster`` these
        are the new server's ``target`` and the value returned by
        ``tf.train.replica_device_setter`` for this task; otherwise both
        elements are empty strings.
    """
    if not self.cluster:
        # Non-distributed execution: there is nothing to start.
        return ("", "")

    logging.info("%s: Starting trainer within cluster %s.",
                 task_as_string(self.task), self.cluster.as_dict())
    server = start_server(self.cluster, self.task)
    target = server.target
    worker_device = "/job:%s/task:%d" % (self.task.type, self.task.index)
    device_fn = tf.train.replica_device_setter(
        ps_device="/job:ps",
        worker_device=worker_device,
        cluster=self.cluster)
    return (target, device_fn)
def get_meta_filename(self, start_new_model, train_dir):
    """Return the meta graph file of the latest checkpoint, if usable.

    Args:
        start_new_model: When truthy, no checkpoint lookup is attempted.
        train_dir: Directory passed to ``tf.train.latest_checkpoint``.

    Returns:
        ``<latest checkpoint> + ".meta"`` when that file exists. ``None`` when
        ``start_new_model`` is set, no checkpoint is found, or the meta file
        does not exist; each of those cases is logged.
    """
    if start_new_model:
        logging.info("%s: Flag 'start_new_model' is set. Building a new model.",
                     task_as_string(self.task))
        return None

    checkpoint_path = tf.train.latest_checkpoint(train_dir)
    if not checkpoint_path:
        logging.info("%s: No checkpoint file found. Building a new model.",
                     task_as_string(self.task))
        return None

    meta_path = checkpoint_path + ".meta"
    if gfile.Exists(meta_path):
        return meta_path

    logging.info("%s: No meta graph file found. Building a new model.",
                 task_as_string(self.task))
    return None
def start_server(cluster, task):
    """Create a ``tf.train.Server`` for the given task.

    Args:
        cluster: Cluster description; it is wrapped in
            ``tf.train.ClusterSpec`` before being handed to the server.
        task: Object whose ``type`` and ``index`` attributes supply the job
            name and the task index.

    Returns:
        The ``tf.train.Server`` constructed with the "grpc" protocol.

    Raises:
        ValueError: If ``task.type`` is falsy or ``task.index`` is None.
    """
    job_name = task.type
    if not job_name:
        raise ValueError("%s: The task type must be specified." %
                         task_as_string(task))

    task_index = task.index
    if task_index is None:
        raise ValueError("%s: The task index must be specified." %
                         task_as_string(task))

    cluster_spec = tf.train.ClusterSpec(cluster)
    return tf.train.Server(
        cluster_spec,
        protocol="grpc",
        job_name=job_name,
        task_index=task_index)
def __init__(self, cluster, task, train_dir, log_device_placement=True):
    """Creates a Trainer.

    Args:
        cluster: A tf.train.ClusterSpec if the execution is distributed.
            None otherwise.
        task: A TaskSpec describing the job type and the task index.
        train_dir: Stored unchanged as ``self.train_dir``.
        log_device_placement: Forwarded to ``tf.ConfigProto``.

    Raises:
        ValueError: If the task is a "master" with an index greater than 0.
    """
    self.cluster = cluster
    self.task = task
    self.is_master = (task.type == "master" and task.index == 0)
    self.train_dir = train_dir

    # The GPU options were previously built and then dropped; they only take
    # effect when attached to the session config.
    # NOTE(review): assumes FLAGS.gpu holds the memory fraction -- confirm.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu)
    self.config = tf.ConfigProto(
        gpu_options=gpu_options,
        log_device_placement=log_device_placement)

    # Test the task directly: `self.is_master` already implies index == 0, so
    # guarding on it made this check unreachable.
    if (task.type == "master" and task.index is not None
            and task.index > 0):
        # ValueError exists on Python 2 and 3 (StandardError is Python 2
        # only) and the message is now actually %-formatted.
        raise ValueError("%s: Only one replica of master expected" %
                         task_as_string(self.task))
def start_server_if_distributed(self):
    """Start a server when a cluster is configured.

    Returns:
        A ``(target, device_fn)`` tuple. With a truthy ``self.cluster`` these
        are the new server's ``target`` and the value returned by
        ``tf.train.replica_device_setter`` for this task; otherwise both
        elements are empty strings.
    """
    if not self.cluster:
        # Non-distributed execution: there is nothing to start.
        return ("", "")

    logging.info("%s: Starting trainer within cluster %s.",
                 task_as_string(self.task), self.cluster.as_dict())
    server = start_server(self.cluster, self.task)
    target = server.target
    worker_device = "/job:%s/task:%d" % (self.task.type, self.task.index)
    device_fn = tf.train.replica_device_setter(
        ps_device="/job:ps",
        worker_device=worker_device,
        cluster=self.cluster)
    return (target, device_fn)
def get_meta_filename(self, start_new_model, train_dir):
    """Return the meta graph file of the latest checkpoint, if usable.

    Args:
        start_new_model: When truthy, no checkpoint lookup is attempted.
        train_dir: Directory passed to ``tf.train.latest_checkpoint``.

    Returns:
        ``<latest checkpoint> + ".meta"`` when that file exists. ``None`` when
        ``start_new_model`` is set, no checkpoint is found, or the meta file
        does not exist; each of those cases is logged.
    """
    if start_new_model:
        logging.info("%s: Flag 'start_new_model' is set. Building a new model.",
                     task_as_string(self.task))
        return None

    checkpoint_path = tf.train.latest_checkpoint(train_dir)
    if not checkpoint_path:
        logging.info("%s: No checkpoint file found. Building a new model.",
                     task_as_string(self.task))
        return None

    meta_path = checkpoint_path + ".meta"
    if gfile.Exists(meta_path):
        return meta_path

    logging.info("%s: No meta graph file found. Building a new model.",
                 task_as_string(self.task))
    return None
def start_server(cluster, task):
    """Create a ``tf.train.Server`` for the given task.

    Args:
        cluster: Cluster description; it is wrapped in
            ``tf.train.ClusterSpec`` before being handed to the server.
        task: Object whose ``type`` and ``index`` attributes supply the job
            name and the task index.

    Returns:
        The ``tf.train.Server`` constructed with the "grpc" protocol.

    Raises:
        ValueError: If ``task.type`` is falsy or ``task.index`` is None.
    """
    job_name = task.type
    if not job_name:
        raise ValueError("%s: The task type must be specified." %
                         task_as_string(task))

    task_index = task.index
    if task_index is None:
        raise ValueError("%s: The task index must be specified." %
                         task_as_string(task))

    cluster_spec = tf.train.ClusterSpec(cluster)
    return tf.train.Server(
        cluster_spec,
        protocol="grpc",
        job_name=job_name,
        task_index=task_index)
def __init__(self, cluster, task, train_dir, log_device_placement=True):
    """Creates a Trainer.

    Args:
        cluster: A tf.train.ClusterSpec if the execution is distributed.
            None otherwise.
        task: A TaskSpec describing the job type and the task index.
        train_dir: Stored unchanged as ``self.train_dir``.
        log_device_placement: Forwarded to ``tf.ConfigProto``.

    Raises:
        ValueError: If the task is a "master" with an index greater than 0.
    """
    self.cluster = cluster
    self.task = task
    self.is_master = (task.type == "master" and task.index == 0)
    self.train_dir = train_dir
    self.config = tf.ConfigProto(log_device_placement=log_device_placement)

    # Test the task directly: `self.is_master` already implies index == 0, so
    # guarding on it made this check unreachable.
    if (task.type == "master" and task.index is not None
            and task.index > 0):
        # ValueError exists on Python 2 and 3 (StandardError is Python 2
        # only) and the message is now actually %-formatted.
        raise ValueError("%s: Only one replica of master expected" %
                         task_as_string(self.task))
def start_server_if_distributed(self):
    """Start a server when a cluster is configured.

    Returns:
        A ``(target, device_fn)`` tuple. With a truthy ``self.cluster`` these
        are the new server's ``target`` and the value returned by
        ``tf.train.replica_device_setter`` for this task; otherwise both
        elements are empty strings.
    """
    if not self.cluster:
        # Non-distributed execution: there is nothing to start.
        return ("", "")

    logging.info("%s: Starting trainer within cluster %s.",
                 task_as_string(self.task), self.cluster.as_dict())
    server = start_server(self.cluster, self.task)
    target = server.target
    worker_device = "/job:%s/task:%d" % (self.task.type, self.task.index)
    device_fn = tf.train.replica_device_setter(
        ps_device="/job:ps",
        worker_device=worker_device,
        cluster=self.cluster)
    return (target, device_fn)
def get_meta_filename(self, start_new_model, train_dir):
    """Return the meta graph file of the latest checkpoint, if usable.

    Args:
        start_new_model: When truthy, no checkpoint lookup is attempted.
        train_dir: Directory passed to ``tf.train.latest_checkpoint``.

    Returns:
        ``<latest checkpoint> + ".meta"`` when that file exists. ``None`` when
        ``start_new_model`` is set, no checkpoint is found, or the meta file
        does not exist; each of those cases is logged.
    """
    if start_new_model:
        logging.info("%s: Flag 'start_new_model' is set. Building a new model.",
                     task_as_string(self.task))
        return None

    checkpoint_path = tf.train.latest_checkpoint(train_dir)
    if not checkpoint_path:
        logging.info("%s: No checkpoint file found. Building a new model.",
                     task_as_string(self.task))
        return None

    meta_path = checkpoint_path + ".meta"
    if gfile.Exists(meta_path):
        return meta_path

    logging.info("%s: No meta graph file found. Building a new model.",
                 task_as_string(self.task))
    return None
def start_server(cluster, task):
    """Create a ``tf.train.Server`` for the given task.

    Args:
        cluster: Cluster description; it is wrapped in
            ``tf.train.ClusterSpec`` before being handed to the server.
        task: Object whose ``type`` and ``index`` attributes supply the job
            name and the task index.

    Returns:
        The ``tf.train.Server`` constructed with the "grpc" protocol.

    Raises:
        ValueError: If ``task.type`` is falsy or ``task.index`` is None.
    """
    job_name = task.type
    if not job_name:
        raise ValueError("%s: The task type must be specified." %
                         task_as_string(task))

    task_index = task.index
    if task_index is None:
        raise ValueError("%s: The task index must be specified." %
                         task_as_string(task))

    cluster_spec = tf.train.ClusterSpec(cluster)
    return tf.train.Server(
        cluster_spec,
        protocol="grpc",
        job_name=job_name,
        task_index=task_index)
def start_server_if_distributed(self):
    """Start a server when a cluster is configured.

    Returns:
        A ``(target, device_fn)`` tuple. With a truthy ``self.cluster`` these
        are the new server's ``target`` and the value returned by
        ``tf.train.replica_device_setter`` for this task; otherwise both
        elements are empty strings.
    """
    if not self.cluster:
        # Non-distributed execution: there is nothing to start.
        return ("", "")

    logging.info("%s: Starting trainer within cluster %s.",
                 task_as_string(self.task), self.cluster.as_dict())
    server = start_server(self.cluster, self.task)
    target = server.target
    worker_device = "/job:%s/task:%d" % (self.task.type, self.task.index)
    device_fn = tf.train.replica_device_setter(
        ps_device="/job:ps",
        worker_device=worker_device,
        cluster=self.cluster)
    return (target, device_fn)
def get_meta_filename(self, start_new_model, train_dir):
    """Return the meta graph file of the latest checkpoint, if usable.

    Args:
        start_new_model: When truthy, no checkpoint lookup is attempted.
        train_dir: Directory passed to ``tf.train.latest_checkpoint``.

    Returns:
        ``<latest checkpoint> + ".meta"`` when that file exists. ``None`` when
        ``start_new_model`` is set, no checkpoint is found, or the meta file
        does not exist; each of those cases is logged.
    """
    if start_new_model:
        logging.info("%s: Flag 'start_new_model' is set. Building a new model.",
                     task_as_string(self.task))
        return None

    checkpoint_path = tf.train.latest_checkpoint(train_dir)
    if not checkpoint_path:
        logging.info("%s: No checkpoint file found. Building a new model.",
                     task_as_string(self.task))
        return None

    meta_path = checkpoint_path + ".meta"
    if gfile.Exists(meta_path):
        return meta_path

    logging.info("%s: No meta graph file found. Building a new model.",
                 task_as_string(self.task))
    return None
def start_server(cluster, task):
    """Create a ``tf.train.Server`` for the given task.

    Args:
        cluster: Cluster description; it is wrapped in
            ``tf.train.ClusterSpec`` before being handed to the server.
        task: Object whose ``type`` and ``index`` attributes supply the job
            name and the task index.

    Returns:
        The ``tf.train.Server`` constructed with the "grpc" protocol.

    Raises:
        ValueError: If ``task.type`` is falsy or ``task.index`` is None.
    """
    job_name = task.type
    if not job_name:
        raise ValueError("%s: The task type must be specified." %
                         task_as_string(task))

    task_index = task.index
    if task_index is None:
        raise ValueError("%s: The task index must be specified." %
                         task_as_string(task))

    cluster_spec = tf.train.ClusterSpec(cluster)
    return tf.train.Server(
        cluster_spec,
        protocol="grpc",
        job_name=job_name,
        task_index=task_index)
def __init__(self, cluster, task, train_dir, log_device_placement=True):
    """Creates a Trainer.

    Args:
        cluster: A tf.train.ClusterSpec if the execution is distributed.
            None otherwise.
        task: A TaskSpec describing the job type and the task index.
        train_dir: Stored unchanged as ``self.train_dir``.
        log_device_placement: Forwarded to ``tf.ConfigProto``.

    Raises:
        ValueError: If the task is a "master" with an index greater than 0.
    """
    self.cluster = cluster
    self.task = task
    self.is_master = (task.type == "master" and task.index == 0)
    self.train_dir = train_dir
    self.config = tf.ConfigProto(log_device_placement=log_device_placement)

    # Test the task directly: `self.is_master` already implies index == 0, so
    # guarding on it made this check unreachable.
    if (task.type == "master" and task.index is not None
            and task.index > 0):
        # ValueError exists on Python 2 and 3 (StandardError is Python 2
        # only) and the message is now actually %-formatted.
        raise ValueError("%s: Only one replica of master expected" %
                         task_as_string(self.task))
def get_meta_filename(self, start_new_model, train_dir):
    """Return the meta graph file of the latest checkpoint, if usable.

    Args:
        start_new_model: When truthy, no checkpoint lookup is attempted.
        train_dir: Directory passed to ``tf.train.latest_checkpoint``.

    Returns:
        ``<latest checkpoint> + ".meta"`` when that file exists. ``None`` when
        ``start_new_model`` is set, no checkpoint is found, or the meta file
        does not exist; each of those cases is logged.
    """
    if start_new_model:
        logging.info("%s: Flag 'start_new_model' is set. Building a new model.",
                     task_as_string(self.task))
        return None

    checkpoint_path = tf.train.latest_checkpoint(train_dir)
    if not checkpoint_path:
        logging.info("%s: No checkpoint file found. Building a new model.",
                     task_as_string(self.task))
        return None

    meta_path = checkpoint_path + ".meta"
    if gfile.Exists(meta_path):
        return meta_path

    logging.info("%s: No meta graph file found. Building a new model.",
                 task_as_string(self.task))
    return None
def __init__(self, cluster, task, train_dir, log_device_placement=True):
    """Creates a Trainer.

    Args:
        cluster: A tf.train.ClusterSpec if the execution is distributed.
            None otherwise.
        task: A TaskSpec describing the job type and the task index.
        train_dir: Stored unchanged as ``self.train_dir``.
        log_device_placement: Forwarded to ``tf.ConfigProto``.

    Raises:
        ValueError: If the task is a "master" with an index greater than 0.
    """
    self.cluster = cluster
    self.task = task
    self.is_master = (task.type == "master" and task.index == 0)
    self.train_dir = train_dir
    self.config = tf.ConfigProto(log_device_placement=log_device_placement)

    # Test the task directly: `self.is_master` already implies index == 0, so
    # guarding on it made this check unreachable.
    if (task.type == "master" and task.index is not None
            and task.index > 0):
        # ValueError exists on Python 2 and 3 (StandardError is Python 2
        # only) and the message is now actually %-formatted.
        raise ValueError("%s: Only one replica of master expected" %
                         task_as_string(self.task))
def start_server_if_distributed(self):
    """Start a server when a cluster is configured.

    Returns:
        A ``(target, device_fn)`` tuple. With a truthy ``self.cluster`` these
        are the new server's ``target`` and the value returned by
        ``tf.train.replica_device_setter`` for this task; otherwise both
        elements are empty strings.
    """
    if not self.cluster:
        # Non-distributed execution: there is nothing to start.
        return ("", "")

    logging.info("%s: Starting trainer within cluster %s.",
                 task_as_string(self.task), self.cluster.as_dict())
    server = start_server(self.cluster, self.task)
    target = server.target
    worker_device = "/job:%s/task:%d" % (self.task.type, self.task.index)
    device_fn = tf.train.replica_device_setter(
        ps_device="/job:ps",
        worker_device=worker_device,
        cluster=self.cluster)
    return (target, device_fn)
def start_server(cluster, task):
    """Create a ``tf.train.Server`` for the given task.

    Args:
        cluster: Cluster description; it is wrapped in
            ``tf.train.ClusterSpec`` before being handed to the server.
        task: Object whose ``type`` and ``index`` attributes supply the job
            name and the task index.

    Returns:
        The ``tf.train.Server`` constructed with the "grpc" protocol.

    Raises:
        ValueError: If ``task.type`` is falsy or ``task.index`` is None.
    """
    job_name = task.type
    if not job_name:
        raise ValueError("%s: The task type must be specified." %
                         task_as_string(task))

    task_index = task.index
    if task_index is None:
        raise ValueError("%s: The task index must be specified." %
                         task_as_string(task))

    cluster_spec = tf.train.ClusterSpec(cluster)
    return tf.train.Server(
        cluster_spec,
        protocol="grpc",
        job_name=job_name,
        task_index=task_index)
def __init__(self, cluster, task, train_dir, log_device_placement=True):
    """Creates a Trainer.

    Args:
        cluster: A tf.train.ClusterSpec if the execution is distributed.
            None otherwise.
        task: A TaskSpec describing the job type and the task index.
        train_dir: Stored unchanged as ``self.train_dir``.
        log_device_placement: Forwarded to ``tf.ConfigProto``.

    Raises:
        ValueError: If the task is a "master" with an index greater than 0.
    """
    self.cluster = cluster
    self.task = task
    self.is_master = (task.type == "master" and task.index == 0)
    self.train_dir = train_dir
    self.config = tf.ConfigProto(log_device_placement=log_device_placement)

    # Test the task directly: `self.is_master` already implies index == 0, so
    # guarding on it made this check unreachable.
    if (task.type == "master" and task.index is not None
            and task.index > 0):
        # ValueError exists on Python 2 and 3 (StandardError is Python 2
        # only) and the message is now actually %-formatted.
        raise ValueError("%s: Only one replica of master expected" %
                         task_as_string(self.task))
def start_server_if_distributed(self):
    """Start a server when a cluster is configured.

    Returns:
        A ``(target, device_fn)`` tuple. With a truthy ``self.cluster`` these
        are the new server's ``target`` and the value returned by
        ``tf.train.replica_device_setter`` for this task; otherwise both
        elements are empty strings.
    """
    if not self.cluster:
        # Non-distributed execution: there is nothing to start.
        return ("", "")

    logging.info("%s: Starting trainer within cluster %s.",
                 task_as_string(self.task), self.cluster.as_dict())
    server = start_server(self.cluster, self.task)
    target = server.target
    worker_device = "/job:%s/task:%d" % (self.task.type, self.task.index)
    device_fn = tf.train.replica_device_setter(
        ps_device="/job:ps",
        worker_device=worker_device,
        cluster=self.cluster)
    return (target, device_fn)
def start_server(cluster, task):
    """Create a ``tf.train.Server`` for the given task.

    Args:
        cluster: Cluster description; it is wrapped in
            ``tf.train.ClusterSpec`` before being handed to the server.
        task: Object whose ``type`` and ``index`` attributes supply the job
            name and the task index.

    Returns:
        The ``tf.train.Server`` constructed with the "grpc" protocol.

    Raises:
        ValueError: If ``task.type`` is falsy or ``task.index`` is None.
    """
    job_name = task.type
    if not job_name:
        raise ValueError("%s: The task type must be specified." %
                         task_as_string(task))

    task_index = task.index
    if task_index is None:
        raise ValueError("%s: The task index must be specified." %
                         task_as_string(task))

    cluster_spec = tf.train.ClusterSpec(cluster)
    return tf.train.Server(
        cluster_spec,
        protocol="grpc",
        job_name=job_name,
        task_index=task_index)
def get_input_data_tensors(reader,
                           data_pattern,
                           batch_size=256,
                           num_epochs=None):
    """Build the batched training input from files matching a glob pattern.

    Matching files are sorted, fed unshuffled through
    ``tf.train.string_input_producer``, decoded with
    ``reader.prepare_reader`` and batched with ``tf.train.batch``.

    Args:
        reader: Object exposing ``prepare_reader(filename_queue)``.
        data_pattern: Glob pattern passed to ``gfile.Glob``.
        batch_size: Batch size requested from ``tf.train.batch``; the queue
            capacity is four times this value.
        num_epochs: Forwarded to ``tf.train.string_input_producer``.

    Returns:
        The value returned by ``tf.train.batch``.

    Raises:
        IOError: If no file matches ``data_pattern``.
    """
    logging.info("Using batch size of %s for training.", batch_size)
    with tf.name_scope("train_input"):
        files = gfile.Glob(data_pattern)
        if not files:
            raise IOError("Unable to find training files. data_pattern='" +
                          data_pattern + "'.")
        logging.info("Number of training files: %s.", len(files))
        files.sort()
        filename_queue = tf.train.string_input_producer(
            files, num_epochs=num_epochs, shuffle=False)
        training_data = reader.prepare_reader(filename_queue)

        return tf.train.batch(
            training_data,
            batch_size=batch_size,
            # Size the queue from the argument, not the global
            # FLAGS.batch_size, so capacity always tracks the batch size
            # actually requested by the caller.
            capacity=batch_size * 4,
            allow_smaller_final_batch=True,
            enqueue_many=True)
def __init__(self, cluster, task, train_dir, log_device_placement=True):
    """Creates a Trainer.

    Args:
        cluster: A tf.train.ClusterSpec if the execution is distributed.
            None otherwise.
        task: A TaskSpec describing the job type and the task index.
        train_dir: Stored unchanged as ``self.train_dir``.
        log_device_placement: Forwarded to ``tf.ConfigProto``.

    Raises:
        ValueError: If the task is a "master" with an index greater than 0.
    """
    self.cluster = cluster
    self.task = task
    self.is_master = (task.type == "master" and task.index == 0)
    self.train_dir = train_dir
    self.config = tf.ConfigProto(log_device_placement=log_device_placement)

    # Test the task directly: `self.is_master` already implies index == 0, so
    # guarding on it made this check unreachable.
    if (task.type == "master" and task.index is not None
            and task.index > 0):
        # ValueError exists on Python 2 and 3 (StandardError is Python 2
        # only) and the message is now actually %-formatted.
        raise ValueError("%s: Only one replica of master expected" %
                         task_as_string(self.task))
def start_server_if_distributed(self):
    """Start a server when a cluster is configured.

    Returns:
        A ``(target, device_fn)`` tuple. With a truthy ``self.cluster`` these
        are the new server's ``target`` and the value returned by
        ``tf.train.replica_device_setter`` for this task; otherwise both
        elements are empty strings.
    """
    if not self.cluster:
        # Non-distributed execution: there is nothing to start.
        return ("", "")

    logging.info("%s: Starting trainer within cluster %s.",
                 task_as_string(self.task), self.cluster.as_dict())
    server = start_server(self.cluster, self.task)
    target = server.target
    worker_device = "/job:%s/task:%d" % (self.task.type, self.task.index)
    device_fn = tf.train.replica_device_setter(
        ps_device="/job:ps",
        worker_device=worker_device,
        cluster=self.cluster)
    return (target, device_fn)
def get_meta_filename(self, start_new_model, train_dir):
    """Return the meta graph file of the latest checkpoint, if usable.

    Args:
        start_new_model: When truthy, no checkpoint lookup is attempted.
        train_dir: Directory passed to ``tf.train.latest_checkpoint``.

    Returns:
        ``<latest checkpoint> + ".meta"`` when that file exists. ``None`` when
        ``start_new_model`` is set, no checkpoint is found, or the meta file
        does not exist; each of those cases is logged.
    """
    if start_new_model:
        logging.info("%s: Flag 'start_new_model' is set. Building a new model.",
                     task_as_string(self.task))
        return None

    checkpoint_path = tf.train.latest_checkpoint(train_dir)
    if not checkpoint_path:
        logging.info("%s: No checkpoint file found. Building a new model.",
                     task_as_string(self.task))
        return None

    meta_path = checkpoint_path + ".meta"
    if gfile.Exists(meta_path):
        return meta_path

    logging.info("%s: No meta graph file found. Building a new model.",
                 task_as_string(self.task))
    return None