def load_language(app, tokenizer_service, tag, model_dir):
config = Config.load(['./default.conf', './default.' + tag + '.conf', os.path.join(model_dir, 'model.conf')])
model = create_model(config)
graph = tf.Graph()
session = tf.Session(graph=graph)
with graph.as_default():
# Force everything to run on CPU, we run on single inputs so there is not much point
# on going through the GPU
with tf.device('/cpu:0'):
model.build()
loader = tf.train.Saver()
with session.as_default():
loader.restore(session, os.path.join(model_dir, 'best'))
tokenizer = Tokenizer(tokenizer_service, tag)
app.add_language(tag, LanguageContext(tag, tokenizer, session, config, model))
print('Loaded language ' + tag)
python类device()的实例源码
run_server.py 文件源码
项目:almond-nnparser
作者: Stanford-Mobisocial-IoT-Lab
项目源码
文件源码
阅读 34
收藏 0
点赞 0
评论 0
def __init__(self, config={}, device="/gpu:0"):
config = hc.Config(config)
dtype = config.dtype or "float32"
initializer = config.initializer or 'orthogonal'
orthogonal_gain = config.orthogonal_gain or 1.0
random_stddev = config.random_stddev or 0.02
self.dtype = self.parse_dtype(dtype)
self.scope_count = 0
self.description = ''
self.weights = []
self.biases = []
self.device = config.device
self.initialized = False
self._reuse = False
if initializer == 'orthogonal':
self.initializer = self.orthogonal_initializer(orthogonal_gain)
else:
self.initializer = self.random_initializer(random_stddev)
def variable_on_worker_level(name, shape, initializer):
r'''
Next we concern ourselves with graph creation.
However, before we do so we must introduce a utility function ``variable_on_worker_level()``
used to create a variable in CPU memory.
'''
# Use the /cpu:0 device on worker_device for scoped operations
if len(FLAGS.ps_hosts) == 0:
device = worker_device
else:
device = tf.train.replica_device_setter(worker_device=worker_device, cluster=cluster)
with tf.device(device):
# Create or get apropos variable
var = tf.get_variable(name=name, shape=shape, initializer=initializer)
return var
def create_optimizer():
optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate,
beta1=FLAGS.beta1,
beta2=FLAGS.beta2,
epsilon=FLAGS.epsilon)
return optimizer
# Towers
# ======
# In order to properly make use of multiple GPU's, one must introduce new abstractions,
# not present when using a single GPU, that facilitate the multi-GPU use case.
# In particular, one must introduce a means to isolate the inference and gradient
# calculations on the various GPU's.
# The abstraction we intoduce for this purpose is called a 'tower'.
# A tower is specified by two properties:
# * **Scope** - A scope, as provided by `tf.name_scope()`,
# is a means to isolate the operations within a tower.
# For example, all operations within 'tower 0' could have their name prefixed with `tower_0/`.
# * **Device** - A hardware device, as provided by `tf.device()`,
# on which all operations within the tower execute.
# For example, all operations of 'tower 0' could execute on the first GPU `tf.device('/gpu:0')`.
def variable_on_worker_level(name, shape, initializer):
r'''
Next we concern ourselves with graph creation.
However, before we do so we must introduce a utility function ``variable_on_worker_level()``
used to create a variable in CPU memory.
'''
# Use the /cpu:0 device on worker_device for scoped operations
if len(FLAGS.ps_hosts) == 0:
device = worker_device
else:
device = tf.train.replica_device_setter(worker_device=worker_device, cluster=cluster)
with tf.device(device):
# Create or get apropos variable
var = tf.get_variable(name=name, shape=shape, initializer=initializer)
return var
def variable_on_worker_level(name, shape, initializer):
r'''
Next we concern ourselves with graph creation.
However, before we do so we must introduce a utility function ``variable_on_worker_level()``
used to create a variable in CPU memory.
'''
# Use the /cpu:0 device on worker_device for scoped operations
if len(FLAGS.ps_hosts) == 0:
device = worker_device
else:
device = tf.train.replica_device_setter(worker_device=worker_device, cluster=cluster)
with tf.device(device):
# Create or get apropos variable
var = tf.get_variable(name=name, shape=shape, initializer=initializer)
return var
def create_optimizer():
optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate,
beta1=FLAGS.beta1,
beta2=FLAGS.beta2,
epsilon=FLAGS.epsilon)
return optimizer
# Towers
# ======
# In order to properly make use of multiple GPU's, one must introduce new abstractions,
# not present when using a single GPU, that facilitate the multi-GPU use case.
# In particular, one must introduce a means to isolate the inference and gradient
# calculations on the various GPU's.
# The abstraction we intoduce for this purpose is called a 'tower'.
# A tower is specified by two properties:
# * **Scope** - A scope, as provided by `tf.name_scope()`,
# is a means to isolate the operations within a tower.
# For example, all operations within 'tower 0' could have their name prefixed with `tower_0/`.
# * **Device** - A hardware device, as provided by `tf.device()`,
# on which all operations within the tower execute.
# For example, all operations of 'tower 0' could execute on the first GPU `tf.device('/gpu:0')`.
def trainable_initial_state(self, batch_size):
"""
Create a trainable initial state for the BasicLSTMCell
:param batch_size: number of samples per batch
:return: LSTMStateTuple
"""
def _create_initial_state(batch_size, state_size, trainable=True, initializer=tf.random_normal_initializer()):
with tf.device('/cpu:0'):
s = tf.get_variable('initial_state', shape=[1, state_size], dtype=tf.float32, trainable=trainable,
initializer=initializer)
state = tf.tile(s, tf.stack([batch_size] + [1]))
return state
with tf.variable_scope('initial_c'):
initial_c = _create_initial_state(batch_size, self._num_units)
with tf.variable_scope('initial_h'):
initial_h = _create_initial_state(batch_size, self._num_units)
return tf.contrib.rnn.LSTMStateTuple(initial_c, initial_h)
def assign_sub(self, delta, name=None):
"""Mimic the updates to the variable.
Args:
delta: is pushed into a staging buffer and will be pumped later.
name: currently ignored; names of ops and the StagingArea are
computed without using this pass name.
Returns:
The actual updates. The colocation constraint will be reapplied.
"""
# This parameter is ignored: the StagingArea only supports setting
# the shared name, not the names of individual ops it uses.
del name
# colocate_with(None, True) clears the colocation constraints.
# Push the delta into a staging buffer.
with ops.colocate_with(None, True), tf.device(self.var_stage_get.device):
delta_staging_area = data_flow_ops.StagingArea(
[self.var_stage_get.dtype], shapes=[self.var_stage_get.shape])
delta_put_op = delta_staging_area.put([delta])
self.variable_mgr.staging_delta_ops.append(delta_put_op)
delta_get_op = delta_staging_area.get()[0]
# Return the actual updates. The colocation constraint will be reapplied.
return self.real_var.assign_sub(delta_get_op)
def build_all_reduce_device_prefixes(job_name, num_tasks):
"""Build list of device prefix names for all_reduce.
Args:
job_name: 'worker', 'ps' or 'localhost'.
num_tasks: number of jobs across which device names should be generated.
Returns:
A list of device name prefix strings. Each element spells out the full
host name without adding the device.
e.g. '/job:worker/task:0'
"""
if job_name != 'localhost':
return ['/job:%s/task:%d' % (job_name, d) for d in range(0, num_tasks)]
else:
assert num_tasks == 1
return ['/job:%s' % job_name]
def unpack_grad_tuple(gv, gpt):
"""Unpack a previously packed collection of gradient tensors.
Args:
gv: A (grad, var) pair to be unpacked.
gpt: A GradPackTuple describing the packing operation that produced gv.
Returns:
A list of (grad, var) pairs corresponding to the values that were
originally packed into gv, maybe following subsequent operations like
reduction.
"""
elt_widths = [x.num_elements() for x in gpt.shapes]
with tf.device(gv[0][0].device):
with tf.name_scope('unpack'):
splits = tf.split(gv[0], elt_widths)
unpacked_gv = []
for idx, s in enumerate(splits):
unpacked_gv.append((tf.reshape(s, gpt.shapes[idx]), gpt.vars[idx]))
return unpacked_gv
def preprocess_device_grads(self, device_grads):
"""Preprocess the device gradients prior to applying them.
Args:
device_grads: List of lists of (gradient, variable) tuples.
device_grads[t][g] = (gradient, variable), where t is the index of the
tower and g is the index of the gradient-variable pair.
Returns: a tuple of (apply_gradients_devices, gradient_state).
gradient_state is an opaque structure that should be passed to
get_gradients_to_apply() and append_apply_gradients_ops() (in that order).
apply_gradients_devices is a list of devices where the gradients will be
applied with get_gradients_to_apply() and append_apply_gradients_ops().
"""
del device_grads # unused by this implementation
assert False, 'Must be implemented in subclass'
def trainable_variables_on_device(self,
rel_device_num,
abs_device_num,
writable=False):
"""Return the set of trainable variables on device.
Args:
rel_device_num: local worker device index.
abs_device_num: global graph device index.
writable: whether to get a reference to the underlying variable.
Returns:
The set of trainable variables on the specified device.
"""
del rel_device_num, writable
if self.each_tower_has_variables():
params = [
v for v in tf.trainable_variables()
if v.name.startswith('v%s/' % abs_device_num)
]
else:
params = tf.trainable_variables()
return params
def append_apply_gradients_ops(self, gradient_state, opt, grads, training_ops,
loss_scale_params):
device_grads = gradient_state # From 2nd result of preprocess_device_grads.
def get_apply_gradients_ops_func():
"""Returns a list of ops for updating gradients."""
apply_gradients_ops = []
# For each variable, apply the combined gradients for this server on
# the parameter server, and then wait for all other servers to do this.
for i, (g, v) in enumerate(grads):
apply_gradient_op = opt.apply_gradients([(g, v)])
barrier = self.benchmark_cnn.add_sync_queues_and_barrier(
'replicate_variable_%s' % i, [apply_gradient_op])
with tf.control_dependencies([barrier]):
with tf.device(self.benchmark_cnn.cpu_device):
updated_value = v.read_value()
for my_d in range(len(self.benchmark_cnn.devices)):
apply_gradients_ops.append(
device_grads[my_d][i][1].assign(updated_value))
return apply_gradients_ops
variable_mgr_util.append_gradients_with_loss_scale(
training_ops, get_apply_gradients_ops_func, loss_scale_params,
self.grad_has_inf_nan)
def fix_variables(self, sess, pretrained_model):
print('Fix VGG16 layers..')
with tf.variable_scope('Fix_VGG16') as scope:
with tf.device("/cpu:0"):
# fix the vgg16 issue from conv weights to fc weights
# fix RGB to BGR
fc6_conv = tf.get_variable("fc6_conv", [7, 7, 512, 4096], trainable=False)
fc7_conv = tf.get_variable("fc7_conv", [1, 1, 4096, 4096], trainable=False)
conv1_rgb = tf.get_variable("conv1_rgb", [3, 3, 3, 64], trainable=False)
restorer_fc = tf.train.Saver({self._scope + "/fc6/weights": fc6_conv,
self._scope + "/fc7/weights": fc7_conv,
self._scope + "/conv1/conv1_1/weights": conv1_rgb})
restorer_fc.restore(sess, pretrained_model)
sess.run(tf.assign(self._variables_to_fix[self._scope + '/fc6/weights:0'], tf.reshape(fc6_conv,
self._variables_to_fix[self._scope + '/fc6/weights:0'].get_shape())))
sess.run(tf.assign(self._variables_to_fix[self._scope + '/fc7/weights:0'], tf.reshape(fc7_conv,
self._variables_to_fix[self._scope + '/fc7/weights:0'].get_shape())))
sess.run(tf.assign(self._variables_to_fix[self._scope + '/conv1/conv1_1/weights:0'],
tf.reverse(conv1_rgb, [2])))
def run_bilateral_slice_apply(self, dev, grid_data, guide_data, input_data, has_offset=False):
with tf.device(dev):
grid_tensor = tf.convert_to_tensor(
grid_data, name='grid', dtype=tf.float32)
guide_tensor = tf.convert_to_tensor(
guide_data, name='guide', dtype=tf.float32)
input_tensor = tf.convert_to_tensor(
input_data, name='input', dtype=tf.float32)
output_tensor = ops.bilateral_slice_apply(grid_tensor, guide_tensor, input_tensor, has_offset=has_offset)
with self.test_session() as sess:
output_data = sess.run(output_tensor)
return output_data
def __init__(self,
num_parameter_servers=0,
ps_device='/job:ps',
placement='CPU:0'):
"""Initialize VariableDeviceChooser.
Args:
num_parameter_servers: number of parameter servers.
ps_device: string representing the parameter server device.
placement: string representing the placement of the variable either CPU:0
or GPU:0. When using parameter servers forced to CPU:0.
"""
self._num_ps = num_parameter_servers
self._ps_device = ps_device
self._placement = placement if num_parameter_servers == 0 else 'CPU:0'
self._next_task_id = 0
def global_step(device=''):
"""Returns the global step variable.
Args:
device: Optional device to place the variable. It can be an string or a
function that is called to get the device for the variable.
Returns:
the tensor representing the global step variable.
"""
global_step_ref = tf.get_collection(tf.GraphKeys.GLOBAL_STEP)
if global_step_ref:
return global_step_ref[0]
else:
collections = [
VARIABLES_TO_RESTORE,
tf.GraphKeys.VARIABLES,
tf.GraphKeys.GLOBAL_STEP,
]
# Get the device for the variable.
with tf.device(variable_device(device, 'global_step')):
return tf.get_variable('global_step', shape=[], dtype=tf.int64,
initializer=tf.zeros_initializer,
trainable=False, collections=collections)
def build_all(self, param_avg=False):
"""Build all nodes."""
if self._has_built_all:
raise Exception('Only call build_all or build_eval once.')
self._has_built_all = True
with tf.device(self.get_device_fn()):
with tf.variable_scope(self.name):
inp_var = self.build_input()
output_var = self.build(inp_var)
loss_var = self.build_loss(inp_var, output_var)
train_step = self.build_optim(loss_var)
if param_avg:
ema_op, avg_var = self.get_average_var()
self._avg_var = avg_var
with tf.control_dependencies([train_step, ema_op]):
train_step = tf.no_op(name='train_step')
self.register_var('train_step', train_step)
return self
def load_new_model(sess, restore_path, nlayers, device='/cpu:0'):
from resnet_imagenet_model import ResNetImageNetModel
with tf.device(device):
logger = tfplus.utils.logger.get()
with logger.verbose_level(2):
resnet = ResNetImageNetModel().set_all_options({
'inp_depth': 3,
'layers': get_layers(nlayers),
'strides': [1, 2, 2, 2],
'channels': [64, 256, 512, 1024, 2048],
'bottleneck': True,
'shortcut': 'projection',
'compatible': True,
'weight_decay': 1e-4,
'subtract_mean': True,
'trainable': False
})
inp_var = resnet.build_input()
out_var = resnet.build(inp_var)
out_var2 = resnet.build(inp_var)
saver = tf.train.Saver(resnet.get_save_var_dict())
saver.restore(sess, restore_path)
return resnet, inp_var, out_var, out_var2
def load_wrapper_model(sess, restore_path, nlayers, device='/cpu:0'):
from resnet_imagenet_model_wrapper import ResNetImageNetModelWrapper
with tf.device(device):
logger = tfplus.utils.logger.get()
with logger.verbose_level(2):
resnet = ResNetImageNetModelWrapper().set_all_options({
'inp_depth': 3,
'layers': get_layers(nlayers),
'strides': [1, 2, 2, 2],
'channels': [64, 256, 512, 1024, 2048],
'bottleneck': True,
'shortcut': 'projection',
'compatible': True,
'wd': 1e-4,
'subtract_mean': True,
'trainable': False
})
inp_var = resnet.build_input()
out_var = resnet.build(inp_var)
saver = tf.train.Saver(resnet.res_net.get_save_var_dict())
saver.restore(sess, restore_path)
return resnet.res_net, inp_var, out_var['y_out']
def build_input(self):
results = {}
phase_train = self.add_input_var('phase_train', None, 'bool')
results['phase_train'] = phase_train
inp_depth = self.get_option('inp_depth')
orig_x = []
for ii in xrange(self.num_replica):
with tf.name_scope('%s_%d' % ('replica', ii)) as scope:
device = '/gpu:{}'.format(ii)
with tf.device(device):
x_ = self.add_input_var('x_{}'.format(
ii), [None, None, None, inp_depth], 'float')
results['x_{}'.format(ii)] = x_
y_gt_ = self.add_input_var('y_gt_{}'.format(ii), [
None, NUM_CLS], 'float')
results['y_gt_{}'.format(ii)] = y_gt_
orig_x.append(
(x_ + self.sub_models[0].res_net._img_mean) / 255.0)
# self.log.error(x_.device)
# self.log.fatal('')
self.register_var('orig_x', tf.concat(0, orig_x))
return results
def build(self, inp):
# Divide input equally.
self.lazy_init_var()
inp_list = []
output = []
for ii in xrange(self.num_replica):
with tf.name_scope('%s_%d' % ('replica', ii)) as scope:
device = '/gpu:{}'.format(ii)
with tf.device(device):
tf.get_variable_scope().reuse_variables()
inp_ = {
'x': inp['x_{}'.format(ii)],
'y_gt': inp['y_gt_{}'.format(ii)],
'phase_train': inp['phase_train']
}
output.append(self.sub_models[ii].build(inp_))
inp_list.append(inp_)
self.output_list = output
self.input_list = inp_list
output = tf.concat(0, [oo['y_out'] for oo in output])
self.register_var('y_out', output)
output2 = tf.concat(0, [mm.get_var('score_out')
for mm in self.sub_models])
self.register_var('score_out', output2)
return {'y_out': output}
def build_optim(self, loss):
global_step = self.global_step
learn_rate = self.learn_rate
# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = self.average_gradients(self.tower_grads)
# Apply the gradients to adjust the shared variables.
apply_gradient_op = self.opt.apply_gradients(
grads, global_step=global_step)
# Track the moving averages of all trainable variables.
variable_averages = tf.train.ExponentialMovingAverage(
0.999, global_step)
variables_averages_op = variable_averages.apply(
tf.trainable_variables())
# Group all updates to into a single train op.
train_op = tf.group(apply_gradient_op, variables_averages_op)
# for m in self.sub_models:
# self.log.info(m.device)
# self.log.fatal('haha')
return train_op
def __init__(self, hps, gpu_mode=True, reuse=False):
"""Initializer for the SketchRNN model.
Args:
hps: a HParams object containing model hyperparameters
gpu_mode: a boolean that when True, uses GPU mode.
reuse: a boolean that when true, attemps to reuse variables.
"""
self.hps = hps
with tf.variable_scope('vector_rnn', reuse=reuse):
if not gpu_mode:
with tf.device('/cpu:0'):
tf.logging.info('Model using cpu.')
self.build_model(hps)
else:
tf.logging.info('Model using gpu.')
self.build_model(hps)
def get_device_setter(num_parameter_servers, num_workers):
"""
Get a device setter given number of servers in the cluster.
Given the numbers of parameter servers and workers, construct a device
setter object using ClusterSpec.
Args:
num_parameter_servers: Number of parameter servers
num_workers: Number of workers
Returns:
Device setter object.
"""
ps_hosts = re.findall(r'[\w\.:]+', FLAGS.ps_hosts) # split address
worker_hosts = re.findall(r'[\w\.:]+', FLAGS.worker_hosts) # split address
assert num_parameter_servers == len(ps_hosts)
assert num_workers == len(worker_hosts)
cluster_spec = tf.train.ClusterSpec({"ps":ps_hosts,"worker":worker_hosts})
# Get device setter from the cluster spec #
return tf.train.replica_device_setter(cluster=cluster_spec)
def main():
# Graph
with tf.device('/cpu:0'):
a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32)
b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32)
c=a+b
target = tf.constant(100.,shape=[2],dtype=tf.float32)
loss = tf.reduce_mean(tf.square(c-target))
opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss)
# Session
#sv = tf.train.Supervisor(logdir='/tmp/mydir')
sv = tf.train.Supervisor(logdir='/tmp/mydir')
gpu_options = tf.GPUOptions(allow_growth=True,allocator_type="BFC",visible_device_list="%d"%FLAGS.gpu_id)
config = tf.ConfigProto(gpu_options=gpu_options,allow_soft_placement=False,device_count={'GPU':1},log_device_placement=True)
sess = sv.prepare_or_wait_for_session(config=config)
for i in range(1000):
sess.run(opt)
if i % 10 == 0:
r = sess.run(c)
print(r)
time.sleep(.1)
def main():
# Graph
with tf.device('/cpu:0'):
a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32)
b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32)
c=a+b
target = tf.constant(100.,shape=[2],dtype=tf.float32)
loss = tf.reduce_mean(tf.square(c-target))
opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss)
# Session
sv = tf.train.Supervisor()
sess = sv.prepare_or_wait_for_session()
for i in range(1000):
sess.run(opt)
if i % 10 == 0:
r = sess.run(c)
print(r)
time.sleep(.1)
def fused_birnn(fused_rnn, inputs, sequence_length, initial_state=(None, None), dtype=None, scope=None,
time_major=False, backward_device=None):
with tf.variable_scope(scope or "BiRNN"):
sequence_length = tf.cast(sequence_length, tf.int32)
if not time_major:
inputs = tf.transpose(inputs, [1, 0, 2])
outputs_fw, state_fw = fused_rnn(inputs, sequence_length=sequence_length, initial_state=initial_state[0],
dtype=dtype, scope="FW")
if backward_device is not None:
with tf.device(backward_device):
outputs_bw, state_bw = fused_rnn_backward(fused_rnn, inputs, sequence_length, initial_state[1], dtype,
scope="BW")
else:
outputs_bw, state_bw = fused_rnn_backward(fused_rnn, inputs, sequence_length, initial_state[1], dtype,
scope="BW")
if not time_major:
outputs_fw = tf.transpose(outputs_fw, [1, 0, 2])
outputs_bw = tf.transpose(outputs_bw, [1, 0, 2])
return (outputs_fw, outputs_bw), (state_fw, state_bw)
def testPS(self):
deploy_config = model_deploy.DeploymentConfig(num_clones=1, num_ps_tasks=1)
self.assertDeviceEqual(deploy_config.clone_device(0),
'/job:worker')
self.assertEqual(deploy_config.clone_scope(0), '')
self.assertDeviceEqual(deploy_config.optimizer_device(),
'/job:worker/device:CPU:0')
self.assertDeviceEqual(deploy_config.inputs_device(),
'/job:worker/device:CPU:0')
with tf.device(deploy_config.variables_device()):
a = tf.Variable(0)
b = tf.Variable(0)
c = tf.no_op()
d = slim.variable('a', [],
caching_device=deploy_config.caching_device())
self.assertDeviceEqual(a.device, '/job:ps/task:0/device:CPU:0')
self.assertDeviceEqual(a.device, a.value().device)
self.assertDeviceEqual(b.device, '/job:ps/task:0/device:CPU:0')
self.assertDeviceEqual(b.device, b.value().device)
self.assertDeviceEqual(c.device, '')
self.assertDeviceEqual(d.device, '/job:ps/task:0/device:CPU:0')
self.assertDeviceEqual(d.value().device, '')