def gradient_clip(gradients, max_gradient_norm):
    """Clip gradients by their global norm and build norm summaries.

    Args:
        gradients: gradients handed to `tf.clip_by_global_norm`.
        max_gradient_norm: clipping threshold for the global norm.

    Returns:
        A tuple `(clipped_gradients, summaries, gradient_norm)` where
        `summaries` is a two-element list holding the "grad_norm" summary
        (norm before clipping) and the "clipped_gradient" summary (norm
        recomputed on the clipped gradients).
    """
    clipped, pre_clip_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    pre_clip_summary = tf.summary.scalar("grad_norm", pre_clip_norm)
    post_clip_norm = tf.global_norm(clipped)
    post_clip_summary = tf.summary.scalar("clipped_gradient", post_clip_norm)
    return clipped, [pre_clip_summary, post_clip_summary], pre_clip_norm
# Python example source code for global_norm()
def summarize_gradients(model_name, gradients):
    """Add histogram summaries for each gradient and for its norm.

    Args:
        model_name: prefix joined with '/' in front of every variable name.
        gradients: iterable of (gradient, variable) pairs; pairs whose
            gradient is None are skipped.
    """
    for grad, var in gradients:
        if grad is None:
            continue
        tag_prefix = model_name + '/' + var.name
        tf.summary.histogram(tag_prefix + "/gradients", grad)
        grad_norm = tf.global_norm([grad])
        tf.summary.histogram(tag_prefix + "/gradient_norm", grad_norm)
def test_stable_global_norm_avoids_overflow(self):
    """Stable global norm stays finite where tf.global_norm yields inf."""
    small = tf.ones([4])
    huge = tf.ones([4, 4]) * 1e19
    # The trailing None checks that missing tensors are tolerated.
    inputs = [small, huge, None]
    plain_norm = tf.global_norm(inputs)
    plain_is_inf = tf.is_inf(plain_norm)
    stable_norm = tfgan_losses._numerically_stable_global_norm(inputs)
    stable_is_inf = tf.is_inf(stable_norm)
    with self.test_session(use_gpu=True):
        self.assertTrue(plain_is_inf.eval())
        self.assertFalse(stable_is_inf.eval())
def test_stable_global_norm_unchanged(self):
    """Test that preconditioning doesn't change global norm value."""
    tf.set_random_seed(1234)
    # One random tensor per shape [], [3], [3, 3], ... up to five dimensions.
    samples = []
    for n_dims in range(6):
        samples.append(tf.random_uniform([3] * n_dims, -10.0, 10.0))
    reference_norm = tf.global_norm(samples)
    stable_norm = tfgan_losses._numerically_stable_global_norm(samples)
    with self.test_session(use_gpu=True) as sess:
        # Spot check closeness on more than one sample.
        for _ in range(10):
            reference_np, stable_np = sess.run([reference_norm, stable_norm])
            self.assertNear(reference_np, stable_np, 1e-5)
def clip_gradients_by_global_norm(gradients_variables, clip_norm=20.):
    """Clips gradients of a multitask loss by their global norm.

    Ignores all-zero tensors when computing the global norm.

    Args:
        gradients_variables: a list of pairs (gradient, variable).
        clip_norm: a float Tensor, the global norm to clip on. Default is 20.0.

    Returns:
        list: A list of pairs of the same type as gradients_variables.
        fixed_global_norm: A 0-D (scalar) Tensor representing the global norm.
    """
    grads, variables = six.moves.zip(*gradients_variables)

    def _scalar_zero_if_all_zeros(grad):
        # None gradients pass through untouched; an all-zero gradient is
        # swapped for a scalar zero of the same dtype for the norm only.
        if grad is None:
            return grad
        return tf.cond(
            _is_all_zeros(grad),
            lambda: tf.zeros([], dtype=tf.as_dtype(grad.dtype)),
            lambda: grad)

    norm_inputs = [_scalar_zero_if_all_zeros(grad) for grad in grads]
    fixed_global_norm = tf.global_norm(norm_inputs)
    # Clip the ORIGINAL gradients, but against the norm computed above.
    clipped, _ = tf.clip_by_global_norm(
        grads, clip_norm, use_norm=fixed_global_norm)
    clipped_pairs = list(six.moves.zip(clipped, variables))
    return clipped_pairs, fixed_global_norm
def _adaptive_gradient_clipping(self, grads_and_vars, std_factor=2., decay=0.95, static_max_norm=None, global_step=None, epsilon=1e-8, name=None):
    """Scale all gradients by one adaptive clipping factor.

    The factor is 1 while the global norm is below the adaptive maximum
    returned by `self._adaptive_max_norm`, and `exp(log_mean) / norm`
    otherwise. When `static_max_norm` is given the factor is additionally
    capped at `static_max_norm / norm`.

    Args:
        grads_and_vars: iterable of (gradient, variable) pairs.
        std_factor, decay, global_step, epsilon, name: forwarded unchanged
            to `self._adaptive_max_norm`.
        static_max_norm: optional fixed upper bound on the norm.

    Returns:
        A list of (scaled_gradient, variable) pairs; None gradients stay None.
    """
    grads, variables = zip(*grads_and_vars)
    norm = tf.global_norm(grads)
    max_norm, log_mean = self._adaptive_max_norm(
        norm, std_factor, decay, global_step, epsilon, name)

    # factor will be 1. if norm is smaller than max_norm
    factor = tf.where(
        norm < max_norm, tf.ones_like(norm), tf.exp(log_mean) / norm)
    if static_max_norm is not None:
        factor = tf.minimum(static_max_norm / norm, factor)

    def _scaled(grad):
        # Sparse gradients are rebuilt around their scaled values.
        if grad is None:
            return None
        if isinstance(grad, tf.IndexedSlices):
            return tf.IndexedSlices(
                grad.values * factor, grad.indices, grad.dense_shape)
        return grad * factor

    scaled_grads = [_scaled(grad) for grad in grads]
    return list(zip(scaled_grads, variables))
def _create_train(self):
    """Build placeholders, losses and the gradient-apply op.

    Creates the `actions`, `target_v` and `advantages` placeholders, the
    policy and value losses, and an op that applies the clipped gradients
    of the variables in `self.scope` to the variables in scope 'global'.
    """
    # NOTE(review): source indentation was lost; the whole body is assumed
    # to live inside the variable scope -- confirm against the original.
    with tf.variable_scope(self.scope):
        self.actions = tf.placeholder(
            dtype=tf.float32, shape=[None, self.action_size], name='actions')
        self.target_v = tf.placeholder(
            dtype=tf.float32, shape=[None], name='target_v')
        self.advantages = tf.placeholder(
            dtype=tf.float32, shape=[None], name='advantages')

        # Policy loss: log-probability weighted by the advantage, plus an
        # entropy term with coefficient 0.01.
        log_prob = self.normal_dist.log_prob(self.actions)
        weighted = tf.multiply(tf.transpose(log_prob), self.advantages)
        exp_v = tf.transpose(weighted)
        entropy = self.normal_dist.entropy()
        exp_v = 0.01 * entropy + exp_v
        self.policy_loss = tf.reduce_sum(-exp_v)

        # Value loss: half the summed squared error against target_v.
        flat_value = tf.reshape(self.value, [-1])
        squared_error = tf.square(self.target_v - flat_value)
        self.value_loss = 0.5 * tf.reduce_sum(squared_error)

        self.loss = 0.5 * self.value_loss + self.policy_loss

        trainable = tf.GraphKeys.TRAINABLE_VARIABLES
        local_vars = tf.get_collection(trainable, self.scope)
        self.gradients = tf.gradients(self.loss, local_vars)
        self.var_norms = tf.global_norm(local_vars)
        clipped, self.grad_norms = tf.clip_by_global_norm(
            self.gradients, 40.0)

        # Local gradients are applied to the variables of the 'global' scope.
        global_vars = tf.get_collection(trainable, 'global')
        self.apply_grads = self.trainer.apply_gradients(
            zip(clipped, global_vars))
def _add_gradients_summaries(grads_and_vars):
    """Add histogram summaries to gradients.

    Note: The summaries are also added to the SUMMARIES collection.

    Args:
        grads_and_vars: A list of gradient to variable pairs (tuples).

    Returns:
        The _list_ of the added summaries for grads_and_vars.
    """
    added = []
    for grad, var in grads_and_vars:
        if grad is None:
            tf.logging.info('Var %s has no gradient', var.op.name)
            continue
        # Sparse gradients are summarised through their values tensor.
        values = grad.values if isinstance(grad, tf.IndexedSlices) else grad
        gradient_summary = tf.histogram_summary(
            var.op.name + ':gradient', values)
        added.append(gradient_summary)
        norm_summary = tf.histogram_summary(
            var.op.name + ':gradient_norm', tf.global_norm([values]))
        added.append(norm_summary)
    return added
def _backward(self, loss, summaries=False):
    """Compute per-group gradients of `loss` for emb / LSTM / softmax vars.

    Embedding gradients (asserted to be `tf.IndexedSlices`) are scaled by
    `hps.batch_size`, LSTM gradients are clipped by global norm to
    `hps.max_grad_norm`, and softmax gradients are passed through.

    Args:
        loss: loss tensor; it is multiplied by `hps.num_steps` first.
        summaries: when truthy, add scalar summaries about the LSTM
            gradient norm, clipping scale and weight norm.

    Returns:
        A list of (gradient, variable) pairs covering all three groups.
    """
    hps = self.hps
    loss = loss * hps.num_steps

    emb_vars = find_trainable_variables("emb")
    lstm_vars = find_trainable_variables("LSTM")
    softmax_vars = find_trainable_variables("softmax")
    all_vars = emb_vars + lstm_vars + softmax_vars

    all_grads = tf.gradients(loss, all_vars)
    n_emb = len(emb_vars)
    lstm_end = n_emb + len(lstm_vars)

    emb_grads = []
    for emb_grad in all_grads[:n_emb]:
        assert isinstance(emb_grad, tf.IndexedSlices)
        emb_grads.append(tf.IndexedSlices(
            emb_grad.values * hps.batch_size,
            emb_grad.indices,
            emb_grad.dense_shape))

    softmax_grads = all_grads[lstm_end:]
    lstm_grads, lstm_norm = tf.clip_by_global_norm(
        all_grads[n_emb:lstm_end], hps.max_grad_norm)

    clipped_grads = emb_grads + lstm_grads + softmax_grads
    assert len(clipped_grads) == len(all_grads)

    if summaries:
        tf.summary.scalar("model/lstm_grad_norm", lstm_norm)
        tf.summary.scalar(
            "model/lstm_grad_scale",
            tf.minimum(hps.max_grad_norm / lstm_norm, 1.0))
        tf.summary.scalar("model/lstm_weight_norm", tf.global_norm(lstm_vars))
        # for v, g, cg in zip(all_vars, orig_grads, clipped_grads):
        #     name = v.name.lstrip("model/")
        #     tf.histogram_summary(name + "/var", v)
        #     tf.histogram_summary(name + "/grad", g)
        #     tf.histogram_summary(name + "/clipped_grad", cg)

    return list(zip(clipped_grads, all_vars))
def gradient_clip(gradients, params, max_gradient_norm):
    """Clip gradients by global norm and return them with norm summaries.

    Args:
        gradients: gradients handed to `tf.clip_by_global_norm`.
        params: not used by this function.
        max_gradient_norm: clipping threshold for the global norm.

    Returns:
        A pair `(clipped_gradients, summaries)`; `summaries` lists the
        "grad_norm" summary followed by the "clipped_gradient" summary.
    """
    clipped, norm_before = tf.clip_by_global_norm(gradients, max_gradient_norm)
    summaries = []
    summaries.append(tf.summary.scalar("grad_norm", norm_before))
    norm_after = tf.global_norm(clipped)
    summaries.append(tf.summary.scalar("clipped_gradient", norm_after))
    return clipped, summaries
def get_gradients(self, loss_or_grads, params):
    """Compute gradients with the wrapped optimizer and apply clipping.

    Args:
        loss_or_grads: forwarded as the first argument of
            `self.algorithm.compute_gradients`.
        params: forwarded as `var_list` to `compute_gradients`.

    Returns:
        A list of (gradient, variable) pairs after optional norm clipping
        (`self.clipnorm` / `self.clip_alg`) and value clipping
        (`self.clipvalue`).

    Raises:
        RuntimeError: if `self.algorithm` is None or lacks
            `compute_gradients` / `apply_gradients`.

    Note
    ----
    Pairs whose gradient is None are dropped, so the result only covers
    variables that actually received a gradient. The global norm of the
    final gradients is stored in `self._norm`.
    """
    # check valid algorithm
    if self.algorithm is None or \
            not hasattr(self.algorithm, 'compute_gradients') or \
            not hasattr(self.algorithm, 'apply_gradients'):
        raise RuntimeError("Optimizer is None, or doesn't has attributes: "
                           "compute_gradients and apply_gradients.")
    # NOTE(review): source indentation was lost; everything below is assumed
    # to run inside the variable scope -- confirm against the original.
    with tf.variable_scope(self.name):
        # get the gradient
        grads_var = self.algorithm.compute_gradients(loss_or_grads,
                                                     var_list=params)
        # Keep the pairs in a list. Keying a dict by gradient tensors would
        # merge variables that share one gradient object and requires the
        # tensors to be hashable.
        grads_var = [(g, v) for g, v in grads_var if g is not None]
        grads = [g for g, _ in grads_var]
        params = [v for _, v in grads_var]
        # ====== clipnorm ====== #
        if self.clipnorm is not None:
            if self.clip_alg == 'norm':
                grads = [tf.clip_by_norm(g, self.clipnorm)
                         for g in grads]
            elif self.clip_alg == 'total_norm':
                grads, _ = tf.clip_by_global_norm(grads, self.clipnorm)
            elif self.clip_alg == 'avg_norm':
                grads = [tf.clip_by_average_norm(g, self.clipnorm)
                         for g in grads]
        # ====== clipvalue ====== #
        if self.clipvalue is not None:
            grads = [tf.clip_by_value(g, -self.clipvalue, self.clipvalue)
                     for g in grads]
        # ====== get final norm value ====== #
        self._norm = add_role(tf.global_norm(grads, name="GradientNorm"),
                              GradientsNorm)
        return list(zip(grads, params))
def initialize(self):
    """Optionally register summaries, then create and initialise the session.

    When `self.summarize` is truthy, scalar summaries for the losses,
    entropy, gradient norm and variable norm are added and merged into
    `self.summary_op`. A session on graph `self.g` is always created,
    wrapped in `ray.experimental.TensorFlowVariables`, and all global
    variables are initialised.
    """
    if self.summarize:
        batch_size = tf.to_float(tf.shape(self.x)[0])
        # Loss-like quantities are divided by the batch size.
        per_sample_scalars = (
            ("model/policy_loss", self.pi_loss),
            ("model/value_loss", self.vf_loss),
            ("model/entropy", self.entropy),
        )
        for tag, total in per_sample_scalars:
            tf.summary.scalar(tag, total / batch_size)
        tf.summary.scalar("model/grad_gnorm", tf.global_norm(self.grads))
        tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
        self.summary_op = tf.summary.merge_all()

    session_config = tf.ConfigProto(
        intra_op_parallelism_threads=1, inter_op_parallelism_threads=2)
    self.sess = tf.Session(graph=self.g, config=session_config)
    self.variables = ray.experimental.TensorFlowVariables(
        self.loss, self.sess)
    self.sess.run(tf.global_variables_initializer())
def build_summary(self):
    """Register training summaries and merge them into `self.summary_op`.

    Adds scalar summaries for the batch-normalised policy loss, value loss
    and entropy, an image summary of the network input, the global norms
    of the gradients and of the local network variables, and the learning
    rate.
    """
    network = self.local_network
    batch_size = tf.to_float(tf.shape(network.x)[0])
    per_sample_scalars = (
        ("model/policy_loss", self.pi_loss),
        ("model/value_loss", self.vf_loss),
        ("model/entropy", self.entropy),
    )
    for tag, total in per_sample_scalars:
        tf.summary.scalar(tag, total / batch_size)
    tf.summary.image("model/state", network.x)
    grad_norm = tf.global_norm(self.grads)
    tf.summary.scalar("model/grad_global_norm", grad_norm)
    var_norm = tf.global_norm(network.var_list)
    tf.summary.scalar("model/var_global_norm", var_norm)
    tf.summary.scalar("model/lr", self.lr)
    self.summary_op = tf.summary.merge_all()
def _make_training_op(self):
    """Build the optimizer, clip gradients and return the training op.

    `self.config.optimizer` selects the optimizer:
      * 'sgd': gradient descent with a constant learning rate until
        `config.start_decay_step`, then staircase exponential decay.
      * 'adam': Adam with a constant learning rate (asserted < 0.007).

    Gradients of `self.loss` w.r.t. all trainable variables are clipped by
    global norm to `config.max_gradient_norm`; scalar summaries are added
    for the raw norm, the clipped norm and the learning rate.

    Returns:
        The op applying the clipped gradients and advancing
        `self.global_step`.

    Raises:
        ValueError: if `self.config.optimizer` is neither 'sgd' nor 'adam'.
    """
    if self.config.optimizer == 'sgd':
        self.learning_rate = tf.cond(
            self.global_step < self.config.start_decay_step,
            lambda: tf.constant(self.config.learning_rate),
            lambda: tf.train.exponential_decay(
                self.config.learning_rate,
                (self.global_step - self.config.start_decay_step),
                self.config.decay_steps,
                self.config.decay_factor,
                staircase=True),
            name='learning_rate')
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
    elif self.config.optimizer == 'adam':
        assert self.config.learning_rate < 0.007
        self.learning_rate = tf.constant(self.config.learning_rate)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
    else:
        # Fail up front with a clear message instead of crashing later on
        # the unbound `optimizer` after graph ops have already been added.
        raise ValueError(
            "Unsupported optimizer %r; expected 'sgd' or 'adam'."
            % (self.config.optimizer,))
    params = tf.trainable_variables()
    gradients = tf.gradients(self.loss, params)
    clipped_gradients, gradient_norm = tf.clip_by_global_norm(
        gradients, self.config.max_gradient_norm)
    tf.summary.scalar("grad_norm", gradient_norm)
    tf.summary.scalar("clipped_norm", tf.global_norm(clipped_gradients))
    tf.summary.scalar("learning_rate", self.learning_rate)
    train_op = optimizer.apply_gradients(
        zip(clipped_gradients, params), global_step=self.global_step)
    return train_op
def add_training_op(self, loss):
    """Sets up the training Ops.

    Creates a gradient-descent optimizer and applies the gradients to all
    trainable variables. The Op returned by this function is what must be
    passed to the `sess.run()` call to cause the model to train.

    Steps:
        - Get the gradients for the loss from the optimizer using
          optimizer.compute_gradients.
        - If self.config.clip_gradients is true, clip the global norm of
          the gradients using tf.clip_by_global_norm to
          self.config.max_grad_norm.
        - Compute the resultant global norm of the gradients using
          tf.global_norm and save this global norm in self.grad_norm.
        - Finally, create the training operation by calling
          optimizer.apply_gradients.

    See: https://www.tensorflow.org/api_docs/python/train/gradient_clipping

    Args:
        loss: Loss tensor.
    Returns:
        train_op: The Op for training.
    """
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.config.lr)
    # NOTE(review): the original text mentions both `self.clip_gradients`
    # and `self.config.clip_gradients`; the config flag is used here because
    # the threshold also lives on the config -- confirm.
    grads_and_vars = optimizer.compute_gradients(loss)
    gradients, variables = zip(*grads_and_vars)
    if self.config.clip_gradients:
        gradients, _ = tf.clip_by_global_norm(
            gradients, self.config.max_grad_norm)
    # Norm of the gradients that are actually applied (post-clipping).
    self.grad_norm = tf.global_norm(gradients)
    train_op = optimizer.apply_gradients(list(zip(gradients, variables)))
    assert self.grad_norm is not None, "grad_norm was not set properly!"
    return train_op
def build_network(self):
    """Build the conv/fully-connected network and its weight-norm summaries.

    The input placeholder has shape [None, 84, 84, 4]. Two conv layers and
    one fully connected layer feed two heads: a softmax head with
    `self.nb_actions` outputs and a single linear value output. A scalar
    summary of the global norm of each layer's variables is registered.
    The input and both outputs are stored on `self`.
    """
    cnn_1_scope = self.name + '/cnn_1'
    cnn_2_scope = self.name + '/cnn_2'
    fcc_1_scope = self.name + '/fcc_1'
    adv_probas_scope = self.name + '/adv_probas'
    value_state_scope = self.name + '/value_state'

    state = tf.placeholder(tf.float32, [None, 84, 84, 4])
    cnn_1 = slim.conv2d(
        state, 16, [8, 8], stride=4, scope=cnn_1_scope,
        activation_fn=nn.relu)
    cnn_2 = slim.conv2d(
        cnn_1, 32, [4, 4], stride=2, scope=cnn_2_scope,
        activation_fn=nn.relu)
    flatten = slim.flatten(cnn_2)
    fcc_1 = slim.fully_connected(
        flatten, 256, scope=fcc_1_scope, activation_fn=nn.relu)
    adv_probas = slim.fully_connected(
        fcc_1, self.nb_actions, scope=adv_probas_scope,
        activation_fn=nn.softmax)
    value_state = slim.fully_connected(
        fcc_1, 1, scope=value_state_scope, activation_fn=None)

    # One global-norm summary per layer scope, in layer order.
    norm_summaries = (
        ("model/cnn1_global_norm", cnn_1_scope),
        ("model/cnn2_global_norm", cnn_2_scope),
        ("model/fcc1_global_norm", fcc_1_scope),
        ("model/adv_probas_global_norm", adv_probas_scope),
        ("model/value_state_global_norm", value_state_scope),
    )
    for tag, layer_scope in norm_summaries:
        layer_vars = slim.get_variables(scope=layer_scope)
        tf.summary.scalar(tag, tf.global_norm(layer_vars))

    # Input
    self._tf_state = state
    # Output
    self._tf_adv_probas = adv_probas
    self._tf_value_state = value_state
def __add_summaries(self, grads_and_vars, grad_noise_scale,
                    dev_set_scope, summaries=[]):
    """Add gradient-noise, per-variable gradient and global-norm summaries.

    Does nothing when `summaries` equals the empty list.

    Args:
        grads_and_vars: list of (gradient, variable) pairs.
        grad_noise_scale: value summarised as "NoiseGrad"; skipped if None.
        dev_set_scope: name scope / tag prefix for the summaries.
        summaries: passed as the third positional argument to every
            `tf.summary.*` call.
    """
    if summaries == []:
        return

    # Summary for the noise on the gradient.
    if grad_noise_scale is not None:
        with tf.name_scope(dev_set_scope):
            tf.summary.scalar("NoiseGrad", grad_noise_scale,
                              summaries)

    # Scalar norm and histogram for every available gradient.
    for grad, var in grads_and_vars:
        grad_vals = grad.values if isinstance(grad, tf.IndexedSlices) else grad
        if grad_vals is None:
            continue
        # Remove model_name/
        var_name = var.op.name.replace(self.cfg.model_name + '/', '')
        # First placeholder is the metric, the second the variable name.
        scope_str, var_name = squash_maybe(dev_set_scope + '_%s', var_name)
        scope_str += '_%s'
        # Write the summary outside of any enclosing name scope.
        with tf.name_scope(None):
            norm_tag = scope_str % ('GradientNorm', var_name)
            tf.summary.scalar(norm_tag, tf.global_norm([grad_vals]),
                              summaries)
            hist_tag = scope_str % ('GradientHist', var_name)
            tf.summary.histogram(hist_tag, grad_vals, summaries)

    # Global norm over all gradients.
    with tf.name_scope(dev_set_scope):
        if self.cfg.max_grad_norm:
            name = 'clipped_grad_norm'
        else:
            name = 'grad_norm'
        all_grads = list(zip(*grads_and_vars))[0]
        tf.summary.scalar('Global_norm/' + name,
                          tf.global_norm(all_grads),
                          summaries)
def createGraph(self):
    """Creates graph for training.

    Builds one pair of input placeholders and one loss per (length, count)
    bin, averages the costs, adds the weighted saturation loss, perturbs
    the gradients with truncated-normal noise scaled by the learning rate,
    and creates the Adamax update op. Also exposes the global norm of the
    optimizer's "m" slots as `self.gnorm` for printouts.
    """
    self.base_cost = 0.0
    self.accuracy = 0
    self.cost_list = []
    self.bin_losses = []
    bin_count = len(self.bins)
    total_weight = 0
    saturation_terms = []

    # Create all bins and calculate losses for them
    with vs.variable_scope("var_lengths"):
        for seqLength, itemCount in zip(self.bins, self.count_list):
            x_in = tf.placeholder("int32", [itemCount, seqLength])
            y_in = tf.placeholder("int64", [itemCount, seqLength])
            self.x_input.append(x_in)
            self.y_input.append(y_in)
            # Reset right before createLoss; the list is read back below.
            self.saturation_costs = []
            bin_cost, bin_accuracy, _, _, per_item_cost, _ = self.createLoss(
                x_in, y_in, seqLength)
            weight = 1.0  # /seqLength
            normaliser = (seqLength ** 2) * itemCount
            sat_cost = tf.add_n(self.saturation_costs) / normaliser
            saturation_terms.append(sat_cost * weight)
            self.bin_losses.append(per_item_cost)
            self.base_cost += bin_cost * weight
            total_weight += weight
            self.accuracy += bin_accuracy
            self.cost_list.append(bin_cost)
            # Every bin after the first reuses the same variables.
            tf.get_variable_scope().reuse_variables()

    # NOTE(review): source indentation was lost; the code below is assumed
    # to sit outside the "var_lengths" scope -- confirm against the original.

    # calculate the total loss
    self.base_cost /= total_weight
    self.accuracy /= bin_count
    stacked_saturation = tf.stack(saturation_terms)
    self.sat_loss = (tf.reduce_sum(stacked_saturation)
                     * self.saturation_weight / total_weight)
    cost = self.base_cost + self.sat_loss

    # add gradient noise proportional to learning rate
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(cost, tvars)
    noisy_grads = [
        grad + tf.truncated_normal(tf.shape(grad)) * self.learning_rate * 1e-4
        for grad in raw_grads
    ]

    # optimizer
    optimizer = AdamaxOptimizer(
        self.learning_rate, beta1=0.9, beta2=1.0 - self.beta2_rate,
        epsilon=1e-8)
    self.optimizer = optimizer.apply_gradients(
        zip(noisy_grads, tvars), global_step=self.global_step)

    # some values for printout
    m_slots = [optimizer.get_slot(var, "m") for var in tvars]
    self.gnorm = tf.global_norm(m_slots)
    self.cost_list = tf.stack(self.cost_list)
def add_optimizer_op(self, scope):
    """
    Set self.train_op and self.grad_norm

    Steps (from the original exercise description):
        1. get an Adam optimizer using the learning rate self.lr
        2. compute grads w.r.t. the trainable variables in `scope` for
           self.loss
        3. clip the grads with self.config.clip_val if
           self.config.grad_clip is True
        4. apply the gradients and store the train op in self.train_op
           (sess.run(train_op) must update the variables)
        5. compute the global norm of the gradients and store this scalar
           in self.grad_norm

    Args:
        scope: variable scope whose trainable variables are optimised.
    """
    ##############################################################
    #################### YOUR CODE HERE - 8-12 lines #############
    adam = tf.train.AdamOptimizer(learning_rate=self.lr)
    scope_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
    grads_and_vars = adam.compute_gradients(self.loss, scope_vars)
    grads, targets = zip(*grads_and_vars)
    if self.config.grad_clip:
        grads, _ = tf.clip_by_global_norm(grads, self.config.clip_val)
    # The norm is taken after the optional clipping, i.e. on the gradients
    # that are actually applied.
    self.grad_norm = tf.global_norm(grads)
    self.train_op = adam.apply_gradients(list(zip(grads, targets)))
    ##############################################################
    ######################## END YOUR CODE #######################
def _createModel(self):
    """Build the policy/value network and, for workers, the training ops.

    Every scope gets the input placeholder, one 64-unit ReLU layer, a
    softmax policy head and a linear value head, plus the ops copying the
    'global' variables into this scope. Scopes other than 'global'
    additionally get loss tensors and an op that applies their clipped
    gradients to the 'global' variables.
    """
    # NOTE(review): source indentation was lost; the whole body is assumed
    # to live inside the variable scope -- confirm against the original.
    with tf.variable_scope(self.scope):
        self.inputs = tf.placeholder('float', shape=[None, self.stateSize])
        hidden = slim.fully_connected(
            self.inputs, 64, scope='fc/fc_1', activation_fn=tf.nn.relu)
        self.policy = slim.fully_connected(
            hidden, self.actionSize,
            activation_fn=tf.nn.softmax,
            weights_initializer=Brian.normalized_columns_initializer(0.01),
            biases_initializer=None)
        self.value = slim.fully_connected(
            hidden, 1,
            activation_fn=None,
            weights_initializer=Brian.normalized_columns_initializer(1.0),
            biases_initializer=None)
        self.update_local_ops = Brian.update_target_graph(
            'global', self.scope)

        # The global network only holds parameters; it has no loss.
        if self.scope == 'global':
            return

        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(
            self.actions, self.actionSize, dtype=tf.float32)
        self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
        self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)
        # Probability the policy assigned to the action actually taken.
        self.responsible_outputs = tf.reduce_sum(
            self.policy * self.actions_onehot, [1])

        # Loss functions
        flat_value = tf.reshape(self.value, [-1])
        self.value_loss = 0.5 * tf.reduce_sum(
            tf.square(self.target_v - flat_value))
        self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy))
        self.policy_loss = -tf.reduce_sum(
            tf.log(self.responsible_outputs) * self.advantages)
        self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

        # Get gradients from local network using local losses
        trainable = tf.GraphKeys.TRAINABLE_VARIABLES
        local_vars = tf.get_collection(trainable, self.scope)
        self.gradients = tf.gradients(self.loss, local_vars)
        self.var_norms = tf.global_norm(local_vars)
        clipped, self.grad_norms = tf.clip_by_global_norm(self.gradients, 40.0)

        # Apply local gradients to global network
        global_vars = tf.get_collection(trainable, 'global')
        self.apply_grads = self.trainer.apply_gradients(
            zip(clipped, global_vars))