def build_shared_grad(self):
self.grads = tf.gradients(self.loss, self.local_network.var_list)
clipped_grads, _ = tf.clip_by_global_norm(self.grads, self.config.max_grad_norm)
# copy weights from the parameter server to the local model
self.sync = tf.group(*[v1.assign(v2) for v1, v2 in zip(self.local_network.var_list, self.network.var_list)])
grads_and_vars = list(zip(clipped_grads, self.network.var_list))
inc_step = self.global_step.assign_add(tf.shape(self.local_network.x)[0])
# each worker has a different set of adam optimizer parameters
self.lr = tf.train.exponential_decay(
self.config.lr_start, self.global_step, self.config.lr_decay_step,
self.config.lr_decay_rate, staircase=True, name='lr')
opt = tf.train.AdamOptimizer(self.lr)
self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step)
self.summary_writer = None
self.local_steps = 0
self.build_summary()
Example source code for Python's clip_by_global_norm()
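All of the snippets collected on this page follow the same basic pattern. As a reference point, here is a minimal, self-contained sketch of that pattern (the toy loss, learning rate, and clipping threshold are illustrative placeholders, not taken from any snippet on this page): compute gradients, clip them by their global norm, and hand the clipped list back to an optimizer.

import tensorflow as tf

# toy loss over a single trainable variable, purely for illustration
w = tf.get_variable('w', shape=[10], initializer=tf.zeros_initializer())
loss = tf.reduce_sum(tf.square(w - 1.0))

max_norm = 5.0  # assumed clipping threshold
tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
# clip_by_global_norm returns the clipped list and the pre-clipping global norm
clipped, global_norm = tf.clip_by_global_norm(grads, max_norm)
train_op = tf.train.GradientDescentOptimizer(0.1).apply_gradients(list(zip(clipped, tvars)))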
def clip(grads_and_vars, max_global_norm):
""" Clip the gradients that are returned from a TensorFlow Optimizer.
    Note that "clipping" is something of a misnomer here: if the global norm of
    all gradients concatenated does not exceed `max_global_norm`, they are left
    unchanged; if it does exceed `max_global_norm`, every gradient is rescaled by
    the same factor so that the new global norm equals `max_global_norm`.
Args:
grads_and_vars: A list of `(grad, var)` pairs.
max_global_norm: A float.
Returns:
A list of `(grad, var)` pairs with clipped gradients.
"""
    grads, variables = zip(*grads_and_vars)
    grads, _ = tf.clip_by_global_norm(grads, clip_norm=max_global_norm)
    return list(zip(grads, variables))
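One possible way to wire the clip helper above into a training step (the optimizer choice, the 5.0 threshold, and the loss tensor are assumptions for illustration):

opt = tf.train.AdamOptimizer(1e-3)
grads_and_vars = opt.compute_gradients(loss)            # list of (grad, var) pairs
grads_and_vars = clip(grads_and_vars, max_global_norm=5.0)
train_op = opt.apply_gradients(grads_and_vars)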
def _training(self, loss, config):
"""Sets up training ops
Creates the optimiser
The op returned from this is what is passed to session run
Args:
loss float
learning_rate float
returns:
Op for training
"""
# Create the gradient descent optimizer with the
# given learning rate.
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars),
config.max_grad_norm)
optimizer = tf.train.AdamOptimizer()
train_op = optimizer.apply_gradients(zip(grads, tvars))
return train_op
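A rough sketch of how this method would typically be driven, assuming a hypothetical model object that owns the loss and placeholder tensors (all names below are placeholders, not from the snippet above):

# hypothetical wiring for a single training step
train_op = model._training(model.loss, config)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, loss_value = sess.run([train_op, model.loss],
                             feed_dict={model.inputs: x_batch, model.targets: y_batch})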
def get_gradient_clipper(clipper, *args, **kwargs):
"""
    Simple helper to get a gradient clipper.
    E.g.: clipper = get_gradient_clipper('value', value_min, value_max, name='ValueClip')
    :param clipper: a string denoting a TF gradient clipper (e.g. "global_norm" denotes tf.clip_by_global_norm)
        or a function of type f(tensor) -> clipped_tensor
:param args: used to create the clipper
:param kwargs: used to create the clipper
:return: a function (tensor) -> (clipped tensor)
"""
if callable(clipper):
return clipper
    # workaround for the global_norm clipper, since it returns two values, the second being the global norm as a scalar tensor
if clipper == 'global_norm':
return lambda t_list: tf.clip_by_global_norm(t_list, *args, **kwargs)[0]
if clipper in _str2clipper:
clipper = _str2clipper[clipper]
else:
raise ValueError('clipper should be a callable function or a given key in _str2clipper!')
return lambda t_list: [clipper(t, *args, **kwargs) for t in t_list]
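A possible usage of the helper above; the global-norm branch only needs the threshold, while other names are resolved through _str2clipper (assumed here to map 'value' to tf.clip_by_value, as the docstring example suggests; `loss` is assumed defined elsewhere):

# returns a function: list of gradient tensors -> list of clipped tensors
clip_fn = get_gradient_clipper('global_norm', 5.0)
grads = tf.gradients(loss, tf.trainable_variables())
clipped_grads = clip_fn(grads)

# per-tensor clipping, applied independently to every gradient in the list
value_clip_fn = get_gradient_clipper('value', -1.0, 1.0, name='ValueClip')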
def _add_train_op(self):
"""Sets self._train_op, op to run for training."""
hps = self._hps
self._lr_rate = tf.maximum(
hps.min_lr, # min_lr_rate.
tf.train.exponential_decay(hps.lr, self.global_step, 30000, 0.98))
tvars = tf.trainable_variables()
with tf.device(self._get_gpu(self._num_gpus-1)):
grads, global_norm = tf.clip_by_global_norm(
tf.gradients(self._loss, tvars), hps.max_grad_norm)
tf.scalar_summary('global_norm', global_norm)
optimizer = tf.train.GradientDescentOptimizer(self._lr_rate)
tf.scalar_summary('learning rate', self._lr_rate)
self._train_op = optimizer.apply_gradients(
zip(grads, tvars), global_step=self.global_step, name='train_step')
def attach_cost(self, gen_model):
# TODO: Shouldn't dynamic RNN be used here?
# output_text, states_text = rnn.rnn(cell, inputs, initial_state=self.initial_state)
predicted_classes_text = self.discriminate_text(self.input_data_text)
    self.loss_text = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=predicted_classes_text, labels=np.ones((self.args.batch_size, 1), dtype=np.float32)))
generated_wv = gen_model.generate()
predicted_classes_wv = self.discriminate_wv(generated_wv)
    self.loss_gen = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=predicted_classes_wv, labels=np.zeros((self.args.batch_size, 1), dtype=np.float32)))
self.loss = .5 * self.loss_gen + .5 * self.loss_text
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
self.args.grad_clip)
# optimize only discriminator owned variables
g_and_v = [(g, v) for g, v in zip(grads, tvars) if v.name.startswith('DISC')]
optimizer = tf.train.AdamOptimizer(self.lr)
self.train_op = optimizer.apply_gradients(g_and_v)
def _create_training_tensors(self, optimizer_algorithm):
"""
Create the tensors used for training
"""
with tf.name_scope('training'):
if optimizer_algorithm == 'adagrad':
optimizer = tf.train.AdagradOptimizer(self.learning_rate)
elif optimizer_algorithm == 'adam':
optimizer = tf.train.AdamOptimizer(self.learning_rate)
elif optimizer_algorithm == 'adadelta':
optimizer = tf.train.AdadeltaOptimizer(self.learning_rate)
else:
            raise ValueError('Unknown optimizer: %s' % optimizer_algorithm)
gradients, v = zip(*optimizer.compute_gradients(self.loss))
if self.clip_value is not None:
gradients, _ = tf.clip_by_global_norm(gradients,
self.clip_value)
self.train_op = optimizer.apply_gradients(zip(gradients, v))
def add_train_op(self, loss):
self.global_step = tf.Variable(0, name='global_step', trainable=False)
opt = tf.train.AdamOptimizer(learning_rate=self.lr)
gradients, variables = zip(*opt.compute_gradients(loss))
# save selected gradient summaries
#for grad in gradients:
#if 'BasicDecoder' in grad.name or 'gru_cell' in grad.name or 'highway_3' in grad.name:
#tf.summary.scalar(grad.name, tf.reduce_sum(grad))
    # optionally cap gradients by global norm to regularize
if self.config.cap_grads > 0:
with tf.variable_scope('cap_grads'):
tf.summary.scalar('global_gradient_norm', tf.global_norm(gradients))
gradients, _ = tf.clip_by_global_norm(gradients, self.config.cap_grads)
train_op = opt.apply_gradients(zip(gradients, variables), global_step=self.global_step)
return train_op
def _build_train_op(self):
"""Build training specific ops for the graph."""
self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
tf.scalar_summary(self.mode + '/learning rate', self.lrn_rate)
trainable_variables = tf.trainable_variables()
grads = tf.gradients(self.cost, trainable_variables)
if self.hps.optimizer == 'sgd':
optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
elif self.hps.optimizer == 'mom':
#optimizer = tf.train.AdamOptimizer(0.001)
#ooptimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9, use_nesterov=True)
optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)
clipped_grads, _ = tf.clip_by_global_norm(grads, 1)
apply_op = optimizer.apply_gradients(
zip(clipped_grads, trainable_variables),
global_step=self.global_step, name='train_step')
train_ops = [apply_op] + self._extra_train_ops + tf.get_collection(tf.GraphKeys.UPDATE_OPS)
self.train_op = tf.group(*train_ops)
def train_neural_network():
logits, last_state, _, _, _ = neural_network()
targets = tf.reshape(output_targets, [-1])
loss = tf.nn.seq2seq.sequence_loss_by_example([logits], [targets], [tf.ones_like(targets, dtype=tf.float32)], len(words))
cost = tf.reduce_mean(loss)
learning_rate = tf.Variable(0.0, trainable=False)
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = optimizer.apply_gradients(zip(grads, tvars))
with tf.Session() as sess:
sess.run(tf.initialize_all_variables())
saver = tf.train.Saver(tf.all_variables())
for epoch in range(50):
sess.run(tf.assign(learning_rate, 0.002 * (0.97 ** epoch)))
            for batch in range(n_chunk):
                train_loss, _, _ = sess.run([cost, last_state, train_op],
                                            feed_dict={input_data: x_batches[batch], output_targets: y_batches[batch]})
                print(epoch, batch, train_loss)
if epoch % 7 == 0:
saver.save(sess, 'poetry.module', global_step=epoch)
def _clip_gradients_fn(self, grads_and_vars):
"""Clips gradients by global norm."""
gradients, variables = zip(*grads_and_vars)
self._grads_and_vars = grads_and_vars
if self._clip_gradients > 0.0:
clipped_gradients, _ = tf.clip_by_global_norm(
t_list=gradients, clip_norm=self._clip_gradients)
grads_and_vars = list(zip(clipped_gradients, variables))
if self._clip_embed_gradients > 0.0:
clipped_gradients = []
variables = []
for gradient, variable in grads_and_vars:
if "embedding" in variable.name or "Embedding" in variable.name:
tmp = tf.clip_by_norm(t=gradient.values, clip_norm=self._clip_embed_gradients)
gradient = tf.IndexedSlices(tmp, gradient.indices, gradient.dense_shape)
clipped_gradients.append(gradient)
variables.append(variable)
grads_and_vars = list(zip(clipped_gradients, variables))
return grads_and_vars
def compute_gradients(loss, learning_rate, gradient_clipping=-1):
"""
Create optimizer, compute gradients and (optionally) apply gradient clipping
"""
opt = tf.train.AdamOptimizer(learning_rate)
if gradient_clipping > 0:
vars_to_optimize = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(loss, vars_to_optimize), clip_norm=gradient_clipping)
grads_and_vars = list(zip(grads, vars_to_optimize))
else:
grads_and_vars = opt.compute_gradients(loss)
return opt, grads_and_vars
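A short usage sketch for the helper above (the loss tensor and the hyperparameter values are placeholders):

opt, grads_and_vars = compute_gradients(loss, learning_rate=1e-3, gradient_clipping=5.0)
train_op = opt.apply_gradients(grads_and_vars)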
def _clip_gradients(self, grads_and_vars):
"""Clips gradients by global norm."""
gradients, variables = zip(*grads_and_vars)
clipped_gradients, _ = tf.clip_by_global_norm(
gradients, self.params["optimizer.clip_gradients"])
return list(zip(clipped_gradients, variables))
def TrainingOp(loss, dataSetSize, batch_size, max_grad_norm):
var_list = tf.trainable_variables()
grads = tf.gradients(loss, var_list)
grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
global_step = tf.get_variable(
'global_step', [], initializer=tf.constant_initializer(0), trainable=False
)
training_steps_per_epoch = dataSetSize // batch_size
    learning_rate = tf.train.exponential_decay(
        1e-3, global_step, training_steps_per_epoch, 0.999, staircase=True)
optimizer = tf.train.RMSPropOptimizer(learning_rate)
train_op = optimizer.apply_gradients(zip(grads, var_list), global_step=global_step)
return train_op, learning_rate
def clip_gradient(pair_list,
max_norm):
"""Perform gradient clipping.
    If the gradients' global norm exceeds 'max_norm', rescale them all by a common factor so that it becomes 'max_norm'.
:param pair_list: (grad, var) pair list.
:param max_norm: The max global norm.
:return: (grad, var) pair list, the original gradients' norm, the clipped gradients' norm
"""
grad_list = [grad for grad, _ in pair_list]
    clipped_list, raw_norm = tf.clip_by_global_norm(grad_list, max_norm)
    clipped_norm = tf.global_norm(clipped_list)
    pair_list = [(g, v) for g, (_, v) in zip(clipped_list, pair_list)]
    return pair_list, raw_norm, clipped_norm
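Because both the raw and the clipped global norms are returned, they can be logged to monitor how often clipping actually kicks in; a possible wiring (the optimizer, threshold, summary names, and `loss` tensor are assumptions):

opt = tf.train.AdamOptimizer(1e-3)
pair_list = opt.compute_gradients(loss)
pair_list, raw_norm, clipped_norm = clip_gradient(pair_list, max_norm=5.0)
tf.summary.scalar('grad_norm/raw', raw_norm)
tf.summary.scalar('grad_norm/clipped', clipped_norm)
train_op = opt.apply_gradients(pair_list)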
def _backward(self, loss, summaries=False):
hps = self.hps
loss = loss * hps.num_steps
emb_vars = find_trainable_variables("emb")
lstm_vars = find_trainable_variables("LSTM")
softmax_vars = find_trainable_variables("softmax")
all_vars = emb_vars + lstm_vars + softmax_vars
grads = tf.gradients(loss, all_vars)
orig_grads = grads[:]
emb_grads = grads[:len(emb_vars)]
grads = grads[len(emb_vars):]
for i in range(len(emb_grads)):
assert isinstance(emb_grads[i], tf.IndexedSlices)
emb_grads[i] = tf.IndexedSlices(emb_grads[i].values * hps.batch_size, emb_grads[i].indices,
emb_grads[i].dense_shape)
lstm_grads = grads[:len(lstm_vars)]
softmax_grads = grads[len(lstm_vars):]
lstm_grads, lstm_norm = tf.clip_by_global_norm(lstm_grads, hps.max_grad_norm)
clipped_grads = emb_grads + lstm_grads + softmax_grads
assert len(clipped_grads) == len(orig_grads)
if summaries:
tf.scalar_summary("model/lstm_grad_norm", lstm_norm)
tf.scalar_summary("model/lstm_grad_scale", tf.minimum(hps.max_grad_norm / lstm_norm, 1.0))
tf.scalar_summary("model/lstm_weight_norm", tf.global_norm(lstm_vars))
# for v, g, cg in zip(all_vars, orig_grads, clipped_grads):
# name = v.name.lstrip("model/")
# tf.histogram_summary(name + "/var", v)
# tf.histogram_summary(name + "/grad", g)
# tf.histogram_summary(name + "/clipped_grad", cg)
return list(zip(clipped_grads, all_vars))
def apply_gradients(self, grads):
coldOptim = tf.train.MomentumOptimizer(
self._cold_lr, self._momentum)
def coldSGDstart():
sgd_grads, sgd_var = zip(*grads)
        if self.max_grad_norm is not None:
            sgd_grads, sgd_grad_norm = tf.clip_by_global_norm(sgd_grads, self.max_grad_norm)
            sgd_grads = list(zip(sgd_grads, sgd_var))
sgd_step_op = tf.assign_add(self.sgd_step, 1)
coldOptim_op = coldOptim.apply_gradients(sgd_grads)
if KFAC_DEBUG:
with tf.control_dependencies([sgd_step_op, coldOptim_op]):
sgd_step_op = tf.Print(
sgd_step_op, [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')])
return tf.group(*[sgd_step_op, coldOptim_op])
kfacOptim_op, qr = self.apply_gradients_kfac(grads)
def warmKFACstart():
return kfacOptim_op
return tf.cond(tf.greater(self.sgd_step, self._cold_iter), warmKFACstart, coldSGDstart), qr
def __call__(self, enc_input, dec_input_indices, valid_indices, left_indices, right_indices, values, valid_masks=None):
batch_size = tf.shape(enc_input)[0]
# forward computation graph
with tf.variable_scope(self.scope):
# encoder output
enc_memory, enc_final_state_fw, _ = self.encoder(enc_input)
# decoder
dec_hiddens, dec_actions, dec_act_logps = self.decoder(
enc_memory, dec_input_indices,
valid_indices, left_indices, right_indices,
valid_masks, init_state=enc_final_state_fw)
# cost
costs = []
update_ops = []
for step_idx, (act_logp, value, baseline) in enumerate(zip(dec_act_logps, values, self.baselines)):
# costs.append(-tf.reduce_mean(act_logp * (value - baseline)))
new_baseline = self.bl_ratio * baseline + (1-self.bl_ratio) * tf.reduce_mean(value)
costs.append(-tf.reduce_mean(act_logp * value))
update_ops.append(tf.assign(baseline, new_baseline))
# gradient computation graph
self.params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope)
train_ops = []
for limit in self.buckets:
        print('0 ~ %d' % (limit - 1))
grad_params = tf.gradients(tf.reduce_sum(tf.pack(costs[:limit])), self.params)
if self.max_grad_norm is not None:
clipped_gradients, norm = tf.clip_by_global_norm(grad_params, self.max_grad_norm)
else:
clipped_gradients = grad_params
train_op = self.optimizer.apply_gradients(
zip(clipped_gradients, self.params))
with tf.control_dependencies([train_op] + update_ops[:limit]):
# train_ops.append(tf.Print(tf.constant(1.), [norm]))
train_ops.append(tf.constant(1.))
return dec_hiddens, dec_actions, train_ops
#### test script
def define(self, char_num, rnn_dim, emb_dim, max_x, max_y, write_trans_model=True):
self.decode_step = max_y
self.encode_step = max_x
self.en_vec = [tf.placeholder(tf.int32, [None], name='en_input' + str(i)) for i in range(max_x)]
self.trans_labels = [tf.placeholder(tf.int32, [None], name='de_input' + str(i)) for i in range(max_y)]
weights = [tf.cast(tf.sign(ot_t), tf.float32) for ot_t in self.trans_labels]
self.de_vec = [tf.zeros_like(self.trans_labels[0], tf.int32)] + self.trans_labels[:-1]
self.feed_previous = tf.placeholder(tf.bool)
self.trans_l_rate = tf.placeholder(tf.float32, [], name='learning_rate')
seq_cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_dim, state_is_tuple=True)
self.trans_output, states = seq2seq.embedding_attention_seq2seq(self.en_vec, self.de_vec, seq_cell, char_num,
char_num, emb_dim, feed_previous=self.feed_previous)
loss = seq2seq.sequence_loss(self.trans_output, self.trans_labels, weights)
optimizer = tf.train.AdagradOptimizer(learning_rate=self.trans_l_rate)
params = tf.trainable_variables()
gradients = tf.gradients(loss, params)
clipped_gradients, norm = tf.clip_by_global_norm(gradients, 5.0)
self.trans_train = optimizer.apply_gradients(zip(clipped_gradients, params))
self.saver = tf.train.Saver()
if write_trans_model:
param_dic = {}
param_dic['char_num'] = char_num
param_dic['rnn_dim'] = rnn_dim
param_dic['emb_dim'] = emb_dim
param_dic['max_x'] = max_x
param_dic['max_y'] = max_y
# print param_dic
            with open(self.trained + '_model', 'wb') as f_model:
                pickle.dump(param_dic, f_model)
def build(H, dat, sess):
with open(META_DIR + 'fp.json') as fpj:
meta = json.load(fpj)
bsize = H['batch_size']
x = tf.placeholder(tf.float32, shape = [64, 64, 1])
y = tf.placeholder(tf.float32, shape = [1,])
training = tf.placeholder(tf.bool)
fptrunk = FPTrunk(dat, x, y, bsize, sess)
Xt, Yt = tf.train.batch(fptrunk.q['train'].dequeue(), batch_size = bsize, capacity = bsize)
Xv, Yv = tf.train.batch(fptrunk.q['valid'].dequeue(), batch_size = bsize, capacity = bsize)
logits, preds = model(H, Xt, training)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
logits=logits, labels=tf.cast(Yt, tf.float32)))
varst = tf.trainable_variables()
gstep = tf.Variable(0, trainable = False)
opts = {
'RMS': tf.train.RMSPropOptimizer,
'Adam': tf.train.AdamOptimizer,
'SGD': tf.train.GradientDescentOptimizer,
'Adagrad': tf.train.AdagradOptimizer
}
opt = opts[H['opt']](learning_rate = H['lr'])
grads_vars = opt.compute_gradients(loss, varst)
    grads = [gv[0] for gv in grads_vars]
    tvars = [gv[1] for gv in grads_vars]
    capped, norm = tf.clip_by_global_norm(grads, H['norm_clip'])
    train_opt = opt.apply_gradients(list(zip(capped, tvars)), global_step=gstep)
saver = tf.train.Saver(max_to_keep = None)
return (x, y, training, Xt, Yt, Xv, Yv, logits, loss, preds, opt, varst, gstep, train_opt, saver, fptrunk)