import re

import tensorflow as tf  # TF 1.x API (tf.variable_scope, tf.train.AdamOptimizer)


def _create_optimizer(self):
    # Base learning rate; fall back to 1e-4 when no optimizer args are supplied
    lr = self.optimizer_args['lr'] if self.optimizer_args else 1e-4
    with tf.variable_scope('optimize', reuse=not self.is_train):
        # Set up global_step and an exponentially decaying learning rate
        self.global_step = tf.get_variable('global_step', shape=(),
                                           initializer=tf.constant_initializer(0.0),
                                           trainable=False)
        self.learning_rate = tf.train.exponential_decay(lr, self.global_step,
                                                        1e5, 0.9, staircase=True)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                                name='optimizer')
        # Following the original paper's code, the learning rate of bias variables
        # is 2x the base learning rate; implemented here by doubling their gradients.
        grads_vars = self.optimizer.compute_gradients(self.loss)
        bias_pattern = re.compile('.*/b')
        grads_vars_mult = []
        for grad, var in grads_vars:
            if grad is None:  # skip variables that do not contribute to the loss
                continue
            if bias_pattern.match(var.op.name):
                grads_vars_mult.append((grad * 2.0, var))
            else:
                grads_vars_mult.append((grad, var))
        # According to the original paper, gradients are clipped to [-0.1, 0.1]
        grads_clip = [(tf.clip_by_value(grad, -0.1, 0.1), var)
                      for grad, var in grads_vars_mult]
        self.train = self.optimizer.apply_gradients(grads_clip,
                                                    global_step=self.global_step)
Source file: model.py
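For reference, here is a minimal sketch of how the training op built above might be driven with the TF 1.x session API. The attributes train, loss, learning_rate, and global_step come from _create_optimizer; the feed_fn helper and num_steps are hypothetical placeholders and are not part of model.py.

# Minimal training-loop sketch (TF 1.x). `feed_fn` and `num_steps` are
# hypothetical placeholders; the fetched ops come from _create_optimizer above.
import tensorflow as tf

def run_training(model, feed_fn, num_steps=1000):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(num_steps):
            # Run one optimization step and fetch the current loss, lr, and step
            _, loss_val, lr_val, step = sess.run(
                [model.train, model.loss, model.learning_rate, model.global_step],
                feed_dict=feed_fn())
            if step % 100 == 0:
                print('step %d  loss %.4f  lr %.6f' % (step, loss_val, lr_val))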