def _generate_labels(self, overlaps):
    labels = tf.Variable(tf.ones(shape=(tf.shape(overlaps)[0],), dtype=tf.float32) * -1, trainable=False,
                         validate_shape=False)
    gt_max_overlaps = tf.arg_max(overlaps, dimension=0)
    anchor_max_overlaps = tf.arg_max(overlaps, dimension=1)
    mask = tf.one_hot(anchor_max_overlaps, tf.shape(overlaps)[1], on_value=True, off_value=False)
    max_overlaps = tf.boolean_mask(overlaps, mask)
    if self._debug:
        max_overlaps = tf.Print(max_overlaps, [max_overlaps])
    labels = tf.scatter_update(labels, gt_max_overlaps, tf.ones((tf.shape(gt_max_overlaps)[0],)))
    # TODO: extract config object
    over_threshold_mask = tf.reshape(tf.where(max_overlaps > 0.5), (-1,))
    if self._debug:
        over_threshold_mask = tf.Print(over_threshold_mask, [over_threshold_mask], message='over threshold index : ')
    labels = tf.scatter_update(labels, over_threshold_mask, tf.ones((tf.shape(over_threshold_mask)[0],)))
    # TODO: support clobber_positives as in the original implementation
    below_threshold_mask = tf.reshape(tf.where(max_overlaps < 0.3), (-1,))
    if self._debug:
        below_threshold_mask = tf.Print(below_threshold_mask, [below_threshold_mask], message='below threshold index : ')
    labels = tf.scatter_update(labels, below_threshold_mask, tf.zeros((tf.shape(below_threshold_mask)[0],)))
    return labels
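Below is a minimal, self-contained sketch (not from the original repo) of the same label-assignment pattern on a toy 4-anchor x 2-ground-truth IoU matrix. It replaces the one_hot/boolean_mask trick with a plain reduce_max for readability, and the 0.5 / 0.3 thresholds mirror the TODO-marked constants above.

import tensorflow as tf

overlaps = tf.constant([[0.10, 0.60],
                        [0.70, 0.20],
                        [0.20, 0.10],
                        [0.05, 0.40]])             # anchors x ground truths
labels = tf.Variable([-1.0, -1.0, -1.0, -1.0], trainable=False)

anchor_max = tf.reduce_max(overlaps, axis=1)       # best IoU per anchor
gt_argmax = tf.argmax(overlaps, axis=0)            # best anchor per ground truth

# Positive: anchors that are the best match for some ground truth box,
# or whose best IoU exceeds the foreground threshold.
labels = tf.scatter_update(labels, gt_argmax,
                           tf.ones(tf.shape(gt_argmax), dtype=tf.float32))
pos_idx = tf.reshape(tf.where(anchor_max > 0.5), (-1,))
labels = tf.scatter_update(labels, pos_idx, tf.ones(tf.shape(pos_idx)))
# Negative: anchors whose best IoU falls below the background threshold.
neg_idx = tf.reshape(tf.where(anchor_max < 0.3), (-1,))
labels = tf.scatter_update(labels, neg_idx, tf.zeros(tf.shape(neg_idx)))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(labels))   # [ 1.  1.  0. -1.]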
def test_scatter_nd_2():
    gt_bboxes = tf.constant([[0, 0, 1, 2], [1, 0, 3, 4], [100, 100, 105, 102.5]])
    gt_labels = tf.constant([1, 2, 6])
    gt_anchors_labels = tf.Variable([100, 100, 100, 100], trainable=False,
                                    collections=[ops.GraphKeys.LOCAL_VARIABLES])
    gt_anchors_bboxes = tf.Variable([[100, 100, 105, 105], [2, 1, 3, 3.5], [0, 0, 10, 10], [0.5, 0.5, 0.8, 1.5]],
                                    trainable=False, collections=[ops.GraphKeys.LOCAL_VARIABLES], dtype=tf.float32)
    max_inds = [1, 0, 3]
    gt_anchors_labels = tf.scatter_update(gt_anchors_labels, max_inds, gt_labels)
    gt_anchors_bboxes = tf.scatter_update(gt_anchors_bboxes, max_inds, gt_bboxes)
    return gt_anchors_labels, gt_anchors_bboxes
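A possible way to run the snippet above (TF 1.x), assuming `ops` was imported as `from tensorflow.python.framework import ops`. Because the variables are created in the LOCAL_VARIABLES collection, they need the local-variables initializer rather than the global one.

labels_op, bboxes_op = test_scatter_nd_2()
with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    labels, bboxes = sess.run([labels_op, bboxes_op])
    print(labels)   # [  2   1 100   6]: rows 1, 0, 3 receive labels 1, 2, 6
    print(bboxes)   # rows 1, 0, 3 receive the three gt_bboxes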
def make_update_op(self, upd_idxs, upd_keys, upd_vals,
                   batch_size, use_recent_idx, intended_output):
    """Function that creates all the update ops."""
    mem_age_incr = self.mem_age.assign_add(tf.ones([self.memory_size],
                                                   dtype=tf.float32))
    with tf.control_dependencies([mem_age_incr]):
        mem_age_upd = tf.scatter_update(
            self.mem_age, upd_idxs, tf.zeros([batch_size], dtype=tf.float32))

    mem_key_upd = tf.scatter_update(
        self.mem_keys, upd_idxs, upd_keys)
    mem_val_upd = tf.scatter_update(
        self.mem_vals, upd_idxs, upd_vals)

    if use_recent_idx:
        recent_idx_upd = tf.scatter_update(
            self.recent_idx, intended_output, upd_idxs)
    else:
        recent_idx_upd = tf.group()

    return tf.group(mem_age_upd, mem_key_upd, mem_val_upd, recent_idx_upd)
def insert(self, ids, scores):
    """Insert the ids and scores into the TopN."""
    with tf.control_dependencies(self.last_ops):
        scatter_op = tf.scatter_update(self.id_to_score, ids, scores)
        larger_scores = tf.greater(scores, self.sl_scores[0])

        def shortlist_insert():
            larger_ids = tf.boolean_mask(tf.to_int64(ids), larger_scores)
            larger_score_values = tf.boolean_mask(scores, larger_scores)
            shortlist_ids, new_ids, new_scores = self.ops.top_n_insert(
                self.sl_ids, self.sl_scores, larger_ids, larger_score_values)
            u1 = tf.scatter_update(self.sl_ids, shortlist_ids, new_ids)
            u2 = tf.scatter_update(self.sl_scores, shortlist_ids, new_scores)
            return tf.group(u1, u2)

        # We only need to insert into the shortlist if there are any
        # scores larger than the threshold.
        cond_op = tf.cond(
            tf.reduce_any(larger_scores), shortlist_insert, tf.no_op)
        with tf.control_dependencies([cond_op]):
            self.last_ops = [scatter_op, cond_op]
def remove(self, ids):
    """Remove the ids (and their associated scores) from the TopN."""
    with tf.control_dependencies(self.last_ops):
        scatter_op = tf.scatter_update(
            self.id_to_score,
            ids,
            tf.ones_like(
                ids, dtype=tf.float32) * tf.float32.min)
        # We assume that removed ids are almost always in the shortlist,
        # so it makes no sense to hide the Op behind a tf.cond
        shortlist_ids_to_remove, new_length = self.ops.top_n_remove(self.sl_ids,
                                                                    ids)
        u1 = tf.scatter_update(
            self.sl_ids, tf.concat(0, [[0], shortlist_ids_to_remove]),
            tf.concat(0, [new_length,
                          tf.ones_like(shortlist_ids_to_remove) * -1]))
        u2 = tf.scatter_update(
            self.sl_scores,
            shortlist_ids_to_remove,
            tf.float32.min * tf.ones_like(
                shortlist_ids_to_remove, dtype=tf.float32))
        self.last_ops = [scatter_op, u1, u2]
def scatter_update(cls, factor, indices, values, sharding_func):
    """Helper function for doing sharded scatter update."""
    assert isinstance(factor, list)
    if len(factor) == 1:
        with ops.colocate_with(factor[0]):
            # TODO(agarwal): assign instead of scatter update for full batch update.
            return tf.scatter_update(factor[0], indices, values).op
    else:
        num_shards = len(factor)
        assignments, new_ids = sharding_func(indices)
        assert assignments is not None
        assignments = tf.cast(assignments, tf.int32)
        sharded_ids = tf.dynamic_partition(new_ids, assignments, num_shards)
        sharded_values = tf.dynamic_partition(values, assignments, num_shards)
        updates = []
        for i in xrange(num_shards):
            updates.append(tf.scatter_update(factor[i],
                                             sharded_ids[i],
                                             sharded_values[i]))
        return tf.group(*updates)
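For context, `sharding_func` above is expected to map global row ids to a pair of (shard assignment, within-shard row id). A hypothetical modulo-sharding implementation is sketched below; it is illustrative only and not the actual sharding code of the original project.

import tensorflow as tf

def make_mod_sharding_func(num_shards):
    """Hypothetical sharding_func: row i goes to shard i % num_shards,
    stored at row i // num_shards inside that shard."""
    def sharding_func(ids):
        assignments = tf.mod(ids, num_shards)    # which shard each id goes to
        new_ids = tf.floordiv(ids, num_shards)   # row index inside that shard
        return assignments, new_ids
    return sharding_func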
def _sparse_moving_average(self, x_tm1, idxs, a_t_, name, beta=.9):
    """"""
    b_tm1 = self.get_accumulator(x_tm1, '%s' % name)
    b_tm1_ = tf.gather(b_tm1, idxs)
    shape = self.get_variable_shape(x_tm1)
    tm1 = self.get_accumulator(x_tm1, '%s/tm1' % name, shape=[shape[0]] + [1] * (len(shape) - 1))
    tm1_ = tf.gather(tm1, idxs)
    t = tf.scatter_add(tm1, idxs, tf.ones_like(tm1_))
    t_ = tf.gather(t, idxs)
    if beta < 1:
        beta_t = tf.convert_to_tensor(beta, name='%s/decay' % name)
        beta_t_ = beta_t * (1 - beta_t**tm1_) / (1 - beta_t**t_)
    else:
        beta_t_ = tm1_ / t_
    b_t = tf.scatter_update(b_tm1, idxs, beta_t_ * b_tm1_)
    b_t = tf.scatter_add(b_t, idxs, (1 - beta_t_) * a_t_)
    return b_t, t

#=============================================================
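For reference, a reading of the `beta < 1` branch above (an interpretation of the code, not a statement from its author): with nominal decay $\beta$ and per-row update count $t$, the effective decay applied on the $t$-th update is

$$\beta_t = \beta \cdot \frac{1 - \beta^{\,t-1}}{1 - \beta^{\,t}}, \qquad b_t = \beta_t\, b_{t-1} + (1 - \beta_t)\, a_t,$$

which is the bias-corrected (debiased) exponential moving average, restricted to the rows selected by `idxs`. When `beta == 1`, the else branch reduces to a running mean, since $\beta_t = (t-1)/t$.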
def curvature_range(self):
    # set up the curvature window
    self._curv_win = tf.Variable(np.zeros([self._curv_win_width, ]), dtype=tf.float32,
                                 name="curv_win", trainable=False)
    self._curv_win = tf.scatter_update(self._curv_win,
                                       self._global_step % self._curv_win_width, self._grad_norm_squared)
    # note here the iterations start from iteration 0
    valid_window = tf.slice(self._curv_win, tf.constant([0, ]),
                            tf.expand_dims(tf.minimum(tf.constant(self._curv_win_width), self._global_step + 1), dim=0))
    self._h_min_t = tf.reduce_min(valid_window)
    self._h_max_t = tf.reduce_max(valid_window)

    curv_range_ops = []
    with tf.control_dependencies([self._h_min_t, self._h_max_t]):
        avg_op = self._moving_averager.apply([self._h_min_t, self._h_max_t])
        with tf.control_dependencies([avg_op]):
            self._h_min = tf.identity(self._moving_averager.average(self._h_min_t))
            self._h_max = tf.identity(self._moving_averager.average(self._h_max_t))
    curv_range_ops.append(avg_op)
    return curv_range_ops
def get_mu_tensor(self):
    const_fact = self._dist_to_opt_avg**2 * self._h_min**2 / 2 / self._grad_var
    coef = tf.Variable([-1.0, 3.0, 0.0, 1.0], dtype=tf.float32, name="cubic_solver_coef")
    coef = tf.scatter_update(coef, tf.constant(2), -(3 + const_fact))
    roots = tf.py_func(np.roots, [coef], Tout=tf.complex64, stateful=False)

    # filter out the correct root
    root_idx = tf.logical_and(tf.logical_and(tf.greater(tf.real(roots), tf.constant(0.0)),
                                             tf.less(tf.real(roots), tf.constant(1.0))),
                              tf.less(tf.abs(tf.imag(roots)), 1e-5))
    # in case there are two duplicated roots satisfying the above condition
    root = tf.reshape(tf.gather(tf.gather(roots, tf.where(root_idx)), tf.constant(0)), shape=[])
    tf.assert_equal(tf.size(root), tf.constant(1))

    dr = self._h_max / self._h_min
    mu = tf.maximum(tf.real(root)**2, ((tf.sqrt(dr) - 1) / (tf.sqrt(dr) + 1))**2)
    return mu
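A quick NumPy sanity check of the cubic being solved above (a sketch; the value of const_fact is arbitrary here). After the scatter_update, the coefficients describe the polynomial -x^3 + 3x^2 - (3 + c)x + 1, which for positive c is strictly decreasing and has exactly one real root, lying in (0, 1), so the filter in the TF code picks it out.

import numpy as np

c = 0.5                                          # arbitrary stand-in for const_fact
coef = np.array([-1.0, 3.0, -(3.0 + c), 1.0])    # highest degree first, as np.roots expects
roots = np.roots(coef)
real_roots = roots[np.abs(roots.imag) < 1e-5].real
root = real_roots[(real_roots > 0.0) & (real_roots < 1.0)][0]
print(root, np.polyval(coef, root))              # residual is ~0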
def _thin_stack_update_gradient(op, stack_grad, *rest):
    stack = op.inputs[2]
    batch_size = op.inputs[4].get_shape().as_list()[0]
    t = op.get_attr("timestep")

    # We usually slice off the head of the stack output in feedforward and
    # send it off to downstream computation. The Slice feedforward op will
    # generate a sparse gradient in the backward pass. Nix this sparsity
    # at the very start.
    if isinstance(stack_grad, ops.IndexedSlices):
        # Trick: re-use our stack structure to store new gradients.
        # Recover the original stack variable from the lookup/update chain.
        stack = _fetch_stack(stack)

        stack = tf.assign(stack, tf.zeros_like(stack))
        stack = tf.scatter_update(stack, stack_grad.indices, stack_grad.values)
        stack_grad = stack

    with tf.control_dependencies([stack_grad]):
        input_grad = tf.slice(stack_grad, [t * batch_size, 0], [batch_size, -1])

    return input_grad, None, stack_grad, None, None, None
def test_scatter_update():
    a = tf.Variable(initial_value=[2, 5, -4, 0])
    b = tf.scatter_update(a, [2, 2], [9, 100])
    return b
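A sketch of running the test above (TF 1.x). Note that the index 2 appears twice: tf.scatter_update documents the update order for duplicate indices as undefined, so index 2 may end up as either 9 or 100.

import tensorflow as tf

b = test_scatter_update()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(b))   # e.g. [  2   5 100   0]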
def _curvature_range(self):
    """Curvature range.

    Returns:
      h_max_t, h_min_t ops
    """
    self._curv_win = tf.get_variable("curv_win",
                                     dtype=tf.float32,
                                     trainable=False,
                                     shape=[self.curvature_window_width, ],
                                     initializer=tf.zeros_initializer)
    # We use log smoothing for curvature range
    self._curv_win = tf.scatter_update(self._curv_win,
                                       self._step % self.curvature_window_width,
                                       tf.log(self._grad_norm_squared))
    # Note here the iterations start from iteration 0
    valid_window = tf.slice(self._curv_win,
                            tf.constant([0, ]),
                            tf.expand_dims(
                                tf.minimum(
                                    tf.constant(self.curvature_window_width),
                                    self._step + 1), dim=0))
    self._h_min_t = tf.reduce_min(valid_window)
    self._h_max_t = tf.reduce_max(valid_window)

    curv_range_ops = []
    with tf.control_dependencies([self._h_min_t, self._h_max_t]):
        avg_op = self._moving_averager.apply([self._h_min_t, self._h_max_t])
        with tf.control_dependencies([avg_op]):
            self._h_min = tf.exp(
                tf.identity(self._moving_averager.average(self._h_min_t)))
            self._h_max = tf.exp(
                tf.identity(self._moving_averager.average(self._h_max_t)))
            if self._sparsity_debias:
                self._h_min *= self._sparsity_avg
                self._h_max *= self._sparsity_avg
    curv_range_ops.append(avg_op)
    return curv_range_ops  # h_max_t, h_min_t
def _apply_sparse(self, grad, var):
    beta1_power = tf.cast(self._beta1_power, var.dtype.base_dtype)
    beta2_power = tf.cast(self._beta2_power, var.dtype.base_dtype)
    lr_t = tf.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = tf.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = tf.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = tf.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = (lr_t * tf.sqrt(1 - beta2_power) / (1 - beta1_power))

    # m := beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = tf.scatter_update(m, grad.indices,
                            beta1_t * tf.gather(m, grad.indices) +
                            (1 - beta1_t) * grad.values,
                            use_locking=self._use_locking)

    # v := beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = tf.scatter_update(v, grad.indices,
                            beta2_t * tf.gather(v, grad.indices) +
                            (1 - beta2_t) * tf.square(grad.values),
                            use_locking=self._use_locking)

    # variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))
    m_t_slice = tf.gather(m_t, grad.indices)
    v_t_slice = tf.gather(v_t, grad.indices)
    denominator_slice = tf.sqrt(v_t_slice) + epsilon_t
    var_update = tf.scatter_sub(var, grad.indices,
                                lr * m_t_slice / denominator_slice,
                                use_locking=self._use_locking)
    return tf.group(var_update, m_t, v_t)
def get_state_update_op(state_variables, new_states):
    """Returns an operation to update an LSTM's state variables.

    See get_state_variables() for more info.

    Args:
        state_variables (tuple[tf.contrib.rnn.LSTMStateTuple]): The LSTM's state variables.
        new_states (tuple[tf.contrib.rnn.LSTMStateTuple]): The new values for the state variables.
            new_states may have state tuples with state sizes < max_batch_size. Then, only the first
            rows of the corresponding state variables will be updated.

    Returns:
        tf.Operation: An operation that updates the LSTM's state variables.
    """
    # Add an operation to update the train states with the last state tensors.
    update_ops = []
    for state_variable, new_state in zip(state_variables, new_states):
        # new_state[0] might be smaller than state_variable[0], because state_variable[0]
        # contains max_batch_size entries.

        # Get the update indices for both states in the tuple.
        update_indices = (tf.range(0, tf.shape(new_state[0])[0]),
                          tf.range(0, tf.shape(new_state[1])[0]))
        update_ops.extend([
            tf.scatter_update(state_variable[0], update_indices[0], new_state[0]),
            tf.scatter_update(state_variable[1], update_indices[1], new_state[1])
        ])
    return tf.tuple(update_ops)
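The docstring refers to get_state_variables(), which is not part of this listing. A hypothetical companion consistent with the update op above (non-trainable per-layer state variables sized for max_batch_size rows, assuming a tf.contrib.rnn.MultiRNNCell of LSTM cells) might look like this:

import tensorflow as tf

def get_state_variables(max_batch_size, cell):
    """Hypothetical companion: one LSTMStateTuple of variables per layer."""
    state_variables = []
    for state_c, state_h in cell.zero_state(max_batch_size, tf.float32):
        state_variables.append(tf.contrib.rnn.LSTMStateTuple(
            tf.Variable(state_c, trainable=False),
            tf.Variable(state_h, trainable=False)))
    # Returned as a tuple so it can serve as dynamic_rnn's initial_state.
    return tuple(state_variables)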
def test_state_grads():
    with tf.Session() as sess:
        v = tf.Variable([0., 0., 0.])
        x = tf.ones((3,))

        y0 = tf.assign(v, x)
        y1 = tf.assign_add(v, x)

        grad0 = tf.gradients(y0, [v, x])
        grad1 = tf.gradients(y1, [v, x])

        grad_vals = sess.run((grad0, grad1))

        assert np.allclose(grad_vals[0][0], 0)
        assert np.allclose(grad_vals[0][1], 1)
        assert np.allclose(grad_vals[1][0], 1)
        assert np.allclose(grad_vals[1][1], 1)

    with tf.Session() as sess:
        v = tf.Variable([0., 0., 0.])
        x = tf.ones((1,))

        y0 = tf.scatter_update(v, [0], x)
        y1 = tf.scatter_add(v, [0], x)

        grad0 = tf.gradients(y0, [v._ref(), x])
        grad1 = tf.gradients(y1, [v._ref(), x])

        grad_vals = sess.run((grad0, grad1))

        assert np.allclose(grad_vals[0][0], [0, 1, 1])
        assert np.allclose(grad_vals[0][1], 1)
        assert np.allclose(grad_vals[1][0], 1)
        assert np.allclose(grad_vals[1][1], 1)
def _scatter_f_var(self, dst, src, mode="update"):
# create a temporary variable for dst so that we can use the sparse
# variable updates. despite this looking incredibly inefficient, it is
# actually faster than the scatter_nd approach
# from tensorflow.python.ops import gen_state_ops
# var = gen_state_ops._temporary_variable(
# self.bases[dst.key].get_shape(), self.bases[dst.key].dtype)
# var_name = var.op.name
# var = tf.assign(var, self.bases[dst.key])
var = self.bases[dst.key]
if (dst.as_slice is not None and
var.get_shape().is_compatible_with(src.get_shape()) and
dst.indices[0] == 0 and
dst.indices[-1] == var.get_shape()[0].value - 1 and
len(dst.indices) == var.get_shape()[0]):
if mode == "inc":
result = tf.assign_add(var, src, use_locking=False)
else:
result = tf.assign(var, src, use_locking=False)
elif mode == "inc":
result = tf.scatter_add(var, dst.tf_indices, src,
use_locking=False)
else:
result = tf.scatter_update(var, dst.tf_indices, src,
use_locking=False)
# result = gen_state_ops._destroy_temporary_variable(var, var_name)
return result
def combine_messages(self, forward_messages, backward_messages, self_loop_messages, previous_code, mode='train'):
    mtr_f = self.get_graph().forward_incidence_matrix(normalization=('none', 'recalculated'))
    mtr_b = self.get_graph().backward_incidence_matrix(normalization=('none', 'recalculated'))

    if mode == 'train':
        forward_messages_comp = forward_messages - tf.nn.embedding_lookup(self.cached_messages_f, self.I)
        backward_messages_comp = backward_messages - tf.nn.embedding_lookup(self.cached_messages_b, self.I)

        with tf.control_dependencies([forward_messages, backward_messages]):
            self.f_upd = tf.scatter_update(self.cached_messages_f, self.I, forward_messages)
            self.b_upd = tf.scatter_update(self.cached_messages_b, self.I, backward_messages)

        collected_messages_f = tf.sparse_tensor_dense_matmul(mtr_f, forward_messages_comp)
        collected_messages_b = tf.sparse_tensor_dense_matmul(mtr_b, backward_messages_comp)
        new_embedding = collected_messages_f + collected_messages_b
        updated_vertex_embeddings = new_embedding + self.cached_vertex_embeddings

        with tf.control_dependencies([updated_vertex_embeddings]):
            self.v_upd = tf.assign(self.cached_vertex_embeddings, updated_vertex_embeddings)
    else:
        collected_messages_f = tf.sparse_tensor_dense_matmul(mtr_f, forward_messages)
        collected_messages_b = tf.sparse_tensor_dense_matmul(mtr_b, backward_messages)
        new_embedding = collected_messages_f + collected_messages_b
        updated_vertex_embeddings = new_embedding

    if self.use_nonlinearity:
        activated = tf.nn.relu(updated_vertex_embeddings + self_loop_messages)
    else:
        activated = updated_vertex_embeddings + self_loop_messages

    return activated
def floaty_scatter_update(ref, indices, updates, **kwargs):
    return tf.scatter_update(ref, tf.to_int32(indices), updates, **kwargs)
def __create_embedding_ops(self, last_hidden):
    if self.n_embeddings > 0:  # Preallocate memory to save embeddings
        self.embedding_var = tf.Variable(tf.zeros([self.n_embeddings, self.layers_size[-2]]), name='representation')
        self.next_embedding = tf.Variable(tf.zeros([1], dtype=tf.int32), name="next_embedding_counter")
        self.save_embedding_op = tf.scatter_update(self.embedding_var, self.next_embedding, last_hidden)
        self.increment_next_embedding_op = self.next_embedding.assign_add(tf.constant([1]))
        self.embeddings_saver = tf.train.Saver([self.embedding_var])
def update_diff(self, accuracy, batch_idxs, batch_losses, batch_plens, loss_w=0.5, smooth_w=0.5):
    with tf.control_dependencies([tf.assign(self.acc_coef, accuracy)]):
        current_entropy = tf.gather(self.seq_entropy, batch_idxs)
        loss_coef = batch_losses / (tf.reduce_max(batch_losses) + 1e-8)
        new_entropy = (loss_coef * loss_w) + (batch_plens / self.max_plen * (1 - loss_w))
        updated_entropy = (current_entropy * smooth_w) + (new_entropy * (1 - smooth_w))
        update_op = tf.scatter_update(self.seq_entropy, batch_idxs, updated_entropy)
    return update_op
def batch_norm_layer_in_time(x, max_length, step, is_training, epsilon=1e-3, decay=0.99, scope="layer"):
'''Assume 2d [batch, values] 3d [batch, width, values] or 4d [batch, width, height, values] tensor'''
with tf.variable_scope('bn_'+scope):
dim_x = len(x.get_shape().as_list())
size = x.get_shape().as_list()[dim_x-1]
step_idcs = tf.range(step*size, (step+1)*size)
scale_var = tf.get_variable('scale', [size * max_length], initializer=tf.constant_initializer(0.1))
scale = tf.gather(scale_var, step_idcs)
offset_var = tf.get_variable('offset', [size * max_length])
offset = tf.gather(offset_var, step_idcs)
pop_mean_var = tf.get_variable('pop_mean', [size * max_length], initializer=tf.zeros_initializer(), trainable=False)
pop_mean = tf.gather(pop_mean_var, step_idcs)
pop_var_var = tf.get_variable('pop_var', [size * max_length], initializer=tf.ones_initializer(), trainable=False)
pop_var = tf.gather(pop_var_var, step_idcs)
batch_mean, batch_var = tf.nn.moments(x, [i for i in range(dim_x-1)])
train_mean_op = tf.scatter_update(pop_mean_var, step_idcs, pop_mean * decay + batch_mean * (1 - decay))
train_var_op = tf.scatter_update(pop_var_var, step_idcs, pop_var * decay + batch_var * (1 - decay))
def batch_statistics():
with tf.control_dependencies([train_mean_op, train_var_op]):
return tf.nn.batch_normalization(x, batch_mean, batch_var, offset, scale, epsilon)
def population_statistics():
return tf.nn.batch_normalization(x, pop_mean, pop_var, offset, scale, epsilon)
if is_training:
return batch_statistics()
else:
return population_statistics()
def run(self, x, eta, idx_center=None, idx_sample=None):
    h = [None] * self.num_layer
    embeddings = []
    reg_ops = []
    reset_ops = []
    clustering_ops = []

    with tf.variable_scope(self.scope):
        for ii in xrange(self.num_layer):
            with tf.variable_scope('layer_{}'.format(ii)):
                if ii == 0:
                    input_vec = x
                else:
                    input_vec = h[ii - 1]

                h[ii] = tf.matmul(input_vec, self.w[ii])

                if self.add_bias:
                    h[ii] += self.b[ii]

                if self.clustering_shape[ii] is not None:
                    embedding = h[ii]
                    embeddings += [embedding]
                    clustering_ops += [kmeans_clustering(embedding, self.cluster_center[ii],
                                                         self.cluster_label[ii], self.num_cluster[ii], eta)]
                    sample_center = tf.stop_gradient(
                        tf.gather(self.cluster_center[ii], self.cluster_label[ii]))
                    reg_ops += [tf.reduce_mean(
                        tf.square(embedding - sample_center)) * self.alpha[ii] / 2.0]
                    reset_ops += [tf.scatter_update(self.cluster_center[ii], idx_center[ii],
                                                    tf.gather(h[ii], idx_sample[ii]))]

                if self.act_func and self.act_func[ii] is not None:
                    h[ii] = self.act_func[ii](h[ii])

    return h, embeddings, clustering_ops, reg_ops, reset_ops
def curvature_range(self):
    # set up the curvature window
    self._curv_win = tf.Variable(
        np.zeros([self._curv_win_width, ]), dtype=tf.float32,
        name="curv_win", trainable=False)
    # we can use log smoothing for curvature range to follow trend faster
    # self._curv_win = tf.scatter_update(
    #     self._curv_win, self._global_step % self._curv_win_width,
    #     tf.log(self._grad_norm_squared + EPS))
    self._curv_win = tf.scatter_update(
        self._curv_win, self._global_step % self._curv_win_width,
        self._grad_norm_squared + EPS)
    # note here the iterations start from iteration 0
    valid_window = tf.slice(
        self._curv_win, tf.constant([0, ]), tf.expand_dims(
            tf.minimum(tf.constant(self._curv_win_width),
                       self._global_step + 1), dim=0))

    if self._h_min_log_smooth:
        self._h_min_t = tf.log(tf.reduce_min(valid_window) + EPS)
    else:
        self._h_min_t = tf.reduce_min(valid_window)
    if self._h_max_log_smooth:
        self._h_max_t = tf.log(tf.reduce_max(valid_window) + EPS)
    else:
        self._h_max_t = tf.reduce_max(valid_window)

    curv_range_ops = []
    with tf.control_dependencies([self._h_min_t, self._h_max_t]):
        avg_op = self._moving_averager.apply([self._h_min_t, self._h_max_t])
        with tf.control_dependencies([avg_op]):
            if self._h_min_log_smooth:
                self._h_min = tf.exp(
                    tf.identity(self._moving_averager.average(self._h_min_t)))
            else:
                self._h_min = \
                    tf.identity(self._moving_averager.average(self._h_min_t))
            if self._h_max_log_smooth:
                self._h_max = tf.exp(
                    tf.identity(self._moving_averager.average(self._h_max_t)))
            else:
                self._h_max = \
                    tf.identity(self._moving_averager.average(self._h_max_t))
            if self._sparsity_debias:
                self._h_min = self._h_min * self._sparsity_avg
                self._h_max = self._h_max * self._sparsity_avg
    curv_range_ops.append(avg_op)
    return curv_range_ops
def cnn_sen_enc(word_vocab_size,
                word_embed_size=50,
                batch_size=20,
                num_highway_layers=2,
                max_sen_length=65,
                kernels=[1, 2, 3, 4, 5, 6, 7],
                kernel_features=[50, 100, 150, 200, 200, 200, 200],
                max_doc_length=35,
                pretrained=None):
    # cnn sentence encoder
    assert len(kernels) == len(kernel_features), 'Kernel and Features must have the same size'

    input_ = tf.placeholder(tf.int32, shape=[batch_size, max_doc_length, max_sen_length], name="input")

    ''' First, embed words to sentence '''
    with tf.variable_scope('embedding'):
        if pretrained is not None:
            word_embedding = tf.get_variable(name='word_embedding', shape=[word_vocab_size, word_embed_size],
                                             initializer=tf.constant_initializer(pretrained))
        else:
            word_embedding = tf.get_variable(name='word_embedding', shape=[word_vocab_size, word_embed_size])

        ''' this op clears the embedding vector of the first symbol (the symbol at position 0, which is by
        convention the padding symbol). It can be used to mimic the Torch7 embedding operator, which keeps the
        padding mapped to a zero embedding vector and ignores gradient updates. To do that in TF:
        1. after parameter initialization, apply this op to zero out the padding embedding vector
        2. after each gradient update, apply this op to keep the padding at zero '''
        clear_word_embedding_padding = tf.scatter_update(word_embedding, [0],
                                                         tf.constant(0.0, shape=[1, word_embed_size]))

        # [batch_size, max_doc_length, max_sen_length, word_embed_size]
        input_embedded = tf.nn.embedding_lookup(word_embedding, input_)
        input_embedded = tf.reshape(input_embedded, [-1, max_sen_length, word_embed_size])

    ''' Second, apply convolutions '''
    # [batch_size x max_doc_length, cnn_size]    # where cnn_size = sum(kernel_features)
    input_cnn = tdnn(input_embedded, kernels, kernel_features)

    ''' Maybe apply Highway '''
    if num_highway_layers > 0:
        input_cnn = highway(input_cnn, input_cnn.get_shape()[-1], num_layers=num_highway_layers)

    return adict(
        input=input_,
        clear_word_embedding_padding=clear_word_embedding_padding,
        input_embedded=input_embedded,
        input_cnn=input_cnn
    )
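A sketch of how the returned clear_word_embedding_padding op might be used, following the two steps described in the comment above. The names train_op and batches are hypothetical placeholders (built elsewhere from a loss over model.input_cnn and from the data pipeline), and adict is assumed to allow attribute access to the returned fields.

model = cnn_sen_enc(word_vocab_size=10000)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(model.clear_word_embedding_padding)       # 1. zero the padding row after init
    for batch in batches:                              # hypothetical batch iterator
        sess.run(train_op, feed_dict={model.input: batch})
        sess.run(model.clear_word_embedding_padding)   # 2. re-zero the padding after each update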