def apply_time_pooling(inputs, sequence_length, stride, pooling_avg=False):
    """
    Shorten `inputs` along axis 1 by a factor of `stride`.

    Without averaging, only every `stride`-th step (starting at step 0) is kept.
    With averaging, the `stride` interleaved sub-sequences `inputs[:, i::stride, :]`
    are zero-padded at the end to the length of the first one, summed, and divided
    by `stride` (so a trailing incomplete window is also divided by `stride`).

    :param inputs: rank-3 tensor whose last dimension is statically known
    :param sequence_length: lengths to rescale alongside the inputs
    :param stride: Python integer pooling factor
    :param pooling_avg: average over each window instead of sub-sampling
    :return: tuple (pooled inputs, sequence_length divided by stride, rounded up)
    """
    # Dimensions of the incoming tensor; the last one is read from the static shape.
    dims = [tf.shape(inputs)[0], tf.shape(inputs)[1], inputs.get_shape()[2].value]

    if not pooling_avg:
        pooled = inputs[:, ::stride, :]
    else:
        phases = [inputs[:, offset::stride, :] for offset in range(stride)]
        longest = tf.shape(phases[0])[1]
        for k in range(1, stride):
            # Later phases can be shorter than the first; pad them at the end of axis 1.
            missing = longest - tf.shape(phases[k])[1]
            phases[k] = tf.pad(phases[k], paddings=tf.stack([[0, 0], [0, missing], [0, 0]]))
        pooled = tf.reduce_sum(phases, axis=0) / len(phases)

    # Reshape so that the static size of the last dimension is kept.
    target_shape = tf.stack([dims[0], tf.shape(pooled)[1], dims[2]])
    pooled = tf.reshape(pooled, target_shape)

    new_length = (sequence_length + stride - 1) // stride  # rounding up
    return pooled, new_length
# Example source code using Python's stack()
def reinforce_baseline(decoder_states, reward):
    """
    Subtract a learned baseline from the reward.

    The baseline is a single-unit linear layer (named 'reward_baseline') applied to
    the decoder states. Gradients are stopped at the states, so the layer's
    gradients do not propagate back into the decoder states.

    :param decoder_states: internal states of the decoder, tensor of shape (batch_size, time_steps, state_size)
    :param reward: reward for each time step, tensor of shape (batch_size, time_steps)
    :return: reward - computed baseline, tensor of shape (batch_size, time_steps)
    """
    frozen_states = tf.stop_gradient(decoder_states)
    kernel_init = tf.constant_initializer(0.01)
    projected = dense(frozen_states, units=1, activation=None, name='reward_baseline',
                      kernel_initializer=kernel_init)
    # Drop the trailing unit dimension produced by the 1-unit layer.
    return reward - tf.squeeze(projected, axis=2)
def zoomout(image, gt_bboxes, params):
    """Place `image` at a random position on a larger canvas and rescale the boxes.

    The canvas is `X_out` times the image size, with `X_out` drawn uniformly from
    [1.05, params['X_out']). The border is filled with params['zoomout_color']
    (a list) extended by one extra 0 entry.

    Args:
      image: rank-3 image tensor; its shape is unstacked into (h, w, _).
      gt_bboxes: tensor whose axis 1 unstacks into (x, y, w, h).
        NOTE(review): the arithmetic treats these as fractions of the image
        size - confirm against the caller.
      params: dict providing 'X_out' and 'zoomout_color'.

    Returns:
      Tuple (padded image, shifted boxes divided by X_out).
    """
    X_out = tf.random_uniform([], 1.05, params['X_out'])
    img_shape = tf.to_float(tf.shape(image))
    h, w, _ = tf.unstack(img_shape)

    fill = tf.constant(params['zoomout_color'] + [0], dtype=tf.float32)

    # Random offset of the original image inside the enlarged canvas.
    x_shift = tf.random_uniform([], 0, (X_out - 1) * w)
    y_shift = tf.random_uniform([], 0, (X_out - 1) * h)
    x2_shift = (X_out - 1) * w - x_shift
    y2_shift = (X_out - 1) * h - y_shift

    # tf.pad only pads with zeros here, so shift the image by the fill colour,
    # pad, and shift back: the zero border then ends up equal to the fill colour.
    image = image - fill
    pad_widths = tf.to_int32([[y_shift, y2_shift], [x_shift, x2_shift], [0, 0]])
    image = tf.pad(image, pad_widths)
    image = image + fill

    gt_x, gt_y, gt_w, gt_h = tf.unstack(gt_bboxes, axis=1)
    moved_x = gt_x + x_shift / w
    moved_y = gt_y + y_shift / h
    gt_bboxes = tf.stack([moved_x, moved_y, gt_w, gt_h], axis=1) / X_out
    return image, gt_bboxes
def encode_bboxes_tf(proposals, gt, config):
    """Encode ground-truth boxes relative to proposals for the regression loss.

    Both inputs carry (x, y, w, h) in their last dimension. The result holds the
    centre offsets normalised by the proposal size and the log size ratios, each
    divided by the matching entry of config['prior_variance'].

    Args:
      proposals: tensor with last dimension (x, y, w, h).
      gt: tensor with last dimension (x, y, w, h), matching `proposals`.
      config: dict whose 'prior_variance' entry unpacks into four values.

    Returns:
      Tensor of encoded coordinates stacked along a new last axis.
    """
    prop_x, prop_y, prop_w, prop_h = (proposals[..., k] for k in range(4))
    gt_x, gt_y, gt_w, gt_h = (gt[..., k] for k in range(4))

    # Offsets between box centres, in units of the proposal width/height.
    diff_x = (gt_x + 0.5 * gt_w - prop_x - 0.5 * prop_w) / prop_w
    diff_y = (gt_y + 0.5 * gt_h - prop_y - 0.5 * prop_h) / prop_h
    # Log-scale size ratios.
    diff_w = tf.log(gt_w / prop_w)
    diff_h = tf.log(gt_h / prop_h)

    var_x, var_y, var_w, var_h = config['prior_variance']
    deltas = (diff_x, diff_y, diff_w, diff_h)
    variances = (var_x, var_y, var_w, var_h)
    return tf.stack([delta / var for delta, var in zip(deltas, variances)], -1)
def decode_bboxes(tcoords, anchors):
    """Turn encoded box regressions back into corner coordinates clipped to [0, 1].

    NOTE(review): `config` is not a parameter of this function; it must exist as
    a name in the enclosing module scope - confirm.

    Args:
      tcoords: rank-2 tensor whose columns 0..3 are the encoded (x, y, w, h).
      anchors: rank-2 tensor whose columns 0..3 are the anchor (x, y, w, h).

    Returns:
      Tensor with columns (y1, x1, y2, x2).
    """
    var_x, var_y, var_w, var_h = config['prior_variance']

    # Undo the variance scaling applied at encoding time.
    delta_x = tcoords[:, 0] * var_x
    delta_y = tcoords[:, 1] * var_y
    delta_w = tcoords[:, 2] * var_w
    delta_h = tcoords[:, 3] * var_h

    anchor_w = anchors[:, 2]
    anchor_h = anchors[:, 3]
    anchor_cx = anchors[:, 0] + anchor_w / 2
    anchor_cy = anchors[:, 1] + anchor_h / 2

    center_x = delta_x * anchor_w + anchor_cx
    center_y = delta_y * anchor_h + anchor_cy
    width = tf.exp(delta_w) * anchor_w
    height = tf.exp(delta_h) * anchor_h

    # The far corner is derived from the already clipped near corner.
    x1 = tf.maximum(0., center_x - width / 2)
    y1 = tf.maximum(0., center_y - height / 2)
    x2 = tf.minimum(1., width + x1)
    y2 = tf.minimum(1., height + y1)
    return tf.stack([y1, x1, y2, x2], axis=1)
def combine_gradients(tower_grads):
    """Calculate the combined gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer list
        has one entry per tower; each inner list holds that tower's
        (gradient, variable) pairs. Pairs whose gradient is None are ignored.

    Returns:
      List of pairs of (gradient, variable) where the gradient has been summed
      across all towers. The variable is taken from the first tower.
    """
    filtered_grads = [[pair for pair in grad_list if pair[0] is not None]
                      for grad_list in tower_grads]
    final_grads = []
    # `range` rather than the Python-2-only `xrange`, so this runs on Python 2 and 3.
    for i in range(len(filtered_grads[0])):
        # The i-th gradient of every tower, stacked on a new leading axis and summed.
        per_tower = [tower[i][0] for tower in filtered_grads]
        grad = tf.reduce_sum(tf.stack(per_tower, 0), 0)
        final_grads.append((grad, filtered_grads[0][i][1],))
    return final_grads
def combine_gradients(tower_grads):
    """Calculate the combined gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer list
        has one entry per tower; each inner list holds that tower's
        (gradient, variable) pairs. Pairs whose gradient is None are ignored.

    Returns:
      List of pairs of (gradient, variable) where the gradient has been summed
      across all towers. The variable is taken from the first tower.
    """
    filtered_grads = [[pair for pair in grad_list if pair[0] is not None]
                      for grad_list in tower_grads]
    final_grads = []
    # `range` rather than the Python-2-only `xrange`, so this runs on Python 2 and 3.
    for i in range(len(filtered_grads[0])):
        # The i-th gradient of every tower, stacked on a new leading axis and summed.
        per_tower = [tower[i][0] for tower in filtered_grads]
        grad = tf.reduce_sum(tf.stack(per_tower, 0), 0)
        final_grads.append((grad, filtered_grads[0][i][1],))
    return final_grads
def combine_gradients(tower_grads):
    """Calculate the combined gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer list
        has one entry per tower; each inner list holds that tower's
        (gradient, variable) pairs. Pairs whose gradient is None are ignored.

    Returns:
      List of pairs of (gradient, variable) where the gradient has been summed
      across all towers. The variable is taken from the first tower.
    """
    filtered_grads = [[pair for pair in grad_list if pair[0] is not None]
                      for grad_list in tower_grads]
    final_grads = []
    # `range` rather than the Python-2-only `xrange`, so this runs on Python 2 and 3.
    for i in range(len(filtered_grads[0])):
        # The i-th gradient of every tower, stacked on a new leading axis and summed.
        per_tower = [tower[i][0] for tower in filtered_grads]
        grad = tf.reduce_sum(tf.stack(per_tower, 0), 0)
        final_grads.append((grad, filtered_grads[0][i][1],))
    return final_grads
def SampleRandomFrames(model_input, num_frames, num_samples):
    """Draws `num_samples` frames at random (with replacement) from each example.

    Each frame index is a uniform random fraction multiplied by that example's
    frame count and truncated to an integer.

    Args:
      model_input: A tensor of size batch_size x max_frames x feature_size
      num_frames: A tensor of size batch_size x 1
      num_samples: A scalar

    Returns:
      `model_input`: A tensor of size batch_size x num_samples x feature_size
    """
    batch_size = tf.shape(model_input)[0]

    random_fraction = tf.random_uniform([batch_size, num_samples])
    frame_counts = tf.tile(tf.cast(num_frames, tf.float32), [1, num_samples])
    frame_index = tf.cast(tf.multiply(random_fraction, frame_counts), tf.int32)

    # Row number of every sample, repeated across the sample axis.
    batch_column = tf.expand_dims(tf.range(batch_size), 1)
    batch_index = tf.tile(batch_column, [1, num_samples])

    # (batch, frame) coordinate pairs for gather_nd.
    coordinates = tf.stack([batch_index, frame_index], 2)
    return tf.gather_nd(model_input, coordinates)
def _flat_reconstruction_loss(self, flat_x_target, flat_rnn_output):
    """Compute the reconstruction loss separately for every output head.

    Targets and RNN outputs are split along the last axis according to
    `self._output_depths`; the parent-class loss is evaluated on each slice.

    Args:
      flat_x_target: target tensor, split along its last axis.
      flat_rnn_output: RNN output tensor, split along its last axis.

    Returns:
      Tuple of (losses summed over heads, metric map whose keys carry a
      `_<head index>` suffix, stacked truths, stacked predictions).
    """
    target_slices = tf.split(flat_x_target, self._output_depths, axis=-1)
    output_slices = tf.split(flat_rnn_output, self._output_depths, axis=-1)

    parent = super(MultiOutCategoricalLstmDecoder, self)
    losses, truths, predictions = [], [], []
    metric_map = {}
    for head, (target, output) in enumerate(zip(target_slices, output_slices)):
        loss, metrics, truth, prediction = parent._flat_reconstruction_loss(target, output)
        losses.append(loss)
        truths.append(truth)
        predictions.append(prediction)
        # Keep the metrics of different heads apart by suffixing the head index.
        metric_map.update(('%s_%d' % (name, head), value) for name, value in metrics.items())

    return (tf.reduce_sum(losses, axis=0),
            metric_map,
            tf.stack(truths),
            tf.stack(predictions))
def loss_wrapper(y, y_, loss_function, transitions=None, nums_tags=None, batch_size=None, weights=None, average_cross_steps=True):
    """Apply `loss_function` to every paired element of `y` and `y_` and stack the results.

    The call signature used for `loss_function` depends on which known loss it is:

    * `crf_loss`: called with (sy, sy_, transition, num_tags, batch_size).
    * `cross_entropy`: called with (sy, sy_, num_tags).
    * `sparse_cross_entropy`: called with (sy, sy_).
    * `sparse_cross_entropy_with_weights`: called with (sy, sy_) plus the
      `weights` and `average_cross_steps` keywords; the result is flattened.
    * anything else: called with (sy, sy_); the result is flattened.

    Args:
      y: sequence of tensors, one per output.
      y_: sequence of tensors paired element-wise with `y` (same length).
      loss_function: the loss callable to apply.
      transitions: per-output transition parameters (required for `crf_loss`).
      nums_tags: per-output tag counts (required for `crf_loss`, `cross_entropy`
        and `sparse_cross_entropy_with_weights`).
      batch_size: batch size (required for `crf_loss`).
      weights: forwarded to `sparse_cross_entropy_with_weights`.
      average_cross_steps: forwarded to `sparse_cross_entropy_with_weights`.

    Returns:
      The per-output losses stacked into a single tensor.

    Raises:
      AssertionError: if the lengths of the per-output sequences disagree.
    """
    assert len(y) == len(y_)
    total_loss = []
    if loss_function is crf_loss:
        assert len(y) == len(transitions) and len(transitions) == len(nums_tags) and batch_size is not None
        for sy, sy_, stransition, snums_tags in zip(y, y_, transitions, nums_tags):
            total_loss.append(loss_function(sy, sy_, stransition, snums_tags, batch_size))
    elif loss_function is cross_entropy:
        assert len(y) == len(nums_tags)
        for sy, sy_, snums_tags in zip(y, y_, nums_tags):
            total_loss.append(loss_function(sy, sy_, snums_tags))
    elif loss_function is sparse_cross_entropy:
        for sy, sy_ in zip(y, y_):
            total_loss.append(loss_function(sy, sy_))
    elif loss_function is sparse_cross_entropy_with_weights:
        assert len(y) == len(nums_tags)
        # zip(y, y_) yields pairs; the previous three-name unpacking raised
        # ValueError on the first iteration. The tag count is not used by this loss.
        for sy, sy_ in zip(y, y_):
            total_loss.append(tf.reshape(loss_function(sy, sy_, weights=weights, average_cross_steps=average_cross_steps), [-1]))
    else:
        for sy, sy_ in zip(y, y_):
            total_loss.append(tf.reshape(loss_function(sy, sy_), [-1]))
    return tf.stack(total_loss)
def combine_gradients(tower_grads):
    """Sum the gradient of every shared variable over all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer list
        has one entry per tower; each inner list holds that tower's pairs.
        Pairs whose gradient is None are dropped before combining.

    Returns:
      List of (gradient, variable) pairs where the gradient has been summed
      across all towers and the variable comes from the first tower.
    """
    towers = [[pair for pair in tower if pair[0] is not None] for tower in tower_grads]

    combined = []
    for position, (_, variable) in enumerate(towers[0]):
        # Gradients of the same variable position from every tower.
        stacked = tf.stack([tower[position][0] for tower in towers], 0)
        combined.append((tf.reduce_sum(stacked, 0), variable,))
    return combined
def SampleRandomFrames(model_input, num_frames, num_samples):
    """Draws `num_samples` frames at random (with replacement) from each example.

    Each frame index is a uniform random fraction multiplied by that example's
    frame count and truncated to an integer.

    Args:
      model_input: A tensor of size batch_size x max_frames x feature_size
      num_frames: A tensor of size batch_size x 1
      num_samples: A scalar

    Returns:
      `model_input`: A tensor of size batch_size x num_samples x feature_size
    """
    batch_size = tf.shape(model_input)[0]

    random_fraction = tf.random_uniform([batch_size, num_samples])
    frame_counts = tf.tile(tf.cast(num_frames, tf.float32), [1, num_samples])
    frame_index = tf.cast(tf.multiply(random_fraction, frame_counts), tf.int32)

    # Row number of every sample, repeated across the sample axis.
    batch_column = tf.expand_dims(tf.range(batch_size), 1)
    batch_index = tf.tile(batch_column, [1, num_samples])

    # (batch, frame) coordinate pairs for gather_nd.
    coordinates = tf.stack([batch_index, frame_index], 2)
    return tf.gather_nd(model_input, coordinates)
def __call__(self, input_layer, output_size, scope=None, in_dim=None, stddev=0.02, bias_start=0.0):
    """Apply a linear (matmul + bias) layer to `input_layer`.

    A layer whose shape has four entries is first flattened to two dimensions,
    keeping the first dimension and merging the rest.

    Args:
      input_layer: object exposing `.shape`, `.tensor` and `.with_tensor(...)`.
      output_size: number of output units.
      scope: variable scope name; "Linear" when not given.
      in_dim: input width for the weight matrix; defaults to the second
        entry of the (possibly flattened) shape.
      stddev: standard deviation of the normal initializer for the matrix.
      bias_start: constant initial value of the bias.

    Returns:
      The result of `input_layer.with_tensor(...)` wrapping the layer output.
    """
    shape = input_layer.shape
    input_ = input_layer.tensor

    if len(shape) == 4:
        # Merge all non-batch dimensions into one.
        flat_dim = np.prod(shape[1:])
        input_ = tf.reshape(input_, tf.stack([tf.shape(input_)[0], flat_dim]))
        input_.set_shape([None, flat_dim])
        shape = input_.get_shape().as_list()

    with tf.variable_scope(scope or "Linear"):
        weight_init = tf.random_normal_initializer(stddev=stddev)
        matrix = self.variable("Matrix", [in_dim or shape[1], output_size], dt=tf.float32,
                               init=weight_init)
        bias = self.variable("bias", [output_size], init=tf.constant_initializer(bias_start))
        projected = tf.matmul(input_, matrix) + bias
        return input_layer.with_tensor(projected, parameters=self.vars)
def transition(self, curr_state, next_symbols, batch_size):
    """Look up the next grammar state for every batch element.

    Args:
      curr_state: tensor of current state ids, used to gather rows of
        `self.transition_matrix`.
      next_symbols: tensor of symbol ids, one per batch element.
      batch_size: number of batch elements.

    Returns:
      Tensor holding, for each batch element, the transition-matrix entry at
      (current state, next symbol).
    """
    with tf.name_scope('grammar_transition'):
        table = tf.constant(self.transition_matrix)
        transitions = tf.gather(table, curr_state)
        assert transitions.get_shape()[1:] == (self.output_size,)

        # Pair each batch row with the symbol chosen for it.
        batch_rows = tf.range(0, batch_size)
        indices = tf.stack((batch_rows, next_symbols), axis=1)
        return tf.gather_nd(transitions, indices)
def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
    """Convert a dense, padded label matrix into a tf.SparseTensor.

    Only the first `label_lengths[b]` entries of row `b` are kept.

    Args:
      labels: rank-2 label tensor; its second dimension must equal the largest
        value in `label_lengths` (checked at run time by an assertion op).
      label_lengths: rank-1 tensor with the number of valid labels per row.
      batch_size: only used to build the `shape` argument handed to `gather_nd`.

    Returns:
      A tf.SparseTensor whose dense shape is the shape of `labels`.
    """
    # The second dimension of labels must be equal to the longest label length in the batch
    correct_shape_assert = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
    with tf.control_dependencies([correct_shape_assert]):
        # The identity op carries the control dependency on the assertion.
        labels = tf.identity(labels)
    label_shape = tf.shape(labels)
    num_batches_tns = tf.stack([label_shape[0]])
    max_num_labels_tns = tf.stack([label_shape[1]])

    def range_less_than(previous_state, current_input):
        # Ignores the running state: yields a 1 x max_labels boolean row that is
        # True at positions below the current label length.
        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input

    # All-False initializer with the same 1 x max_labels layout as one scan step.
    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
    init = tf.expand_dims(init, 0)
    # One mask row per entry of label_lengths; the singleton axis is dropped below.
    dense_mask = tf.scan(range_less_than, label_lengths, initializer=init, parallel_iterations=1)
    dense_mask = dense_mask[:, 0, :]
    # Every row of label_array is 0..max_labels-1, i.e. the position within the row.
    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns),
                             label_shape)
    label_ind = tf.boolean_mask(label_array, dense_mask)
    # Every row of batch_array is filled with that row's batch index.
    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), tf.reverse(label_shape, [0])))
    batch_ind = tf.boolean_mask(batch_array, dense_mask)
    # (batch index, position) pairs of all valid labels.
    indices = tf.transpose(tf.reshape(tf.concat([batch_ind, label_ind], 0), [2, -1]))
    shape = [batch_size, tf.reduce_max(label_lengths)]
    # NOTE(review): `gather_nd` takes three arguments, so it is a helper defined
    # elsewhere, not tf.gather_nd - presumably it picks labels[indices]; confirm.
    vals_sparse = gather_nd(labels, indices, shape)
    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
# Validate and normalize transcriptions. Returns a cleaned version of the label
# or None if it's invalid.
def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
    """Convert a dense, padded label matrix into a tf.SparseTensor.

    Only the first `label_lengths[b]` entries of row `b` are kept.

    Args:
      labels: rank-2 label tensor; its second dimension must equal the largest
        value in `label_lengths` (checked at run time by an assertion op).
      label_lengths: rank-1 tensor with the number of valid labels per row.
      batch_size: only used to build the `shape` argument handed to `gather_nd`.

    Returns:
      A tf.SparseTensor whose dense shape is the shape of `labels`.
    """
    # The second dimension of labels must be equal to the longest label length in the batch
    correct_shape_assert = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
    with tf.control_dependencies([correct_shape_assert]):
        # The identity op carries the control dependency on the assertion.
        labels = tf.identity(labels)
    label_shape = tf.shape(labels)
    num_batches_tns = tf.stack([label_shape[0]])
    max_num_labels_tns = tf.stack([label_shape[1]])

    def range_less_than(previous_state, current_input):
        # Ignores the running state: yields a 1 x max_labels boolean row that is
        # True at positions below the current label length.
        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input

    # All-False initializer with the same 1 x max_labels layout as one scan step.
    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
    init = tf.expand_dims(init, 0)
    # One mask row per entry of label_lengths; the singleton axis is dropped below.
    dense_mask = tf.scan(range_less_than, label_lengths, initializer=init, parallel_iterations=1)
    dense_mask = dense_mask[:, 0, :]
    # Every row of label_array is 0..max_labels-1, i.e. the position within the row.
    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns),
                             label_shape)
    label_ind = tf.boolean_mask(label_array, dense_mask)
    # Every row of batch_array is filled with that row's batch index.
    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), tf.reverse(label_shape, [0])))
    batch_ind = tf.boolean_mask(batch_array, dense_mask)
    # (batch index, position) pairs of all valid labels.
    indices = tf.transpose(tf.reshape(tf.concat([batch_ind, label_ind], 0), [2, -1]))
    shape = [batch_size, tf.reduce_max(label_lengths)]
    # NOTE(review): `gather_nd` takes three arguments, so it is a helper defined
    # elsewhere, not tf.gather_nd - presumably it picks labels[indices]; confirm.
    vals_sparse = gather_nd(labels, indices, shape)
    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
# Validate and normalize transcriptions. Returns a cleaned version of the label
# or None if it's invalid.
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 **unused_params):
    """Ensemble three mixture-of-experts heads fed with max, mean and min pooling.

    Frames whose absolute feature sum is zero are excluded from the mean. Each
    pooled representation goes through `self.sub_moe`; the three probability
    outputs are blended with softmax weights computed from the values
    `sub_moe` returns first.

    Args:
      model_input: rank-3 tensor pooled over axis 1; its static shape supplies
        the sizes of axes 1 and 2.
      vocab_size: number of output classes.
      num_frames: not used by this method.
      **unused_params: ignored.

    Returns:
      Dict with 'prediction_frames' (the per-head probabilities reshaped to
      [-1, vocab_size]) and 'predictions' (the weighted sum over heads).
    """
    static_shape = model_input.get_shape().as_list()

    # 1.0 where a frame carries any non-zero feature, 0.0 otherwise.
    frame_energy = tf.reduce_sum(tf.abs(model_input), axis=2)
    ones = tf.ones(tf.shape(frame_energy))
    zeros = tf.zeros(tf.shape(frame_energy))
    is_active = tf.where(tf.greater(frame_energy, zeros), ones, zeros)
    frame_mask = tf.reshape(is_active, [-1, static_shape[1], 1])

    max_pooled = tf.reduce_max(model_input, axis=1)
    masked_total = tf.reduce_sum(model_input * frame_mask, axis=1)
    mean_pooled = masked_total / (tf.reduce_sum(frame_mask, axis=1) + 1e-6)
    min_pooled = tf.reduce_min(model_input, axis=1)

    head_inputs = []
    head_probabilities = []
    for pooled, suffix in ((max_pooled, "_max"), (mean_pooled, "_mean"), (min_pooled, "_min")):
        head_input, head_probability = self.sub_moe(pooled, vocab_size, scopename=suffix)
        head_inputs.append(head_input)
        head_probabilities.append(head_probability)

    final_probabilities = tf.stack(head_probabilities, axis=1)
    weight2d = tf.get_variable("ensemble_weight2d",
                               shape=[static_shape[2], 3, vocab_size],
                               regularizer=slim.l2_regularizer(1.0e-8))
    activations = tf.stack(head_inputs, axis=2)
    # Per-class mixing weights, normalised over the three heads.
    mixing_logits = tf.einsum("aij,ijk->ajk", activations, weight2d)
    weight = tf.nn.softmax(mixing_logits, dim=1)

    return {
        "prediction_frames": tf.reshape(final_probabilities, [-1, vocab_size]),
        "predictions": tf.reduce_sum(final_probabilities * weight, axis=1),
    }
def rnn(self, model_input, lstm_size, num_frames, sub_scope="", **unused_params):
    """Runs a single-layer LSTM over the frames and returns its final cell state.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      lstm_size: The number of units of the LSTM cell.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).
      sub_scope: Suffix for the variable scope ("RNN-" + sub_scope), so that
                 several of these RNNs can live in one graph.
      **unused_params: Ignored.

    Returns:
      A tensor with the final cell states (`.c`) of all layers concatenated
      along axis 1.
    """
    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [
            tf.contrib.rnn.BasicLSTMCell(
                lstm_size, forget_bias=1.0, state_is_tuple=True)
            for _ in range(1)
        ],
        state_is_tuple=True)

    with tf.variable_scope("RNN-" + sub_scope):
        outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
                                           sequence_length=num_frames,
                                           swap_memory=True,
                                           dtype=tf.float32)

    # Build a real list: on Python 3 `map` returns a lazy iterator, which
    # tf.concat does not treat as a list of tensors.
    state_out = tf.concat([layer_state.c for layer_state in state], axis=1)
    return state_out
def create_model(self, model_input, vocab_size, num_frames, l2_penalty=1e-8, **unused_params):
    """Ensemble of CNN -> LSTM -> MoE heads over progressively max-pooled inputs.

    `FLAGS.moe_num_extend` levels are built. Each level runs `self.cnn`, a ReLU,
    `self.rnn` and `self.sub_moe`; the CNN output is then max-pooled over pairs
    of time steps and fed to the next level, with `num_frames` halved (minimum 1).
    The per-level probabilities are blended with softmax weights computed from
    the per-level RNN outputs.

    NOTE(review): the einsum contracts the last axis of the RNN outputs against
    `features_size` (1024), so it presumably requires FLAGS.lstm_cells == 1024 -
    confirm. `l2_penalty` is accepted but not read; the regularizer strength is
    the literal 1.0e-8.

    Args:
      model_input: input handed to the first CNN level.
      vocab_size: number of output classes.
      num_frames: per-example frame counts passed to the RNN.
      l2_penalty: unused.
      **unused_params: ignored.

    Returns:
      Dict with 'prediction_frames' (the per-level probabilities reshaped to
      [-1, vocab_size]) and 'predictions' (the weighted sum over levels).
    """
    num_extend = FLAGS.moe_num_extend
    lstm_size = FLAGS.lstm_cells
    pool_size = 2
    num_filters = [256, 256, 512]
    filter_sizes = [1, 2, 3]
    features_size = sum(num_filters)

    level_probabilities = []
    level_states = []
    level_input = model_input
    for level in range(1, num_extend + 1):
        cnn_output, num_t = self.cnn(level_input, num_filters=num_filters,
                                     filter_sizes=filter_sizes, sub_scope="cnn%d" % level)
        cnn_output = tf.nn.relu(cnn_output)

        rnn_state = self.rnn(cnn_output, lstm_size, num_frames, sub_scope="rnn%d" % level)
        level_states.append(rnn_state)
        level_probabilities.append(self.sub_moe(rnn_state, vocab_size, scopename="moe%d" % level))

        # Trim to a multiple of pool_size, then max-pool groups of pool_size steps.
        num_t = pool_size * (num_t // pool_size)
        trimmed = cnn_output[:, :num_t, :]
        grouped = tf.reshape(trimmed, [-1, num_t // pool_size, pool_size, features_size])
        level_input = tf.reduce_max(grouped, axis=2)
        num_frames = tf.maximum(num_frames // pool_size, 1)

    final_probabilities = tf.stack(level_probabilities, axis=1)
    moe_inputs = tf.stack(level_states, axis=1)
    weight2d = tf.get_variable("ensemble_weight2d",
                               shape=[num_extend, features_size, vocab_size],
                               regularizer=slim.l2_regularizer(1.0e-8))
    # Per-class mixing weights, normalised over the levels.
    mixing_logits = tf.einsum("aij,ijk->aik", moe_inputs, weight2d)
    weight = tf.nn.softmax(mixing_logits, dim=1)

    return {
        "prediction_frames": tf.reshape(final_probabilities, [-1, vocab_size]),
        "predictions": tf.reduce_sum(final_probabilities * weight, axis=1),
    }