def sparse_tuple_from(sequences, dtype=np.int32):
    r"""Creates a sparse representation of ``sequences``.

    Args:
        * sequences: a list of lists of type dtype where each element is a sequence
        * dtype: NumPy dtype the sparse values are converted to

    Returns a ``tf.SparseTensor`` whose indices are ``(sequence number,
    position in sequence)`` pairs, whose values are the concatenated sequence
    elements and whose dense shape is ``[len(sequences), longest sequence]``.
    """
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)

    # reshape keeps the (N, 2) layout even when no element was collected;
    # np.asarray alone would produce a 1-D empty array in that case.
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)

    # The longest sequence length equals ``indices.max(0)[1] + 1`` whenever at
    # least one element exists, but it is also defined (as 0) for empty input,
    # where reducing over the empty index array would raise.
    longest = max([len(seq) for seq in sequences] or [0])
    shape = np.asarray([len(sequences), longest], dtype=np.int64)

    # The third constructor argument is named ``shape`` before TensorFlow 1.0
    # and ``dense_shape`` afterwards; passing it positionally works with both.
    return tf.SparseTensor(indices, values, shape)
python类shape()的实例源码
def SampleRandomFrames(model_input, num_frames, num_samples):
    """Gathers `num_samples` randomly chosen frames for every batch entry.

    Each frame position is obtained by scaling an independent
    `tf.random_uniform` draw by the entry's frame count and casting the
    product to int32, so the same frame may be selected more than once.

    Args:
      model_input: A tensor of size batch_size x max_frames x feature_size
      num_frames: A tensor of size batch_size x 1
      num_samples: A scalar

    Returns:
      A tensor of size batch_size x num_samples x feature_size
    """
    batch_size = tf.shape(model_input)[0]

    # Random frame position per (batch entry, sample).
    unit_draws = tf.random_uniform([batch_size, num_samples])
    frame_counts = tf.tile(tf.cast(num_frames, tf.float32), [1, num_samples])
    frame_index = tf.cast(tf.multiply(unit_draws, frame_counts), tf.int32)

    # Matching batch row number for every sampled position.
    row_numbers = tf.expand_dims(tf.range(batch_size), 1)
    batch_index = tf.tile(row_numbers, [1, num_samples])

    # (row, frame) pairs consumed by gather_nd.
    index = tf.stack([batch_index, frame_index], 2)
    return tf.gather_nd(model_input, index)
def dense(inputs, units, bias_shape, w_i, b_i=None, activation=tf.nn.relu):
    """Fully connected layer that first flattens inputs of rank greater than 2.

    Args:
        inputs: a tensor, or any value convertible to a float tensor.
        units: number of output units (second dimension of the weights).
        bias_shape: shape of the 'biases' variable, or None for no bias; its
            first entry must equal `units`.
        w_i: initializer for the 'weights' variable.
        b_i: initializer for the 'biases' variable.
        activation: callable applied to the result; None leaves it linear.

    Returns:
        The layer output, activated when `activation` is not None.
    """
    if not isinstance(inputs, ops.Tensor):
        inputs = ops.convert_to_tensor(inputs, dtype='float')

    # Collapse all non-batch dimensions so a single matmul can be applied.
    if len(inputs.shape) > 2:
        inputs = tf.contrib.layers.flatten(inputs)

    in_features = inputs.shape[1]
    weights = tf.get_variable('weights', shape=[in_features, units], initializer=w_i)
    projected = tf.matmul(inputs, weights)

    if bias_shape is not None:
        assert bias_shape[0] == units
        biases = tf.get_variable('biases', shape=bias_shape, initializer=b_i)
        projected = projected + biases

    if activation is None:
        return projected
    return activation(projected)
def switch(condition, then_tensor, else_tensor):
    """
    Element-wise switch that accepts non-scalar conditions on both Keras backends.

    Keras' implementation of switch for tensorflow uses tf.switch which accepts only scalar
    conditions, so on that backend an element-wise select op is used instead.  On the theano
    backend the call is forwarded to ``theano.tensor.switch``.

    Parameters
    ----------
    condition : tensor
        Cast to bool on tensorflow.  When its last dimension is 1 and differs from the last
        dimension of ``then_tensor``, it is first expanded along that dimension.
    then_tensor : tensor
        Values used where ``condition`` is true.
    else_tensor : tensor
        Values used where ``condition`` is false.
    """
    if K.backend() == 'tensorflow':
        import tensorflow as tf
        condition_shape = condition.get_shape()
        input_shape = then_tensor.get_shape()
        if condition_shape[-1] != input_shape[-1] and condition_shape[-1] == 1:
            # This means the last dim is an embedding dim. Keras does not mask this dimension. But tf wants
            # the condition and the then and else tensors to be the same shape.
            condition = K.dot(tf.cast(condition, tf.float32), tf.ones((1, input_shape[-1])))
        # tf.select was removed in TensorFlow 1.0, where the three-argument tf.where took its
        # place.  Prefer tf.select when it exists so older installations behave exactly as before.
        select_op = getattr(tf, 'select', None) or tf.where
        return select_op(tf.cast(condition, dtype=tf.bool), then_tensor, else_tensor)
    else:
        import theano.tensor as T
        return T.switch(condition, then_tensor, else_tensor)
beam_aligner.py 文件源码
项目:almond-nnparser
作者: Stanford-Mobisocial-IoT-Lab
项目源码
文件源码
阅读 46
收藏 0
点赞 0
评论 0
def finalize(self, outputs : BeamSearchOptimizationDecoderOutput, final_state : BeamSearchOptimizationDecoderState, sequence_lengths):
    """Assemble the final decoder output from the per-step outputs.

    Reconstructs the predicted id sequences with ``gather_tree``, sums the
    per-step loss over the leading axis and picks the gold and beam scores of
    the last step.

    Returns:
        A ``(FinalBeamSearchOptimizationDecoderOutput, final_state)`` pair;
        ``final_state`` is passed through unchanged.
    """
    # all output fields are [max_time, batch_size, ...]
    predicted_ids = tf.contrib.seq2seq.gather_tree(
        outputs.predicted_ids,
        outputs.parent_ids,
        sequence_length=sequence_lengths,
        name='predicted_ids')
    total_loss = tf.reduce_sum(outputs.loss, axis=0, name='violation_loss')

    # Index of the last step along the leading axis of the gathered ids.
    last_step = tf.shape(predicted_ids)[0] - 1

    with tf.name_scope('gold_score'):
        gold_score = outputs.gold_score[last_step]
    with tf.name_scope('sequence_scores'):
        sequence_scores = outputs.scores[last_step]

    final_output = FinalBeamSearchOptimizationDecoderOutput(
        beam_search_decoder_output=outputs,
        predicted_ids=predicted_ids,
        scores=sequence_scores,
        gold_score=gold_score,
        gold_beam_id=final_state.gold_beam_id,
        num_available_beams=final_state.num_available_beams,
        total_violation_loss=total_loss)
    return final_output, final_state
def variable_on_worker_level(name, shape, initializer):
    r'''
    Create (or fetch) the variable ``name`` on the device selected for this worker.

    When ``FLAGS.ps_hosts`` is empty the variable is placed on ``worker_device``
    directly; otherwise placement is delegated to
    ``tf.train.replica_device_setter`` for ``cluster``.
    NOTE(review): the original description says the variable lives in CPU
    memory -- that depends on what ``worker_device`` points at; confirm.
    '''
    no_parameter_servers = len(FLAGS.ps_hosts) == 0
    if no_parameter_servers:
        placement = worker_device
    else:
        placement = tf.train.replica_device_setter(
            worker_device=worker_device, cluster=cluster)

    # get_variable creates the variable or returns the existing one
    with tf.device(placement):
        return tf.get_variable(name=name, shape=shape, initializer=initializer)
def sparse_tuple_from(sequences, dtype=np.int32):
    r"""Creates a sparse representation of ``sequences``.

    Args:
        * sequences: a list of lists of type dtype where each element is a sequence
        * dtype: NumPy dtype the sparse values are converted to

    Returns a ``tf.SparseTensor`` whose indices are ``(sequence number,
    position in sequence)`` pairs, whose values are the concatenated sequence
    elements and whose dense shape is ``[len(sequences), longest sequence]``.
    """
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)

    # reshape keeps the (N, 2) layout even when no element was collected;
    # np.asarray alone would produce a 1-D empty array in that case.
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)

    # The longest sequence length equals ``indices.max(0)[1] + 1`` whenever at
    # least one element exists, but it is also defined (as 0) for empty input,
    # where reducing over the empty index array would raise.
    longest = max([len(seq) for seq in sequences] or [0])
    shape = np.asarray([len(sequences), longest], dtype=np.int64)

    # The third constructor argument is named ``shape`` before TensorFlow 1.0
    # and ``dense_shape`` afterwards; passing it positionally works with both.
    return tf.SparseTensor(indices, values, shape)
def variable_on_worker_level(name, shape, initializer):
    r'''
    Create (or fetch) the variable ``name`` on the device selected for this worker.

    When ``FLAGS.ps_hosts`` is empty the variable is placed on ``worker_device``
    directly; otherwise placement is delegated to
    ``tf.train.replica_device_setter`` for ``cluster``.
    NOTE(review): the original description says the variable lives in CPU
    memory -- that depends on what ``worker_device`` points at; confirm.
    '''
    no_parameter_servers = len(FLAGS.ps_hosts) == 0
    if no_parameter_servers:
        placement = worker_device
    else:
        placement = tf.train.replica_device_setter(
            worker_device=worker_device, cluster=cluster)

    # get_variable creates the variable or returns the existing one
    with tf.device(placement):
        return tf.get_variable(name=name, shape=shape, initializer=initializer)
def variable_on_worker_level(name, shape, initializer):
    r'''
    Create (or fetch) the variable ``name`` on the device selected for this worker.

    When ``FLAGS.ps_hosts`` is empty the variable is placed on ``worker_device``
    directly; otherwise placement is delegated to
    ``tf.train.replica_device_setter`` for ``cluster``.
    NOTE(review): the original description says the variable lives in CPU
    memory -- that depends on what ``worker_device`` points at; confirm.
    '''
    no_parameter_servers = len(FLAGS.ps_hosts) == 0
    if no_parameter_servers:
        placement = worker_device
    else:
        placement = tf.train.replica_device_setter(
            worker_device=worker_device, cluster=cluster)

    # get_variable creates the variable or returns the existing one
    with tf.device(placement):
        return tf.get_variable(name=name, shape=shape, initializer=initializer)
def highway(self, input_1, input_2, size_1, size_2, l2_penalty=1e-8, layer_size=1):
    """Stack of gated layers mixing a transformed signal with `input_1`.

    Each layer computes ``relu(x W + b)`` on the running output (starting from
    `input_2`), a sigmoid transform gate from `input_1`, and blends
    ``gate * output + (1 - gate) * input_1``.  L2 penalties on every weight and
    bias are added to ``tf.GraphKeys.REGULARIZATION_LOSSES``.

    Args:
        input_1: tensor multiplied by a ``[size_1, size_1]`` matrix, so its
            last dimension is expected to be `size_1`.
        input_2: tensor fed to the first layer; last dimension `size_2`.
        size_1: width of `input_1` and of every layer output.
        size_2: width of `input_2`.
        l2_penalty: scale of the L2 regularisation terms.
        layer_size: number of stacked layers.

    Returns:
        The output of the last layer (width `size_1`), or `input_2` itself
        when `layer_size` is 0.
    """
    output = input_2
    for idx in range(layer_size):
        # Only the first layer sees input_2 (width size_2).  Later layers
        # consume the previous blend, which already has width size_1, so their
        # weight matrix must be [size_1, size_1]; using size_2 there breaks
        # the matmul whenever size_1 != size_2.
        in_size = size_2 if idx == 0 else size_1
        with tf.name_scope('output_lin_%d' % idx):
            W = tf.Variable(tf.truncated_normal([in_size, size_1], stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[size_1]), name="b")
            tf.add_to_collection(name=tf.GraphKeys.REGULARIZATION_LOSSES, value=l2_penalty*tf.nn.l2_loss(W))
            tf.add_to_collection(name=tf.GraphKeys.REGULARIZATION_LOSSES, value=l2_penalty*tf.nn.l2_loss(b))
            output = tf.nn.relu(tf.nn.xw_plus_b(output, W, b))
        with tf.name_scope('transform_lin_%d' % idx):
            W = tf.Variable(tf.truncated_normal([size_1, size_1], stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[size_1]), name="b")
            tf.add_to_collection(name=tf.GraphKeys.REGULARIZATION_LOSSES, value=l2_penalty*tf.nn.l2_loss(W))
            tf.add_to_collection(name=tf.GraphKeys.REGULARIZATION_LOSSES, value=l2_penalty*tf.nn.l2_loss(b))
            transform_gate = tf.sigmoid(tf.nn.xw_plus_b(input_1, W, b))
            carry_gate = tf.constant(1.0) - transform_gate
        # Gated blend of the transformed signal and the carried input.
        output = transform_gate * output + carry_gate * input_1
    return output
def calculate_loss_distill_boost(self, predictions, labels_distill, labels, **unused_params):
    """Cross-entropy loss with per-example weights derived from `labels_distill`.

    The weight of an example is proportional to the cross entropy between
    `labels` and `labels_distill` for that example, rescaled to average 1 over
    the batch, clipped to [0.5, 5] and rescaled again.

    Args:
        predictions: predicted probabilities (same layout as `labels`).
        labels_distill: distilled label scores, cast to float32.
        labels: ground-truth labels, cast to float32.

    Returns:
        Scalar tensor: mean over the batch of the weighted per-example loss.
    """
    with tf.name_scope("loss_distill_boost"):
        print("loss_distill_boost")
        epsilon = 10e-6  # i.e. 1e-5, keeps the logarithms finite
        float_labels = tf.cast(labels, tf.float32)
        batch_size = tf.shape(float_labels)[0]
        float_labels_distill = tf.cast(labels_distill, tf.float32)

        # Per-example cross entropy of the distilled scores against the labels.
        distill_hit = float_labels * tf.log(float_labels_distill + epsilon)
        distill_miss = (1 - float_labels) * tf.log(1 - float_labels_distill + epsilon)
        error = tf.negative(distill_hit + distill_miss)
        error = tf.reduce_sum(error, axis=1, keep_dims=True)

        # Normalise to mean 1, clip, then normalise again after clipping.
        alpha = error / tf.reduce_sum(error) * tf.cast(batch_size, dtype=tf.float32)
        alpha = tf.clip_by_value(alpha, 0.5, 5)
        alpha = alpha / tf.reduce_sum(alpha) * tf.cast(batch_size, dtype=tf.float32)

        pred_hit = float_labels * tf.log(predictions + epsilon)
        pred_miss = (1 - float_labels) * tf.log(1 - predictions + epsilon)
        weighted_loss = tf.negative((pred_hit + pred_miss) * alpha)
        return tf.reduce_mean(tf.reduce_sum(weighted_loss, 1))
def calculate_loss_distill_relabel(self, predictions, labels_distill, labels, **unused_params):
    """Cross-entropy loss against labels extended with top distilled scores.

    With k equal to the total number of positive entries in `labels`, every
    position whose `labels_distill` score reaches the k-th largest score in
    the whole batch is additionally marked positive before the loss is
    computed.

    Returns:
        Scalar tensor: mean over the batch of the per-example cross entropy.
    """
    with tf.name_scope("loss_distill_relabel"):
        print("loss_distill_relabel")
        epsilon = 10e-6  # i.e. 1e-5, keeps the logarithms finite
        float_labels = tf.cast(labels, tf.float32)

        # k = number of positives; top_k returns the scores in descending order.
        sum_labels = tf.cast(tf.reduce_sum(float_labels), dtype=tf.int32)
        top_scores, _ = tf.nn.top_k(tf.reshape(labels_distill, [-1]), k=sum_labels)

        all_ones = tf.ones(tf.shape(labels))
        all_zeros = tf.zeros(tf.shape(labels))
        threshold = top_scores[-1]
        labels_add = tf.where(tf.greater_equal(labels_distill, threshold), all_ones, all_zeros)
        print(labels_add.get_shape().as_list())

        # Union of the original positives and the newly added ones.
        float_labels = float_labels + labels_add * (1.0 - float_labels)

        hit_term = float_labels * tf.log(predictions + epsilon)
        miss_term = (1 - float_labels) * tf.log(1 - predictions + epsilon)
        cross_entropy_loss = tf.negative(hit_term + miss_term)
        return tf.reduce_mean(tf.reduce_sum(cross_entropy_loss, 1))
def FramePooling(frames, method, **unused_params):
    """Pools over the frames of a video.

    Args:
      frames: A tensor with shape [batch_size, num_frames, feature_size].
      method: "average", "max", or "none".

    Returns:
      A tensor with shape [batch_size, feature_size] for average or max
      pooling. A tensor with shape [batch_size*num_frames, feature_size]
      for none pooling.

    Raises:
      ValueError: if method is other than "average", "max", or "none"
        (attention pooling is not implemented here).
    """
    if method == "average":
        return tf.reduce_mean(frames, 1)
    if method == "max":
        return tf.reduce_max(frames, 1)
    if method == "none":
        # Tensors expose their static shape through get_shape().as_list();
        # there is no shape_as_list() attribute.
        feature_size = frames.get_shape().as_list()[2]
        return tf.reshape(frames, [-1, feature_size])
    raise ValueError("Unrecognized pooling method: %s" % method)
def calculate_loss(self, predictions, support_predictions, labels, **unused_params):
    """Cross entropy of every model's predictions minus a divergence term.

    support_predictions batch_size x num_models x num_classes
    predictions = tf.reduce_mean(support_predictions, axis=1)

    The result is
    ``ce * (1 - FLAGS.support_loss_percent) - divergence * FLAGS.support_loss_percent``
    where ``ce`` compares the per-model predictions with the labels and
    ``divergence`` compares them with their (gradient-stopped) mean.
    """
    model_count = tf.shape(support_predictions)[1]
    vocab_size = tf.shape(support_predictions)[2]

    def merge_model_axis(tensor):
        # [batch, models, classes] -> [batch, models * classes]
        return tf.reshape(tensor, shape=[-1, model_count*vocab_size])

    mean_predictions = tf.reduce_mean(support_predictions, axis=1, keep_dims=True)

    # Repeat labels and mean predictions once per model.
    expanded_labels = tf.expand_dims(tf.cast(labels, dtype=tf.float32), axis=1)
    support_labels = tf.tile(expanded_labels, multiples=[1,model_count,1])
    support_means = tf.stop_gradient(tf.tile(mean_predictions, multiples=[1,model_count,1]))

    support_predictions = merge_model_axis(support_predictions)
    support_labels = merge_model_axis(support_labels)
    support_means = merge_model_axis(support_means)

    ce_loss_fn = CrossEntropyLoss()
    # The cross entropy between predictions and ground truth
    cross_entropy_loss = ce_loss_fn.calculate_loss(support_predictions, support_labels, **unused_params)
    # The cross entropy between predictions and mean predictions
    divergence = ce_loss_fn.calculate_loss(support_predictions, support_means, **unused_params)

    return cross_entropy_loss * (1.0 - FLAGS.support_loss_percent) - divergence * FLAGS.support_loss_percent
def FramePooling(frames, method, **unused_params):
    """Pools over the frames of a video.

    Args:
      frames: A tensor with shape [batch_size, num_frames, feature_size].
      method: "average", "max", or "none".

    Returns:
      A tensor with shape [batch_size, feature_size] for average or max
      pooling. A tensor with shape [batch_size*num_frames, feature_size]
      for none pooling.

    Raises:
      ValueError: if method is other than "average", "max", or "none"
        (attention pooling is not implemented here).
    """
    if method == "average":
        return tf.reduce_mean(frames, 1)
    if method == "max":
        return tf.reduce_max(frames, 1)
    if method == "none":
        # Tensors expose their static shape through get_shape().as_list();
        # there is no shape_as_list() attribute.
        feature_size = frames.get_shape().as_list()[2]
        return tf.reshape(frames, [-1, feature_size])
    raise ValueError("Unrecognized pooling method: %s" % method)
def resize_axis(tensor, axis, new_size, fill_value=0):
    """Truncates or pads `tensor` so that dimension `axis` has size `new_size`.

    Args:
      tensor: value convertible to a tensor.
      axis: index of the dimension to resize.
      new_size: target size of that dimension.
      fill_value: value used for the padded part (cast to the tensor dtype).

    Returns:
      The resized tensor, with its static shape updated along `axis`.
    """
    tensor = tf.convert_to_tensor(tensor)
    dims = tf.unstack(tf.shape(tensor))

    # Shape of the padding block: non-zero along `axis` only when growing.
    padding_dims = list(dims)
    padding_dims[axis] = tf.maximum(0, new_size - dims[axis])

    # Shape of the part of the input that is kept.
    dims[axis] = tf.minimum(dims[axis], new_size)
    kept_dims = tf.stack(dims)

    kept_part = tf.slice(tensor, tf.zeros_like(kept_dims), kept_dims)
    padding_part = tf.fill(tf.stack(padding_dims), tf.cast(fill_value, tensor.dtype))
    resized = tf.concat([kept_part, padding_part], axis)

    # Record the statically known result shape (as_list returns a fresh list).
    static_shape = tensor.get_shape().as_list()
    static_shape[axis] = new_size
    resized.set_shape(static_shape)
    return resized
def prepare_reader(self, filename_queue, batch_size=1024):
    """Reads up to `batch_size` serialized examples and parses them.

    Args:
      filename_queue: queue handed to `TFRecordReader.read_up_to`.
      batch_size: maximum number of records read per call.

    Returns:
      A tuple (video ids, features concatenated along axis 1 in the order of
      `self.feature_names`, label indicator matrix with `self.num_classes`
      columns, a vector of ones with one entry per example read).
    """
    reader = tf.TFRecordReader()
    _, serialized_examples = reader.read_up_to(filename_queue, batch_size)

    # set the mapping from the fields to data types in the proto
    num_features = len(self.feature_names)
    assert num_features > 0, "self.feature_names is empty!"
    assert len(self.feature_names) == len(self.feature_sizes), \
        "length of feature_names (={}) != length of feature_sizes (={})".format(
            len(self.feature_names), len(self.feature_sizes))

    feature_map = {
        "video_id": tf.FixedLenFeature([], tf.string),
        "labels": tf.VarLenFeature(tf.int64),
    }
    for position, feature_name in enumerate(self.feature_names):
        feature_map[feature_name] = tf.FixedLenFeature(
            [self.feature_sizes[position]], tf.float32)

    features = tf.parse_example(serialized_examples, features=feature_map)

    labels = tf.sparse_to_indicator(features["labels"], self.num_classes)
    labels.set_shape([None, self.num_classes])

    per_feature = [features[feature_name] for feature_name in self.feature_names]
    concatenated_features = tf.concat(per_feature, 1)

    example_count = tf.shape(serialized_examples)[0]
    return features["video_id"], concatenated_features, labels, tf.ones([example_count])
def SampleRandomFrames(model_input, num_frames, num_samples):
    """Gathers `num_samples` randomly chosen frames for every batch entry.

    Each frame position is obtained by scaling an independent
    `tf.random_uniform` draw by the entry's frame count and casting the
    product to int32, so the same frame may be selected more than once.

    Args:
      model_input: A tensor of size batch_size x max_frames x feature_size
      num_frames: A tensor of size batch_size x 1
      num_samples: A scalar

    Returns:
      A tensor of size batch_size x num_samples x feature_size
    """
    batch_size = tf.shape(model_input)[0]

    # Random frame position per (batch entry, sample).
    unit_draws = tf.random_uniform([batch_size, num_samples])
    frame_counts = tf.tile(tf.cast(num_frames, tf.float32), [1, num_samples])
    frame_index = tf.cast(tf.multiply(unit_draws, frame_counts), tf.int32)

    # Matching batch row number for every sampled position.
    row_numbers = tf.expand_dims(tf.range(batch_size), 1)
    batch_index = tf.tile(row_numbers, [1, num_samples])

    # (row, frame) pairs consumed by gather_nd.
    index = tf.stack([batch_index, frame_index], 2)
    return tf.gather_nd(model_input, index)
def FramePooling(frames, method, **unused_params):
    """Pools over the frames of a video.

    Args:
      frames: A tensor with shape [batch_size, num_frames, feature_size].
      method: "average", "max", or "none".

    Returns:
      A tensor with shape [batch_size, feature_size] for average or max
      pooling. A tensor with shape [batch_size*num_frames, feature_size]
      for none pooling.

    Raises:
      ValueError: if method is other than "average", "max", or "none"
        (attention pooling is not implemented here).
    """
    if method == "average":
        return tf.reduce_mean(frames, 1)
    if method == "max":
        return tf.reduce_max(frames, 1)
    if method == "none":
        # Tensors expose their static shape through get_shape().as_list();
        # there is no shape_as_list() attribute.
        feature_size = frames.get_shape().as_list()[2]
        return tf.reshape(frames, [-1, feature_size])
    raise ValueError("Unrecognized pooling method: %s" % method)
def sample_dtype(self):
    """Return the TensorFlow dtype reported for samples: ``tf.int32``."""
    return tf.int32
# WRONG SECOND DERIVATIVES
# class CategoricalPd(Pd):
# def __init__(self, logits):
# self.logits = logits
# self.ps = tf.nn.softmax(logits)
# @classmethod
# def fromflat(cls, flat):
# return cls(flat)
# def flatparam(self):
# return self.logits
# def mode(self):
# return U.argmax(self.logits, axis=1)
# def logp(self, x):
# return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x)
# def kl(self, other):
# return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \
# - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
# def entropy(self):
# return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
# def sample(self):
# u = tf.random_uniform(tf.shape(self.logits))
# return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1)
def compute_loss(self, decoder_output, _features, labels):
    """Computes the loss for this model.

    Targets are `labels["target_ids"]` without their first column, transposed;
    sequence lengths are `labels["target_len"] - 1` accordingly.

    Returns a tuple `(losses, loss)`, where `losses` is the tensor produced by
    `cross_entropy_sequence_loss` and `loss` is a single scalar tensor to
    minimize (the summed losses divided by the summed sequence lengths).
    """
    #pylint: disable=R0201
    # Calculate loss per example-timestep
    step_logits = decoder_output.logits[:, :, :]
    shifted_targets = tf.transpose(labels["target_ids"][:, 1:], [1, 0])
    shifted_lengths = labels["target_len"] - 1
    losses = seq2seq_losses.cross_entropy_sequence_loss(
        logits=step_logits,
        targets=shifted_targets,
        sequence_length=shifted_lengths)

    # Calculate the average log perplexity
    summed_loss = tf.reduce_sum(losses)
    token_count = tf.to_float(tf.reduce_sum(labels["target_len"] - 1))
    loss = summed_loss / token_count

    return losses, loss
def encode(self, inputs):
    """Encodes images with `inception_v3_base`.

    The inputs are resized bilinearly to
    `params["resize_height"]` x `params["resize_width"]` before being fed to
    the network.

    Returns:
      An `EncoderOutput` whose `outputs` and `attention_values` are the
      network output with all middle dimensions merged, whose `final_state`
      is the flattened network output, and whose `attention_values_length`
      is the size of the merged dimension.
    """
    target_size = [self.params["resize_height"], self.params["resize_width"]]
    inputs = tf.image.resize_images(
        images=inputs,
        size=target_size,
        method=tf.image.ResizeMethod.BILINEAR)

    outputs, _ = inception_v3_base(tf.to_float(inputs))
    output_shape = outputs.get_shape() #pylint: disable=E1101
    shape_list = output_shape.as_list()

    # Take attention over output elements in width and height dimension:
    # Shape: [B, W*H, ...]
    outputs_flat = tf.reshape(outputs, [shape_list[0], -1, shape_list[-1]])

    # NOTE(review): the average-pooled tensor built here is not used -- the
    # flatten below is applied to `outputs`, not to the pooled result, so the
    # final state has shape [B, W*H*...].  Confirm whether pooling was meant
    # to feed the final state before changing it (it alters the output size).
    tf.contrib.slim.avg_pool2d(
        outputs, output_shape[1:3], padding="VALID", scope="pool")
    final_state = tf.contrib.slim.flatten(outputs, scope="flatten")

    attention_length = tf.shape(outputs_flat)[1]
    return EncoderOutput(
        outputs=outputs_flat,
        final_state=final_state,
        attention_values=outputs_flat,
        attention_values_length=attention_length)
def position_encoding(sentence_size, embedding_size):
    """
    Position Encoding described in section 4.1 of
    End-To-End Memory Networks (https://arxiv.org/abs/1503.08895).

    Entry ``[j-1, k-1]`` (1-based ``j``, ``k``) equals
    ``(1 - j/ls) - (k/le) * (1 - 2*j/ls)`` with ``ls = sentence_size + 1``
    and ``le = embedding_size + 1``.

    Args:
      sentence_size: length of the sentence
      embedding_size: dimensionality of the embeddings
    Returns:
      A float32 numpy array of shape [sentence_size, embedding_size]
      containing the fixed position encodings for each sentence position.
    """
    ls = sentence_size + 1
    le = embedding_size + 1
    # Column vector of word positions and row vector of embedding indices;
    # broadcasting evaluates the formula for every (j, k) pair at once instead
    # of a Python-level double loop.  The arithmetic is done in float64 and
    # rounded to float32 at the end, like the element-wise assignment it
    # replaces.
    j = np.arange(1, ls, dtype=np.float64)[:, np.newaxis]
    k = np.arange(1, le, dtype=np.float64)[np.newaxis, :]
    encoding = (1.0 - j / ls) - (k / le) * (1.0 - 2.0 * j / ls)
    return encoding.astype(np.float32)
def _add_mh_correction(self, initial_position, initial_velocity, final_position, final_velocity):
    """ Applies MH accept/reject correction.

    Where the proposal is accepted the final position/velocity are kept;
    elsewhere the initial position and the negated initial velocity are
    returned.  The float acceptance mask is also stored in `self._accepted`.
    """
    energy_before = self._hamiltonian(initial_position, initial_velocity)
    energy_after = self._hamiltonian(final_position, final_velocity)
    accepted = tf.to_float(self._metropolis_hastings_accept(energy_before, energy_after))

    # add acceptance to fetched values
    self._accepted = accepted

    if self.seek_step_sizes or self.fade_in_velocities:
        # Until burn-in has finished every proposal is treated as accepted.
        # NOTE(review): `== 1` is a Python-level comparison; if
        # `_burn_in_ratio` is a tensor this may not compare values -- confirm.
        burned_in = tf.to_float(self._burn_in_ratio == 1)
        accepted = accepted * burned_in + tf.ones(shape=tf.shape(accepted)) * (1 - burned_in)

    # apply MH decision
    kept_position = self._transpose_mul(final_position, accepted)
    restored_position = self._transpose_mul(
        initial_position, tf.ones(shape=tf.shape(accepted)) - accepted)
    final_position = kept_position + restored_position

    kept_velocity = self._transpose_mul(final_velocity, accepted)
    reversed_velocity = self._transpose_mul(
        -initial_velocity, tf.ones(shape=tf.shape(accepted)) - accepted)
    final_velocity = kept_velocity + reversed_velocity

    return final_position, final_velocity
def _leapfrog_step(self, position, velocity, velocity_step_multiplier=1.):
    """ Makes a single leapfrog step with friction.

    The velocity receives the energy-gradient update (scaled by
    `velocity_step_multiplier` and the current step size), a friction term
    and Gaussian noise; the position is then advanced with the new velocity.

    Returns the updated `(position, velocity)` pair.
    """
    energy_gradient = self._d_energy_fn(position)
    friction = self.friction
    step_size = self._current_step_size

    # Friction term is computed from the velocity before the gradient update.
    deceleration = -friction * self._transpose_mul(velocity, step_size)

    velocity -= self._transpose_mul(energy_gradient, velocity_step_multiplier * step_size)
    velocity += deceleration

    # B_hat = 0, C = friction
    noise = tf.random_normal(tf.shape(velocity))
    stddevs = (2 * friction * step_size) ** 0.5
    velocity += self._transpose_mul(noise, stddevs)

    position = position + self._transpose_mul(velocity, step_size)
    return position, velocity
def get_optimizer(self, learning_rate = 0.001):
    """Builds the sequence loss and an Adam training op with clipped gradients.

    Every position is weighted equally (a weight matrix of ones shaped like
    the first two dimensions of `self.inputs`).  Gradients are clipped
    element-wise to [-1, 1]; variables without a gradient are skipped.

    Returns:
        A tuple `(optimizer_op, loss)`.
    """
    with tf.name_scope('loss'):
        input_shape = tf.shape(self.inputs)
        weights = tf.ones([input_shape[0], input_shape[1]])
        loss = tf.contrib.seq2seq.sequence_loss(self.logits, self.targets, weights)

    #-----------------------------------------------------------------------
    # Build the optimizer
    #-----------------------------------------------------------------------
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate)
        capped_gradients = []
        for grad, var in optimizer.compute_gradients(loss):
            if grad is None:
                continue
            capped_gradients.append((tf.clip_by_value(grad, -1., 1.), var))
        optimizer_op = optimizer.apply_gradients(capped_gradients)

    return optimizer_op, loss
def decode_jpeg(image_buffer, scope=None):  # , dtype=tf.float32):
    """Decode a JPEG string into one 3-channel image Tensor.

    Args:
      image_buffer: scalar string Tensor.
      scope: Optional name scope; 'decode_jpeg' is used when it is falsy.
    Returns:
      The Tensor produced by `tf.image.decode_jpeg` with `channels=3`.
      No dtype conversion or rescaling is applied in this function.
    """
    with tf.name_scope(scope or 'decode_jpeg'):
        # Decode the string as an RGB JPEG.
        # Note that the resulting image contains an unknown height and width
        # that is set dynamically by decode_jpeg. In other words, the height
        # and width of image is unknown at compile-time.
        return tf.image.decode_jpeg(
            image_buffer,
            channels=3,
            fancy_upscaling=False,
            dct_method='INTEGER_FAST')
def _crop_pool_layer(self, bottom, rois, name):
    """Crops the regions given by `rois` out of `bottom` and resizes them.

    Column 0 of `rois` is used as the batch index and columns 1-4 as
    x1, y1, x2, y2, which are normalised by the feature-map extent times
    `self._feat_stride[0]`.  With `cfg.RESNET.MAX_POOL` set, crops are taken
    at twice `cfg.POOLING_SIZE` and then max-pooled 2x2; otherwise they are
    taken at `cfg.POOLING_SIZE` directly.
    """
    def roi_column(position, label):
        # Single column of `rois`, kept 2-D.
        return tf.slice(rois, [0, position], [-1, 1], name=label)

    with tf.variable_scope(name):
        batch_ids = tf.squeeze(roi_column(0, "batch_id"), [1])

        # Get the normalized coordinates of bboxes
        bottom_shape = tf.shape(bottom)
        stride = np.float32(self._feat_stride[0])
        height = (tf.to_float(bottom_shape[1]) - 1.) * stride
        width = (tf.to_float(bottom_shape[2]) - 1.) * stride
        x1 = roi_column(1, "x1") / width
        y1 = roi_column(2, "y1") / height
        x2 = roi_column(3, "x2") / width
        y2 = roi_column(4, "y2") / height

        # Won't be back-propagated to rois anyway, but to save time
        bboxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], 1))

        use_max_pool = cfg.RESNET.MAX_POOL
        crop_side = cfg.POOLING_SIZE * 2 if use_max_pool else cfg.POOLING_SIZE
        crops = tf.image.crop_and_resize(
            bottom, bboxes, tf.to_int32(batch_ids), [crop_side, crop_side],
            name="crops")
        if use_max_pool:
            crops = slim.max_pool2d(crops, [2, 2], padding='SAME')

    return crops
# Do the first few layers manually, because 'SAME' padding can behave inconsistently
# for images of different sizes: sometimes 0, sometimes 1
def _crop_pool_layer(self, bottom, rois, name):
    """Crops the regions given by `rois` out of `bottom`, then max-pools them.

    Column 0 of `rois` is used as the batch index and columns 1-4 as
    x1, y1, x2, y2, which are normalised by the feature-map extent times
    `self._feat_stride[0]`.  Crops are taken at twice `cfg.POOLING_SIZE` and
    reduced by a 2x2 max pool.
    """
    def roi_column(position, label):
        # Single column of `rois`, kept 2-D.
        return tf.slice(rois, [0, position], [-1, 1], name=label)

    with tf.variable_scope(name):
        batch_ids = tf.squeeze(roi_column(0, "batch_id"), [1])

        # Get the normalized coordinates of bounding boxes
        bottom_shape = tf.shape(bottom)
        stride = np.float32(self._feat_stride[0])
        height = (tf.to_float(bottom_shape[1]) - 1.) * stride
        width = (tf.to_float(bottom_shape[2]) - 1.) * stride
        x1 = roi_column(1, "x1") / width
        y1 = roi_column(2, "y1") / height
        x2 = roi_column(3, "x2") / width
        y2 = roi_column(4, "y2") / height

        # Won't be back-propagated to rois anyway, but to save time
        bboxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], axis=1))

        crop_side = cfg.POOLING_SIZE * 2
        crops = tf.image.crop_and_resize(
            bottom, bboxes, tf.to_int32(batch_ids), [crop_side, crop_side],
            name="crops")

    return slim.max_pool2d(crops, [2, 2], padding='SAME')
def _anchor_component(self):
    """Generates the anchors and stores them on the instance.

    The grid height and width are the first two entries of `self._im_info`
    divided by `self._feat_stride[0]`, rounded up.  `generate_anchors_pre` is
    run through `tf.py_func`; its results are saved as `self._anchors`
    (static shape [None, 4]) and `self._anchor_length` (scalar).
    """
    with tf.variable_scope('ANCHOR_' + self._tag):
        # just to get the shape right
        stride = self._feat_stride[0]
        height = tf.to_int32(tf.ceil(self._im_info[0] / np.float32(stride)))
        width = tf.to_int32(tf.ceil(self._im_info[1] / np.float32(stride)))

        generator_inputs = [height, width,
                            self._feat_stride, self._anchor_scales, self._anchor_ratios]
        anchors, anchor_length = tf.py_func(
            generate_anchors_pre,
            generator_inputs,
            [tf.float32, tf.int32],
            name="generate_anchors")

        # py_func outputs carry no static shape information; restore it.
        anchors.set_shape([None, 4])
        anchor_length.set_shape([])

        self._anchors = anchors
        self._anchor_length = anchor_length
# [Hand Detection] Batch normalization
# http://stackoverflow.com/a/34634291/2267819
# Note that this is different from the paper(they use another method)