def input_stream(record_path, scope=None):
    """
    Build the ops that read and decode one example from a TFRecord file.

    Every op is placed on '/cpu:0' and created under the variable scope
    `scope` ('input_stream' when `scope` is falsy).

    ARGS
      `record_path`: tf records file path
      `scope`: optional variable scope (or scope name)
    RETURN
      `streams`: dict of data streams with the keys
        'image'         - JPEG decoded with 3 channels and cast to float32
        'image_name'    - the parsed 'image_name' feature
        'image_jpeg'    - the parsed (still encoded) 'image_jpeg' feature
        'word_polygons' - the 'word_polygons' feature densified and
                          reshaped to [-1, WORD_POLYGON_DIM]
    """
    feature_spec = {
        'image_jpeg': tf.FixedLenFeature([], tf.string),
        'image_name': tf.FixedLenFeature([], tf.string),
        'word_polygons': tf.VarLenFeature(tf.float32),
        # FIXME: 'words' (tf.VarLenFeature(tf.string)) is not parsed yet
        # because of a problem with parsing words.
    }

    with tf.device('/cpu:0'), tf.variable_scope(scope or 'input_stream'):
        record_reader = tf.TFRecordReader()
        path_queue = tf.train.string_input_producer([record_path],
                                                    num_epochs=None)
        _, serialized = record_reader.read(path_queue)
        parsed = tf.parse_single_example(serialized, feature_spec)

        # Decode the JPEG bytes into a float image.
        decoded_jpeg = tf.image.decode_jpeg(parsed['image_jpeg'], channels=3)
        image = tf.cast(decoded_jpeg, tf.float32)

        # Densify the bounding polygons and put one polygon on each row.
        dense_polygons = tf.sparse_tensor_to_dense(parsed['word_polygons'])
        word_polygons = tf.reshape(dense_polygons, [-1, WORD_POLYGON_DIM])

        return {
            'image': image,
            'image_name': parsed['image_name'],
            'image_jpeg': parsed['image_jpeg'],
            'word_polygons': word_polygons,
        }
python类VarLenFeature()的实例源码
def __init__(self, context_keys_to_features, sequence_keys_to_features,
             items_to_handlers):
    """Constructs the decoder.

    The three arguments are stored unchanged on the instance; nothing is
    validated or copied here.

    Args:
      context_keys_to_features: a dictionary from context feature keys to
        feature specs such as tf.VarLenFeature or tf.FixedLenFeature
        instances. See tensorflow's parsing_ops.py.
      sequence_keys_to_features: a dictionary from sequence feature keys to
        feature specs. NOTE(review): presumably the per-step features of a
        SequenceExample -- confirm against the method that consumes it.
      items_to_handlers: a dictionary from items (strings) to ItemHandler
        instances. Note that the ItemHandler's are provided the keys that they
        use to return the final item Tensors.
    """
    self._context_keys_to_features = context_keys_to_features
    self._sequence_keys_to_features = sequence_keys_to_features
    self._items_to_handlers = items_to_handlers
def testMakeOutputDictError(self):
    """Checks that make_output_dict raises ValueError for bad sparse fetches.

    Each case feeds one SparseTensorValue for column 'a' (declared as a
    string VarLenFeature) and expects a ValueError whose message matches
    the paired pattern.
    """
    schema = self.toSchema({'a': tf.VarLenFeature(tf.string)})
    not_decodable = 'cannot be decoded by ListColumnRepresentation'
    out_of_order = 'Encountered out-of-order sparse index'

    failing_cases = [
        # SparseTensor that cannot be represented as VarLenFeature.
        (tf.SparseTensorValue(indices=np.array([(0, 2), (0, 4), (0, 8)]),
                              values=np.array([10.0, 20.0, 30.0]),
                              dense_shape=(1, 20)),
         not_decodable),
        # SparseTensor of invalid rank.
        (tf.SparseTensorValue(
            indices=np.array([(0, 0, 1), (0, 0, 2), (0, 0, 3)]),
            values=np.array([10.0, 20.0, 30.0]),
            dense_shape=(1, 10, 10)),
         not_decodable),
        # SparseTensor with indices that are out of order.
        (tf.SparseTensorValue(indices=np.array([(0, 2), (2, 4), (1, 8)]),
                              values=np.array([10.0, 20.0, 30.0]),
                              dense_shape=(3, 20)),
         out_of_order),
    ]

    for sparse_value, expected_message in failing_cases:
        with self.assertRaisesRegexp(ValueError, expected_message):
            _ = impl_helper.make_output_dict(schema, {'a': sparse_value})
def testRunPreprocessingFn(self):
    """Checks dtypes and shapes of the tensors from run_preprocessing_fn.

    Builds a schema with two dense, one var-len and one sparse column, runs
    a preprocessing function over it, and compares every input and output
    tensor against an expected (dtype, shape) table.
    """
    feature_spec = {
        'dense_1': tf.FixedLenFeature((), tf.float32),
        'dense_2': tf.FixedLenFeature((1, 2), tf.int64),
        'var_len': tf.VarLenFeature(tf.string),
        'sparse': tf.SparseFeature('ix', 'val', tf.float32, 100)
    }
    schema = self.toSchema(feature_spec)

    def preprocessing_fn(inputs):
        scaled = mappers.scale_to_0_1(inputs['dense_1'])
        reshaped = tf.sparse_reshape(inputs['sparse'], (1, 10))
        return {'dense_out': scaled, 'sparse_out': reshaped}

    _, inputs, outputs = impl_helper.run_preprocessing_fn(
        preprocessing_fn, schema)

    # Verify that the input placeholders (and the outputs) have the correct
    # types; every shape carries a leading unknown batch dimension.
    float_vector = (tf.float32, tf.TensorShape([None]))
    float_matrix = (tf.float32, tf.TensorShape([None, None]))
    expected_dtype_and_shape = {
        'dense_1': float_vector,
        'dense_2': (tf.int64, tf.TensorShape([None, 1, 2])),
        'var_len': (tf.string, tf.TensorShape([None, None])),
        'sparse': float_matrix,
        'dense_out': float_vector,
        'sparse_out': float_matrix,
    }
    for tensors in (inputs, outputs):
        for key, tensor in six.iteritems(tensors):
            expected_dtype, expected_shape = expected_dtype_and_shape[key]
            self.assertEqual(tensor.dtype, expected_dtype)
            tensor.get_shape().assert_is_compatible_with(expected_shape)
def as_feature_spec(self):
    """Return the feature spec that describes this ColumnSchema.

    The work is delegated to this column's representation, which receives
    the column itself as its argument. A feature spec (for a specific
    column) is one of a FixedLenFeature, SparseFeature or VarLenFeature.

    Returns:
      Whatever `self.representation.as_feature_spec(self)` returns.
    """
    representation = self.representation
    return representation.as_feature_spec(self)
def as_feature_spec(self, column):
    """Return a tf.VarLenFeature carrying the dtype of `column`'s domain.

    Args:
      column: object whose `domain.dtype` gives the feature dtype.

    Returns:
      `tf.VarLenFeature(column.domain.dtype)`.

    Raises:
      ValueError: if the dtype is not in _TF_EXAMPLE_ALLOWED_TYPES.
    """
    dtype = column.domain.dtype
    if dtype in _TF_EXAMPLE_ALLOWED_TYPES:
        return tf.VarLenFeature(dtype)

    message = ('tf.Example parser supports only types {}, so it is '
               'invalid to generate a feature_spec with type '
               '{}.')
    raise ValueError(message.format(_TF_EXAMPLE_ALLOWED_TYPES, repr(dtype)))
def from_feature_spec(feature_spec):
    """Build a Schema from a feature_spec dictionary.

    Args:
      feature_spec: a features specification in the format expected by
        tf.parse_example(), i.e.
        `{name: FixedLenFeature(...), name: VarLenFeature(...), ...}`

    Returns:
      A Schema with one column per key of `feature_spec`, each column being
      the result of `_from_parse_feature` on the corresponding value.
    """
    column_schemas = {}
    for name, parse_feature in six.iteritems(feature_spec):
        column_schemas[name] = _from_parse_feature(parse_feature)
    return Schema(column_schemas)
def _from_parse_feature(parse_feature):
    """Translate one parsing feature spec into a ColumnSchema.

    Args:
      parse_feature: a tf.FixedLenFeature, tf.VarLenFeature or
        tf.SparseFeature instance.

    Returns:
      A ColumnSchema using a fixed, list or sparse column representation
      respectively.

    Raises:
      ValueError: for tf.FixedLenSequenceFeature (not supported yet) and for
        any other unrecognised spec type.
    """
    # FixedLenFeature: keeps its shape and default value.
    if isinstance(parse_feature, tf.FixedLenFeature):
        return ColumnSchema(
            parse_feature.dtype,
            parse_feature.shape,
            FixedColumnRepresentation(parse_feature.default_value))

    # FixedLenSequenceFeature: explicitly rejected.
    if isinstance(parse_feature, tf.FixedLenSequenceFeature):
        raise ValueError('DatasetSchema does not support '
                         'FixedLenSequenceFeature yet.')

    # VarLenFeature: a single axis of unknown size.
    if isinstance(parse_feature, tf.VarLenFeature):
        return ColumnSchema(parse_feature.dtype, [None],
                            ListColumnRepresentation())

    # SparseFeature: one index field plus a value field.
    if isinstance(parse_feature, tf.SparseFeature):
        sparse_index = SparseIndexField(
            name=parse_feature.index_key,
            is_sorted=parse_feature.already_sorted)
        sparse_representation = SparseColumnRepresentation(
            value_field_name=parse_feature.value_key,
            index_fields=[sparse_index])
        return ColumnSchema(parse_feature.dtype, [parse_feature.size],
                            sparse_representation)

    raise ValueError('Cannot interpret feature spec {} with type {}'.format(
        parse_feature, type(parse_feature)))
def infer_column_schema_from_tensor(tensor):
    """Derive a ColumnSchema from a dense or sparse tensor.

    Args:
      tensor: a tf.SparseTensor, or an object exposing `get_shape()` and
        `dtype`.

    Returns:
      A ColumnSchema with the tensor's dtype. Dense tensors get a fixed
      representation with axes taken from the static shape minus the batch
      dimension; sparse tensors get a list representation with one unknown
      axis.
    """
    if not isinstance(tensor, tf.SparseTensor):
        dense_axes = _shape_to_axes(tensor.get_shape(),
                                    remove_batch_dimension=True)
        return ColumnSchema(tensor.dtype, dense_axes,
                            FixedColumnRepresentation())

    # For SparseTensor, there's insufficient information to distinguish
    # between ListColumnRepresentation and SparseColumnRepresentation. So we
    # just guess the former, and callers are expected to handle the latter
    # case on their own (e.g. by requiring the user to provide the schema).
    # This policy is motivated by the prevalence of VarLenFeature in current
    # tf.Learn code.
    return ColumnSchema(tensor.dtype, [Axis(None)],
                        ListColumnRepresentation())
def _decode(message):
    """Parse a serialized example into its key and dense vector.

    Args:
      message: serialized example handed to tf.parse_single_example.

    Returns:
      A `(key, vector)` pair: the scalar int64 'key' feature and the
      variable-length int64 'vector' feature converted to a dense tensor.
    """
    feature_spec = {
        'key': tf.FixedLenFeature([], tf.int64),
        'vector': tf.VarLenFeature(tf.int64),
    }
    example = tf.parse_single_example(serialized=message,
                                      features=feature_spec)
    return example['key'], tf.sparse_tensor_to_dense(example['vector'])
def frame_example_2_np(seq_example_bytes,
                       max_quantized_value=2,
                       min_quantized_value=-2):
    """Decode one serialized frame-level SequenceExample into evaluated values.

    A fresh graph is built for the call. The 'rgb' and 'audio' sequence
    features are decoded from raw uint8 bytes, cast to float32, reshaped to
    [-1, 1024] and [-1, 128] and passed through utils.Dequantize.

    Args:
      seq_example_bytes: serialized example given to
        tf.parse_single_sequence_example.
      max_quantized_value: forwarded to utils.Dequantize.
      min_quantized_value: forwarded to utils.Dequantize.

    Returns:
      A tuple `(vid, labs, rgb, audio)`: the 'video_id' context feature, the
      values of the sparse 'labels' context feature, and the dequantized rgb
      and audio matrices.
    """
    feature_names = ['rgb', 'audio']
    feature_sizes = [1024, 128]

    with tf.Graph().as_default():
        contexts, features = tf.parse_single_sequence_example(
            seq_example_bytes,
            context_features={
                "video_id": tf.FixedLenFeature([], tf.string),
                "labels": tf.VarLenFeature(tf.int64),
            },
            sequence_features={
                feature_name: tf.FixedLenSequenceFeature([], dtype=tf.string)
                for feature_name in feature_names
            })

        decoded_features = {
            name: tf.reshape(
                tf.cast(tf.decode_raw(features[name], tf.uint8), tf.float32),
                [-1, size])
            for name, size in zip(feature_names, feature_sizes)
        }
        feature_matrices = {
            name: utils.Dequantize(decoded_features[name],
                                   max_quantized_value, min_quantized_value)
            for name in feature_names
        }

        with tf.Session() as sess:
            # Fetch everything in one run so the example is parsed and
            # decoded once instead of once per fetched tensor.
            vid, labs, rgb, audio = sess.run([
                contexts['video_id'],
                contexts['labels'].values,
                feature_matrices['rgb'],
                feature_matrices['audio'],
            ])

    return vid, labs, rgb, audio
#%% Split frame level file into three video level files: all, 1st half, 2nd half.
def build_graph():
    """Add the frame-level SequenceExample decoding ops to the default graph.

    A string placeholder receives the serialized example. The 'rgb' and
    'audio' sequence features are decoded from raw uint8 bytes, cast to
    float32, reshaped to [-1, 1024] and [-1, 128] and dequantized with the
    range [-2, 2]. The resulting tensors and the placeholder are registered
    in the graph collections "vid_tsr", "labs_tsr", "rgb_tsr", "audio_tsr"
    and "seq_example_bytes". Nothing is returned.
    """
    max_quantized_value = 2
    min_quantized_value = -2
    feature_names = ['rgb', 'audio']
    feature_sizes = [1024, 128]

    seq_example_bytes = tf.placeholder(tf.string)

    context_spec = {
        "video_id": tf.FixedLenFeature([], tf.string),
        "labels": tf.VarLenFeature(tf.int64),
    }
    sequence_spec = {}
    for feature_name in feature_names:
        sequence_spec[feature_name] = tf.FixedLenSequenceFeature(
            [], dtype=tf.string)

    contexts, features = tf.parse_single_sequence_example(
        seq_example_bytes,
        context_features=context_spec,
        sequence_features=sequence_spec)

    # First pass: raw bytes -> float32 matrices with one frame per row.
    decoded_features = {}
    for name, size in zip(feature_names, feature_sizes):
        raw_bytes = tf.decode_raw(features[name], tf.uint8)
        decoded_features[name] = tf.reshape(
            tf.cast(raw_bytes, tf.float32), [-1, size])

    # Second pass: map the quantized values back to their float range.
    feature_matrices = {}
    for name in feature_names:
        feature_matrices[name] = utils.Dequantize(
            decoded_features[name], max_quantized_value, min_quantized_value)

    tf.add_to_collection("vid_tsr", contexts['video_id'])
    tf.add_to_collection("labs_tsr", contexts['labels'].values)
    tf.add_to_collection("rgb_tsr", feature_matrices['rgb'])
    tf.add_to_collection("audio_tsr", feature_matrices['audio'])
    tf.add_to_collection("seq_example_bytes", seq_example_bytes)
# with tf.Session() as sess:
# writer = tf.summary.FileWriter('./graphs', sess.graph)
def input_pipeline(file_pattern, mode, capacity=64):
    """Read and decode "inputs"/"targets" examples from TFRecord files.

    Args:
      file_pattern: file pattern passed to the slim parallel reader.
      mode: the string "train" selects training behaviour (unlimited epochs,
        shuffling, up to 4 readers); any other value reads one epoch,
        unshuffled, with a single reader.
      capacity: `min_after_dequeue` of the examples queue; the queue capacity
        itself is twice this value.

    Returns:
      A dict mapping "inputs" and "targets" to their decoded tensors cast to
      int32.
    """
    keys_to_features = {
        "inputs": tf.VarLenFeature(tf.int64),
        "targets": tf.VarLenFeature(tf.int64)
    }
    items_to_handlers = {
        "inputs": tfexample_decoder.Tensor("inputs"),
        "targets": tfexample_decoder.Tensor("targets")
    }
    # One item list both requests the decoded tensors and names them.
    # Zipping the result with the keys of a different dict could pair a
    # tensor with the wrong field wherever dict order is not guaranteed.
    items = list(items_to_handlers)

    # Now the non-trivial case construction.
    # NOTE(review): the extent of this name scope was reconstructed from
    # unindented source -- confirm which statements belong inside it.
    with tf.name_scope("examples_queue"):
        training = (mode == "train")
        # Read serialized examples using slim parallel_reader.
        num_epochs = None if training else 1
        data_files = parallel_reader.get_data_files(file_pattern)
        num_readers = min(4 if training else 1, len(data_files))
        _, examples = parallel_reader.parallel_read([file_pattern],
                                                    tf.TFRecordReader,
                                                    num_epochs=num_epochs,
                                                    shuffle=training,
                                                    capacity=2 * capacity,
                                                    min_after_dequeue=capacity,
                                                    num_readers=num_readers)
        decoder = tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                     items_to_handlers)

    decoded = decoder.decode(examples, items=items)
    # We do not want int64s as they are not supported on GPUs.
    return {field: tf.to_int32(tensor)
            for field, tensor in zip(items, decoded)}
def read_and_decode_single_example(filename_queue):
    """Read one serialized example from the queue and parse its image fields.

    Args:
      filename_queue: queue of file names consumed by a tf.TFRecordReader.

    Returns:
      A `(label, image, image_format)` tuple of tensors taken from the
      'image/class/label', 'image/encoded' and 'image/format' features. The
      image is returned still encoded; it is not decoded here.
    """
    # Unlike the TFRecordWriter, the TFRecordReader is symbolic: reading
    # yields a string tensor holding a single serialized example.
    record_reader = tf.TFRecordReader()
    _, serialized_example = record_reader.read(filename_queue)

    # Describe the format of the stored fields. The length of every field
    # is known; otherwise tf.VarLenFeature could be used.
    feature_spec = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string,
                                           default_value='png'),
        'image/class/label': tf.FixedLenFeature(
            [], tf.int64, default_value=tf.zeros([], dtype=tf.int64)),
    }
    parsed = tf.parse_single_example(serialized_example,
                                     features=feature_spec)

    return (parsed['image/class/label'],
            parsed['image/encoded'],
            parsed['image/format'])
def example_reading_spec(self):
    """Describe how stored examples are parsed.

    Returns:
      A `(data_fields, data_items_to_decoders)` tuple. `data_fields` maps
      "inputs" and "targets" to int64 VarLenFeatures and "floats" to a
      float32 VarLenFeature; `data_items_to_decoders` is None.
    """
    data_fields = dict(
        inputs=tf.VarLenFeature(tf.int64),
        targets=tf.VarLenFeature(tf.int64),
        floats=tf.VarLenFeature(tf.float32),
    )
    return data_fields, None
def example_reading_spec(self):
    """Describe how stored examples are parsed.

    Returns:
      A `(data_fields, data_items_to_decoders)` tuple. `data_fields` maps
      "inputs" and "targets" to int64 VarLenFeatures and "floats" to a
      float32 VarLenFeature; `data_items_to_decoders` is None.
    """
    data_fields = dict(
        inputs=tf.VarLenFeature(tf.int64),
        targets=tf.VarLenFeature(tf.int64),
        floats=tf.VarLenFeature(tf.float32),
    )
    return data_fields, None
def example_reading_spec(self):
    """Describe how stored examples are parsed.

    Returns:
      A `(data_fields, data_items_to_decoders)` tuple. `data_fields` maps
      "inputs" and "targets" to int64 VarLenFeatures;
      `data_items_to_decoders` is None.
    """
    data_fields = dict(
        inputs=tf.VarLenFeature(tf.int64),
        targets=tf.VarLenFeature(tf.int64),
    )
    return data_fields, None
tfrecord_read.py 文件源码
项目:Youtube8mdataset_kagglechallenge
作者: jasonlee27
项目源码
文件源码
阅读 22
收藏 0
点赞 0
评论 0
def prepare_reader(self,
                   filename_queue,
                   max_quantized_value=2,
                   min_quantized_value=-2):
    """Build the ops that read and decode one example from the queue.

    With `self.sequence_data` set, the two features named by
    `self.feature_name` are parsed as byte-string sequence features, decoded
    from uint8, reshaped to `self.feature_size` columns and dequantized.
    Otherwise they are read directly as float32 context features.

    Args:
      filename_queue: queue of file names consumed by a tf.TFRecordReader.
      max_quantized_value: forwarded to Dequantize (sequence data only).
      min_quantized_value: forwarded to Dequantize (sequence data only).

    Returns:
      A tuple `(video_id, video_matrix, audio_matrix, labels, num_frames)`.
      For sequence data `num_frames` is the row count of the decoded second
      feature capped at `self.max_frames`; otherwise it is the constant -1.
    """
    video_key, audio_key = self.feature_name[0], self.feature_name[1]
    video_size, audio_size = self.feature_size[0], self.feature_size[1]

    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)

    context_features = {
        "video_id": tf.FixedLenFeature([], tf.string),
        "labels": tf.VarLenFeature(tf.int64),
    }
    if self.sequence_data:
        sequence_features = {
            video_key: tf.FixedLenSequenceFeature([], dtype=tf.string),
            audio_key: tf.FixedLenSequenceFeature([], dtype=tf.string),
        }
    else:
        sequence_features = None
        context_features[video_key] = tf.FixedLenFeature(video_size,
                                                         tf.float32)
        context_features[audio_key] = tf.FixedLenFeature(audio_size,
                                                         tf.float32)

    contexts, features = tf.parse_single_sequence_example(
        serialized_example,
        context_features=context_features,
        sequence_features=sequence_features)
    labels = tf.cast(contexts["labels"].values, tf.int64)

    if not self.sequence_data:
        # Already-aggregated features come straight from the context.
        return (contexts["video_id"], contexts[video_key],
                contexts[audio_key], labels, tf.constant(-1))

    video_bytes = tf.decode_raw(features[video_key], tf.uint8)
    video_decoded = tf.reshape(tf.cast(video_bytes, tf.float32),
                               [-1, video_size])
    video_matrix = Dequantize(video_decoded, max_quantized_value,
                              min_quantized_value)

    audio_bytes = tf.decode_raw(features[audio_key], tf.uint8)
    audio_decoded = tf.reshape(tf.cast(audio_bytes, tf.float32),
                               [-1, audio_size])
    audio_matrix = Dequantize(audio_decoded, max_quantized_value,
                              min_quantized_value)

    # The frame count is taken from the second (audio) feature. The matrices
    # themselves are not padded or truncated to `max_frames` here.
    num_frames = tf.minimum(tf.shape(audio_decoded)[0], self.max_frames)
    return contexts["video_id"], video_matrix, audio_matrix, labels, num_frames
def testFromCSVWithFeatureSpec(self):
    """Compares a feature-spec CSV read against pandas on the same file.

    The CSV is loaded once with pandas and once through
    TensorFlowDataFrame.from_csv_with_feature_spec; the sparse "float" and
    "bool" columns are densified with NaN before the frames are compared.
    The test returns without checking anything when pandas is unavailable.
    """
    if not HAS_PANDAS:
        return
    num_batches = 100
    batch_size = 8

    data_path = _make_test_csv_sparse()
    feature_spec = {
        "int": tf.FixedLenFeature(None, dtypes.int16, np.nan),
        "float": tf.VarLenFeature(dtypes.float16),
        "bool": tf.VarLenFeature(dtypes.bool),
        "string": tf.FixedLenFeature(None, dtypes.string, "")
    }

    pandas_df = pd.read_csv(data_path, dtype={"string": object})
    # Pandas insanely uses NaN for empty cells in a string column.
    # And, we can't use Pandas replace() to fix them because nan != nan
    s = pandas_df["string"]
    for i in range(len(s)):
        if isinstance(s[i], float) and math.isnan(s[i]):
            # `.at` replaces DataFrame.set_value, which newer pandas
            # releases no longer provide.
            pandas_df.at[i, "string"] = ""

    tensorflow_df = df.TensorFlowDataFrame.from_csv_with_feature_spec(
        [data_path],
        batch_size=batch_size,
        shuffle=False,
        feature_spec=feature_spec)

    # These columns were sparse; re-densify them for comparison
    tensorflow_df["float"] = densify.Densify(np.nan)(tensorflow_df["float"])
    tensorflow_df["bool"] = densify.Densify(np.nan)(tensorflow_df["bool"])

    self._assert_pandas_equals_tensorflow(pandas_df,
                                          tensorflow_df,
                                          num_batches=num_batches,
                                          batch_size=batch_size)
def testFromCSVWithFeatureSpec(self):
    """Compares a feature-spec CSV read against pandas on the same file.

    The CSV is loaded once with pandas and once through
    TensorFlowDataFrame.from_csv_with_feature_spec; the sparse "float" and
    "bool" columns are densified with NaN before the frames are compared.
    The test returns without checking anything when pandas is unavailable.
    """
    if not HAS_PANDAS:
        return
    num_batches = 100
    batch_size = 8

    data_path = _make_test_csv_sparse()
    feature_spec = {
        "int": tf.FixedLenFeature(None, dtypes.int16, np.nan),
        "float": tf.VarLenFeature(dtypes.float16),
        "bool": tf.VarLenFeature(dtypes.bool),
        "string": tf.FixedLenFeature(None, dtypes.string, "")
    }

    pandas_df = pd.read_csv(data_path, dtype={"string": object})
    # Pandas insanely uses NaN for empty cells in a string column.
    # And, we can't use Pandas replace() to fix them because nan != nan
    s = pandas_df["string"]
    for i in range(len(s)):
        if isinstance(s[i], float) and math.isnan(s[i]):
            # `.at` replaces DataFrame.set_value, which newer pandas
            # releases no longer provide.
            pandas_df.at[i, "string"] = ""

    tensorflow_df = df.TensorFlowDataFrame.from_csv_with_feature_spec(
        [data_path],
        batch_size=batch_size,
        shuffle=False,
        feature_spec=feature_spec)

    # These columns were sparse; re-densify them for comparison
    tensorflow_df["float"] = densify.Densify(np.nan)(tensorflow_df["float"])
    tensorflow_df["bool"] = densify.Densify(np.nan)(tensorflow_df["bool"])

    self._assert_pandas_equals_tensorflow(pandas_df,
                                          tensorflow_df,
                                          num_batches=num_batches,
                                          batch_size=batch_size)