def testNGramsWithSpaceSeparator(self):
string_tensor = tf.constant(['One was Johnny', 'Two was a rat'])
tokenized_tensor = tf.string_split(string_tensor, delimiter=' ')
output_tensor = mappers.ngrams(
tokens=tokenized_tensor,
ngram_range=(1, 2),
separator=' ')
with tf.Session():
output = output_tensor.eval()
self.assertAllEqual(
output.indices,
[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]])
self.assertAllEqual(output.values, [
'One', 'One was', 'was', 'was Johnny', 'Johnny',
'Two', 'Two was', 'was', 'was a', 'a', 'a rat', 'rat'])
self.assertAllEqual(output.dense_shape, [2, 7])
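# For reference, tf.string_split itself returns a tf.SparseTensor; a minimal
# standalone sketch (TF 1.x graph mode assumed) of the tokenization step the
# ngrams test above builds on:
import tensorflow as tf

string_tensor = tf.constant(['One was Johnny', 'Two was a rat'])
tokens = tf.string_split(string_tensor, delimiter=' ')
with tf.Session() as sess:
    result = sess.run(tokens)
    print(result.indices)      # [[0 0] [0 1] [0 2] [1 0] [1 1] [1 2] [1 3]]
    print(result.values)       # [b'One' b'was' b'Johnny' b'Two' b'was' b'a' b'rat']
    print(result.dense_shape)  # [2 4]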
def full_onehot_process_line_as_2d_input(the_str, num_samples=-1):
with tf.name_scope("process_data_2d"):
#with tf.device("/cpu:0"):
# A tensor referenced when getting indices of characters for the the_values array
mapping_strings = tf.constant(
["0", "1", "K", "Q", "R", "B", "N", "P", "C", "k", "q", "r", "b", "n", "p", "c"])
number_of_mapping_strings = 16 # len(mapping_strings)
the_values = tf.constant(
[[1 if i == j else 0 for i in range(number_of_mapping_strings)] for j in range(number_of_mapping_strings)],
dtype=tf.float32)
# Create the table for getting indices (for the_values) from the information about the board
the_table = tf.contrib.lookup.index_table_from_tensor(mapping=mapping_strings, name="index_lookup_table")
data = tf.reshape(
# Get the values at the given indices
tf.gather(
the_values,
# Get an array of indices corresponding to the array of characters
the_table.lookup(
# Split the string into an array of characters
tf.string_split(
[the_str],
delimiter="").values)),
[num_samples, 64, number_of_mapping_strings]) #THIS SHOULD REALLY BE [3x8x8,num_mapping_strings]
return data
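# Usage sketch for the function above, assuming `the_str` is a single 64-character
# board string (so the default num_samples=-1 resolves to one sample). The lookup
# table created by index_table_from_tensor must be initialized before it is used.
board = '1' * 32 + 'P' * 8 + '0' * 16 + 'K' + 'k' + '0' * 6  # hypothetical 64-char board
data_op = full_onehot_process_line_as_2d_input(tf.constant(board))
with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    board_planes = sess.run(data_op)  # shape (1, 64, 16): one one-hot row per square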
def decode(self, data, items):
decoded_items = {}
# Split tokens
tokens = tf.string_split([data], delimiter=self.delimiter).values
# Optionally prepend a special token
if self.prepend_token is not None:
tokens = tf.concat([[self.prepend_token], tokens], 0)
# Optionally append a special token
if self.append_token is not None:
tokens = tf.concat([tokens, [self.append_token]], 0)
decoded_items[self.length_feature_name] = tf.size(tokens)
decoded_items[self.tokens_feature_name] = tokens
return [decoded_items[_] for _ in items]
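# A minimal sketch of what decode() produces for one record. SEQUENCE_START and
# SEQUENCE_END are hypothetical special tokens standing in for self.prepend_token
# and self.append_token, which the real decoder receives in its constructor.
import tensorflow as tf

data = tf.constant('hello world')
tokens = tf.string_split([data], delimiter=' ').values
tokens = tf.concat([['SEQUENCE_START'], tokens, ['SEQUENCE_END']], 0)
with tf.Session() as sess:
    print(sess.run(tokens))           # [b'SEQUENCE_START' b'hello' b'world' b'SEQUENCE_END']
    print(sess.run(tf.size(tokens)))  # 4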
def image_reading(path: str, resized_size: Tuple[int, int]=None, data_augmentation: bool=False,
padding: bool=False) -> Tuple[tf.Tensor, tf.Tensor]:
# Read image
image_content = tf.read_file(path, name='image_reader')
image = tf.cond(tf.equal(tf.string_split([path], '.').values[1], tf.constant('jpg', dtype=tf.string)),
true_fn=lambda: tf.image.decode_jpeg(image_content, channels=1, try_recover_truncated=True), # TODO channels = 3 ?
false_fn=lambda: tf.image.decode_png(image_content, channels=1), name='image_decoding')
# Data augmentation
if data_augmentation:
image = augment_data(image)
# Padding
if padding:
with tf.name_scope('padding'):
image, img_width = padding_inputs_width(image, resized_size, increment=CONST.DIMENSION_REDUCTION_W_POOLING)
# Resize
else:
image = tf.image.resize_images(image, size=resized_size)
img_width = tf.shape(image)[1]
with tf.control_dependencies([tf.assert_equal(image.shape[:2], resized_size)]):
return image, img_width
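# Note on the extension check above: tf.string_split([path], '.').values[1] is only
# the file extension when the path contains exactly one '.', e.g.:
import tensorflow as tf

path = tf.constant('images/sample.jpg')  # hypothetical path with a single dot
extension = tf.string_split([path], '.').values[1]
with tf.Session() as sess:
    print(sess.run(tf.equal(extension, 'jpg')))  # True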
def testTFIDFNoData(self):
def preprocessing_fn(inputs):
inputs_as_ints = tft.string_to_int(tf.string_split(inputs['a']))
out_index, out_values = tft.tfidf(inputs_as_ints, 6)
return {
'tf_idf': out_values,
'index': out_index
}
input_data = [{'a': ''}]
input_schema = dataset_metadata.DatasetMetadata({
'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
})
expected_transformed_data = [{'tf_idf': [], 'index': []}]
expected_transformed_schema = dataset_metadata.DatasetMetadata({
'tf_idf': sch.ColumnSchema(tf.float32, [None],
sch.ListColumnRepresentation()),
'index': sch.ColumnSchema(tf.int64, [None],
sch.ListColumnRepresentation())
})
self.assertAnalyzeAndTransformResults(
input_data, input_schema, preprocessing_fn, expected_transformed_data,
expected_transformed_schema)
def testUniquesAnalyzerWithTokenization(self):
def preprocessing_fn(inputs):
return {
'index': tft.string_to_int(tf.string_split(inputs['a']))
}
input_data = [{'a': 'hello hello world'}, {'a': 'hello goodbye world'}]
input_metadata = dataset_metadata.DatasetMetadata({
'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
})
expected_data = [{'index': [0, 0, 1]}, {'index': [0, 2, 1]}]
expected_metadata = dataset_metadata.DatasetMetadata({
'index': sch.ColumnSchema(
sch.IntDomain(tf.int64, -1, 2, True,
'vocab_string_to_int_uniques'),
[None], sch.ListColumnRepresentation())
})
self.assertAnalyzeAndTransformResults(
input_data, input_metadata, preprocessing_fn, expected_data,
expected_metadata)
def make_preprocessing_fn(frequency_threshold):
"""Creates a preprocessing function for reddit.
Args:
frequency_threshold: The frequency_threshold used when generating
vocabularies for categorical and text features.
Returns:
A preprocessing function.
"""
def preprocessing_fn(inputs):
"""User defined preprocessing function for reddit columns.
Args:
inputs: dictionary of input `tensorflow_transform.Column`.
Returns:
A dictionary of `tensorflow_transform.Column` representing the transformed
columns.
"""
# TODO(b/35001605) Make this "passthrough" more DRY.
result = {'score': inputs['score'], 'toplevel': inputs['toplevel']}
result['subreddit_id'] = tft.string_to_int(
inputs['subreddit'], frequency_threshold=frequency_threshold)
for name in ('author', 'comment_body', 'comment_parent_body'):
words = tf.string_split(inputs[name])
# TODO(b/33467613) Translate these to bag-of-words style sparse features.
result[name + '_bow'] = tft.string_to_int(
words, frequency_threshold=frequency_threshold)
return result
return preprocessing_fn
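# Minimal usage sketch: make_preprocessing_fn only builds a closure; vocabulary
# generation happens later when tf.Transform analyzes the dataset. The transformed
# output contains 'score', 'toplevel', 'subreddit_id', and one '<name>_bow' column
# per text feature ('author_bow', 'comment_body_bow', 'comment_parent_body_bow').
preprocessing_fn = make_preprocessing_fn(frequency_threshold=5)  # threshold value is arbitrary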
def __init__(self, config, batch_size, one_hot=False):
self.lookup = None
reader = tf.TextLineReader()
filename_queue = tf.train.string_input_producer(["chargan.txt"])
key, x = reader.read(filename_queue)
vocabulary = self.get_vocabulary()
table = tf.contrib.lookup.string_to_index_table_from_tensor(
mapping = vocabulary, default_value = 0)
x = tf.string_join([x, tf.constant(" " * 64)])
x = tf.substr(x, [0], [64])
x = tf.string_split(x,delimiter='')
x = tf.sparse_tensor_to_dense(x, default_value=' ')
x = tf.reshape(x, [64])
x = table.lookup(x)
self.one_hot = one_hot
if one_hot:
x = tf.one_hot(x, len(vocabulary))
x = tf.cast(x, dtype=tf.float32)
x = tf.reshape(x, [1, int(x.get_shape()[0]), int(x.get_shape()[1]), 1])
else:
x = tf.cast(x, dtype=tf.float32)
x -= len(vocabulary)/2.0
x /= len(vocabulary)/2.0
x = tf.reshape(x, [1,1, 64, 1])
num_preprocess_threads = 8
x = tf.train.shuffle_batch(
[x],
batch_size=batch_size,
num_threads=num_preprocess_threads,
capacity= 5000,
min_after_dequeue=500,
enqueue_many=True)
self.x = x
self.table = table
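# Usage sketch for the loader above (the class name and config argument are
# assumptions; "chargan.txt" must exist). In TF 1.x both the lookup table and the
# input queue need explicit initialization before batches can be pulled.
loader = CharTextInput(config=None, batch_size=32, one_hot=False)  # hypothetical class name
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    batch = sess.run(loader.x)  # shape (32, 1, 64, 1) when one_hot=False
    coord.request_stop()
    coord.join(threads)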
def testStringToTFIDFEmptyDoc(self):
def preprocessing_fn(inputs):
inputs_as_ints = tft.string_to_int(tf.string_split(inputs['a']))
out_index, out_values = tft.tfidf(inputs_as_ints, 6)
return {
'tf_idf': out_values,
'index': out_index
}
input_data = [{'a': 'hello hello world'},
{'a': ''},
{'a': 'hello goodbye hello world'},
{'a': 'I like pie pie pie'}]
input_schema = dataset_metadata.DatasetMetadata({
'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
})
log_5_over_2 = 1.91629073187
log_5_over_3 = 1.51082562376
expected_transformed_data = [{
'tf_idf': [(2/3)*log_5_over_3, (1/3)*log_5_over_3],
'index': [0, 2]
}, {
'tf_idf': [],
'index': []
}, {
'tf_idf': [(2/4)*log_5_over_3, (1/4)*log_5_over_3, (1/4)*log_5_over_2],
'index': [0, 2, 4]
}, {
'tf_idf': [(3/5)*log_5_over_2, (1/5)*log_5_over_2, (1/5)*log_5_over_2],
'index': [1, 3, 5]
}]
expected_transformed_schema = dataset_metadata.DatasetMetadata({
'tf_idf': sch.ColumnSchema(tf.float32, [None],
sch.ListColumnRepresentation()),
'index': sch.ColumnSchema(tf.int64, [None],
sch.ListColumnRepresentation())
})
self.assertAnalyzeAndTransformResults(
input_data, input_schema, preprocessing_fn, expected_transformed_data,
expected_transformed_schema)
def testUniquesAnalyzerWithHighFrequencyThresholdAndOOVBuckets(self):
def preprocessing_fn(inputs):
return {
'index1':
tft.string_to_int(
tf.string_split(inputs['a']),
default_value=-99,
top_k=1,
num_oov_buckets=3)
}
input_data = [
{'a': 'hello hello world world'},
{'a': 'hello tarkus toccata'},
{'a': 'hello goodbye foo'}
]
input_metadata = dataset_metadata.DatasetMetadata({
'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
})
# Generated vocab (ordered by frequency, then value) should be:
# ["hello", "world", "goodbye", "foo", "tarkus", "toccata"]. After applying
# top_k =1 this becomes ["hello"] plus three OOV buckets.
# The specific output values here depend on the hash of the words, and the
# test will break if the hash changes.
expected_data = [
{'index1': [0, 0, 2, 2]},
{'index1': [0, 3, 1]},
{'index1': [0, 2, 1]},
]
expected_metadata = dataset_metadata.DatasetMetadata({
'index1': sch.ColumnSchema(
sch.IntDomain(tf.int64, 0, 3, True,
'vocab_string_to_int_uniques'), [None],
sch.ListColumnRepresentation()),
})
self.assertAnalyzeAndTransformResults(
input_data, input_metadata, preprocessing_fn, expected_data,
expected_metadata)
def testNGramsEmpty(self):
output_tensor = mappers.ngrams(tf.string_split(tf.constant([''])),
(1, 5), '')
with tf.Session():
output = output_tensor.eval()
self.assertEqual((0, 2), output.indices.shape)
self.assertAllEqual([1, 0], output.dense_shape)
self.assertEqual(0, len(output.values))
def testNGrams(self):
string_tensor = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
tokenized_tensor = tf.string_split(string_tensor, delimiter='')
output_tensor = mappers.ngrams(
tokens=tokenized_tensor,
ngram_range=(1, 5),
separator='')
self.assertSparseOutput(
expected_indices=[
[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5],
[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5],
[2, 0], [2, 1], [2, 2], [2, 3], [2, 4], [2, 5], [2, 6], [2, 7],
[2, 8], [2, 9], [2, 10], [2, 11], [2, 12], [2, 13], [2, 14],
[2, 15], [2, 16], [2, 17], [2, 18], [2, 19], [2, 20], [2, 21],
[2, 22], [2, 23], [2, 24], [2, 25], [2, 26], [2, 27], [2, 28],
[2, 29], [3, 0]],
expected_values=[
'a', 'ab', 'abc', 'b', 'bc', 'c',
'd', 'de', 'def', 'e', 'ef', 'f',
'f', 'fg', 'fgh', 'fghi', 'fghij', 'g', 'gh', 'ghi', 'ghij',
'ghijk', 'h', 'hi', 'hij', 'hijk', 'hijkl', 'i', 'ij', 'ijk',
'ijkl', 'ijklm', 'j', 'jk', 'jkl', 'jklm', 'k', 'kl', 'klm', 'l',
'lm', 'm', 'z'],
expected_shape=[5, 30],
actual_sparse_tensor=output_tensor,
close_values=False)
def testNGramsBadSizes(self):
string_tensor = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
tokenized_tensor = tf.string_split(string_tensor, delimiter='')
with self.assertRaisesRegexp(ValueError, 'Invalid ngram_range'):
mappers.ngrams(tokenized_tensor, (0, 5), separator='')
with self.assertRaisesRegexp(ValueError, 'Invalid ngram_range'):
mappers.ngrams(tokenized_tensor, (6, 5), separator='')
def get_inference_input(inputs, params):
dataset = tf.data.Dataset.from_tensor_slices(
tf.constant(inputs)
)
# Split string
dataset = dataset.map(lambda x: tf.string_split([x]).values,
num_parallel_calls=params.num_threads)
# Append <eos>
dataset = dataset.map(
lambda x: tf.concat([x, [tf.constant(params.eos)]], axis=0),
num_parallel_calls=params.num_threads
)
# Convert tuple to dictionary
dataset = dataset.map(
lambda x: {"source": x, "source_length": tf.shape(x)[0]},
num_parallel_calls=params.num_threads
)
dataset = dataset.padded_batch(
params.decode_batch_size,
{"source": [tf.Dimension(None)], "source_length": []},
{"source": params.pad, "source_length": 0}
)
iterator = dataset.make_one_shot_iterator()
features = iterator.get_next()
src_table = tf.contrib.lookup.index_table_from_tensor(
tf.constant(params.vocabulary["source"]),
default_value=params.mapping["source"][params.unk]
)
features["source"] = src_table.lookup(features["source"])
return features
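# Usage sketch with a hypothetical params namespace; the real params object comes
# from the surrounding framework, so every field below is an assumption.
import collections

Params = collections.namedtuple(
    'Params', ['num_threads', 'decode_batch_size', 'eos', 'pad', 'unk',
               'vocabulary', 'mapping'])
params = Params(
    num_threads=4, decode_batch_size=2, eos='<eos>', pad='<pad>', unk='<unk>',
    vocabulary={'source': ['<pad>', '<eos>', '<unk>', 'hello', 'world']},
    mapping={'source': {'<pad>': 0, '<eos>': 1, '<unk>': 2, 'hello': 3, 'world': 4}})
features = get_inference_input(['hello world', 'hello'], params)
with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(features))  # {'source': padded id matrix, 'source_length': [3, 2]}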
def get_infer_iterator(
src_dataset, src_vocab_table, batch_size,
source_reverse, eos, src_max_len=None):
src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32)
src_dataset = src_dataset.map(lambda src: tf.string_split([src]).values)
if src_max_len:
src_dataset = src_dataset.map(lambda src: src[:src_max_len])
# Convert the word strings to ids
src_dataset = src_dataset.map(
lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32))
if source_reverse:
src_dataset = src_dataset.map(lambda src: tf.reverse(src, axis=[0]))
# Add in the word counts.
src_dataset = src_dataset.map(lambda src: (src, tf.size(src)))
def batching_func(x):
return x.padded_batch(
batch_size,
# The entry is the source line rows;
# this has unknown-length vectors. The last entry is
# the source row size; this is a scalar.
padded_shapes=(tf.TensorShape([None]), # src
tf.TensorShape([])), # src_len
# Pad the source sequences with eos tokens.
# (Though notice we don't generally need to do this since
# later on we will be masking out calculations past the true sequence.
padding_values=(src_eos_id, # src
0)) # src_len -- unused
batched_dataset = batching_func(src_dataset)
batched_iter = batched_dataset.make_initializable_iterator()
(src_ids, src_seq_len) = batched_iter.get_next()
return BatchedInput(
initializer=batched_iter.initializer,
source=src_ids,
target_input=None,
target_output=None,
source_sequence_length=src_seq_len,
target_sequence_length=None)
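# Usage sketch (file names are hypothetical; BatchedInput is assumed to be the
# namedtuple defined alongside this function, as in the TF NMT tutorial).
src_vocab_table = tf.contrib.lookup.index_table_from_file(
    'vocab.src.txt', default_value=0)  # one token per line
src_dataset = tf.data.TextLineDataset('infer.src.txt')
batch = get_infer_iterator(src_dataset, src_vocab_table, batch_size=32,
                           source_reverse=False, eos='</s>', src_max_len=50)
with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    sess.run(batch.initializer)
    src_ids, src_len = sess.run([batch.source, batch.source_sequence_length])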
def get_input_fn(batch_size, num_epochs, context_filename, answer_filename, max_sequence_len):
def input_fn():
source_dataset = tf.contrib.data.TextLineDataset(context_filename)
target_dataset = tf.contrib.data.TextLineDataset(answer_filename)
def map_dataset(dataset):
dataset = dataset.map(lambda string: tf.string_split([string]).values)
dataset = dataset.map(lambda token: tf.string_to_number(token, tf.int64))
dataset = dataset.map(lambda tokens: (tokens, tf.size(tokens)))
dataset = dataset.map(lambda tokens, size: (tokens[:max_sequence_len], tf.minimum(size, max_sequence_len)))
return dataset
source_dataset = map_dataset(source_dataset)
target_dataset = map_dataset(target_dataset)
dataset = tf.contrib.data.Dataset.zip((source_dataset, target_dataset))
dataset = dataset.repeat(num_epochs)
dataset = dataset.padded_batch(batch_size,
padded_shapes=((tf.TensorShape([max_sequence_len]), tf.TensorShape([])),
(tf.TensorShape([max_sequence_len]), tf.TensorShape([]))
))
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()
return next_element, None
return input_fn
def get_test_iterator(src_dataset, src_vocab_table, batch_size, config):
src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(config.eos)), tf.int32)
src_dataset = src_dataset.map(lambda src: tf.string_split([src]).values)
src_dataset = src_dataset.map(lambda src: src[:config.src_max_len])
src_dataset = src_dataset.map(
lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32))
if config.reverse_src:
src_dataset = src_dataset.map(lambda src: tf.reverse(src, axis=[0]))
src_dataset = src_dataset.map(lambda src: (src, tf.size(src)))
def batching_func(x):
return x.padded_batch(
config.batch_size,
padded_shapes=(tf.TensorShape([None]),
tf.TensorShape([])),
padding_values=(src_eos_id,
0))
batched_dataset = batching_func(src_dataset)
batched_iter = batched_dataset.make_initializable_iterator()
src_ids, src_seq_len = batched_iter.get_next()
return BatchedInput(
initializer=batched_iter.initializer,
source=src_ids,
target_input=None,
target_output=None,
source_sequence_length=src_seq_len,
target_sequence_length=None)
def _read_id_file(path) -> Dataset:
def _parse_line(line):
splits = tf.string_split(tf.reshape(line, (-1,))).values
return tf.string_to_number(splits, out_type=tf.int32)
return TextLineDataset(path) \
.filter(lambda line: tf.size(line) > 0) \
.map(_parse_line)
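# Example: each non-empty line of the id file, e.g. "1 2 3", becomes the int32
# vector [1, 2, 3]. The file name below is hypothetical.
ids = _read_id_file('train.ids.txt').make_one_shot_iterator().get_next()
with tf.Session() as sess:
    print(sess.run(ids))  # e.g. array([1, 2, 3], dtype=int32)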
def testStringToTFIDF(self):
def preprocessing_fn(inputs):
inputs_as_ints = tft.string_to_int(tf.string_split(inputs['a']))
out_index, out_values = tft.tfidf(inputs_as_ints, 6)
return {
'tf_idf': out_values,
'index': out_index
}
input_data = [{'a': 'hello hello world'},
{'a': 'hello goodbye hello world'},
{'a': 'I like pie pie pie'}]
input_schema = dataset_metadata.DatasetMetadata({
'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
})
# IDFs (the expected constants below include a +1 smoothing term)
# hello = 1 + log(4/3) = 1.28768
# world = 1 + log(4/3)
# goodbye = 1 + log(4/2) = 1.69314
# I = 1 + log(4/2)
# like = 1 + log(4/2)
# pie = 1 + log(4/2)
log_4_over_2 = 1.69314718056
log_4_over_3 = 1.28768207245
expected_transformed_data = [{
'tf_idf': [(2/3)*log_4_over_3, (1/3)*log_4_over_3],
'index': [0, 2]
}, {
'tf_idf': [(2/4)*log_4_over_3, (1/4)*log_4_over_3, (1/4)*log_4_over_2],
'index': [0, 2, 4]
}, {
'tf_idf': [(3/5)*log_4_over_2, (1/5)*log_4_over_2, (1/5)*log_4_over_2],
'index': [1, 3, 5]
}]
expected_transformed_schema = dataset_metadata.DatasetMetadata({
'tf_idf': sch.ColumnSchema(tf.float32, [None],
sch.ListColumnRepresentation()),
'index': sch.ColumnSchema(tf.int64, [None],
sch.ListColumnRepresentation())
})
self.assertAnalyzeAndTransformResults(
input_data, input_schema, preprocessing_fn, expected_transformed_data,
expected_transformed_schema)
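# The expected values above are consistent with a smoothed idf of the form
# 1 + log((N + 1) / (n + 1)), where N is the number of documents and n the number
# of documents containing the term -- inferred from the constants, not from the
# tft source. A quick check:
import math

N = 3  # three input documents
idf = lambda n: 1 + math.log((N + 1.0) / (n + 1))
print(idf(2))  # hello/world appear in 2 docs -> 1.28768... (log_4_over_3)
print(idf(1))  # goodbye/I/like/pie appear in 1 doc -> 1.69314... (log_4_over_2)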
def testTFIDFWithOOV(self):
test_vocab_size = 3
def preprocessing_fn(inputs):
inputs_as_ints = tft.string_to_int(tf.string_split(inputs['a']),
top_k=test_vocab_size)
out_index, out_values = tft.tfidf(inputs_as_ints,
test_vocab_size+1)
return {
'tf_idf': out_values,
'index': out_index
}
input_data = [{'a': 'hello hello world'},
{'a': 'hello goodbye hello world'},
{'a': 'I like pie pie pie'}]
input_schema = dataset_metadata.DatasetMetadata({
'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
})
# IDFs (the expected constants below include a +1 smoothing term)
# hello = 1 + log(4/3) = 1.28768
# pie = 1 + log(4/2) = 1.69314
# world = 1 + log(4/3)
# OOV - goodbye, I, like (mapped to a single OOV bucket present in 2 docs) = 1 + log(4/3)
log_4_over_2 = 1.69314718056
log_4_over_3 = 1.28768207245
expected_transformed_data = [{
'tf_idf': [(2/3)*log_4_over_3, (1/3)*log_4_over_3],
'index': [0, 2]
}, {
'tf_idf': [(2/4)*log_4_over_3, (1/4)*log_4_over_3, (1/4)*log_4_over_3],
'index': [0, 2, 3]
}, {
'tf_idf': [(3/5)*log_4_over_2, (2/5)*log_4_over_3],
'index': [1, 3]
}]
expected_transformed_schema = dataset_metadata.DatasetMetadata({
'tf_idf': sch.ColumnSchema(tf.float32, [None],
sch.ListColumnRepresentation()),
'index': sch.ColumnSchema(tf.int64, [None],
sch.ListColumnRepresentation())
})
self.assertAnalyzeAndTransformResults(
input_data, input_schema, preprocessing_fn, expected_transformed_data,
expected_transformed_schema)
def testUniquesAnalyzerWithFrequencyThreshold(self):
def preprocessing_fn(inputs):
return {
'index1': tft.string_to_int(tf.string_split(inputs['a']),
default_value=-99, frequency_threshold=2),
# As above but using a string for frequency_threshold (and changing
# the default_value to showcase things).
'index2': tft.string_to_int(tf.string_split(inputs['a']),
default_value=-9, frequency_threshold='2')
}
input_data = [
{'a': 'hello hello world'},
{'a': 'hello goodbye world'},
{'a': 'hello goodbye foo'}
]
input_metadata = dataset_metadata.DatasetMetadata({
'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
})
# Generated vocab (ordered by frequency, then value) should be:
# ["hello", "world", "goodbye", "foo"]. After applying frequency_threshold=2
# this becomes
# ["hello", "world", "goodbye"].
expected_data = [
{'index1': [0, 0, 1], 'index2': [0, 0, 1]},
{'index1': [0, 2, 1], 'index2': [0, 2, 1]},
{'index1': [0, 2, -99], 'index2': [0, 2, -9]}
]
expected_metadata = dataset_metadata.DatasetMetadata({
'index1': sch.ColumnSchema(
sch.IntDomain(tf.int64, -99, 2, True,
'vocab_string_to_int_uniques'),
[None], sch.ListColumnRepresentation()),
'index2': sch.ColumnSchema(
sch.IntDomain(tf.int64, -9, 2, True,
'vocab_string_to_int_1_uniques'),
[None], sch.ListColumnRepresentation())
})
self.assertAnalyzeAndTransformResults(
input_data, input_metadata, preprocessing_fn, expected_data,
expected_metadata)
def testUniquesAnalyzerWithFrequencyThresholdTooHigh(self):
# Expected to return an empty dict due to too high threshold.
def preprocessing_fn(inputs):
return {
'index1':
tft.string_to_int(
tf.string_split(inputs['a']),
default_value=-99,
frequency_threshold=77),
# As above but using a string for frequency_threshold (and changing
# the default_value to showcase things).
'index2':
tft.string_to_int(
tf.string_split(inputs['a']),
default_value=-9,
frequency_threshold='77')
}
input_data = [
{'a': 'hello hello world'},
{'a': 'hello goodbye world'},
{'a': 'hello goodbye foo'}
]
input_metadata = dataset_metadata.DatasetMetadata({
'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
})
# Generated vocab (ordered by frequency, then value) should be:
# ["hello", "world", "goodbye", "foo"]. After applying frequency_threshold=2
# this becomes empty.
expected_data = [
{'index1': [-99, -99, -99], 'index2': [-9, -9, -9]},
{'index1': [-99, -99, -99], 'index2': [-9, -9, -9]},
{'index1': [-99, -99, -99], 'index2': [-9, -9, -9]}
]
# Note the vocabs are empty but the tables have size 1 so max_value is 1.
expected_metadata = dataset_metadata.DatasetMetadata({
'index1': sch.ColumnSchema(
sch.IntDomain(tf.int64, -99, 0, True,
'vocab_string_to_int_uniques'),
[None], sch.ListColumnRepresentation()),
'index2': sch.ColumnSchema(
sch.IntDomain(tf.int64, -9, 0, True,
'vocab_string_to_int_1_uniques'),
[None], sch.ListColumnRepresentation())
})
self.assertAnalyzeAndTransformResults(
input_data, input_metadata, preprocessing_fn, expected_data,
expected_metadata)
def get_evaluation_input(inputs, params):
with tf.device("/cpu:0"):
# Create datasets
datasets = []
for data in inputs:
dataset = tf.data.Dataset.from_tensor_slices(data)
# Split string
dataset = dataset.map(lambda x: tf.string_split([x]).values,
num_parallel_calls=params.num_threads)
# Append <eos>
dataset = dataset.map(
lambda x: tf.concat([x, [tf.constant(params.eos)]], axis=0),
num_parallel_calls=params.num_threads
)
datasets.append(dataset)
dataset = tf.data.Dataset.zip(tuple(datasets))
# Convert tuple to dictionary
dataset = dataset.map(
lambda *x: {
"source": x[0],
"source_length": tf.shape(x[0])[0],
"references": x[1:]
},
num_parallel_calls=params.num_threads
)
dataset = dataset.padded_batch(
params.eval_batch_size,
{
"source": [tf.Dimension(None)],
"source_length": [],
"references": (tf.Dimension(None),) * (len(inputs) - 1)
},
{
"source": params.pad,
"source_length": 0,
"references": (params.pad,) * (len(inputs) - 1)
}
)
iterator = dataset.make_one_shot_iterator()
features = iterator.get_next()
src_table = tf.contrib.lookup.index_table_from_tensor(
tf.constant(params.vocabulary["source"]),
default_value=params.mapping["source"][params.unk]
)
tgt_table = tf.contrib.lookup.index_table_from_tensor(
tf.constant(params.vocabulary["target"]),
default_value=params.mapping["target"][params.unk]
)
features["source"] = src_table.lookup(features["source"])
features["references"] = tuple(
tgt_table.lookup(item) for item in features["references"]
)
return features
def read_images(data_dir):
pattern = os.path.join(data_dir, '*.png')
filenames = tf.train.match_filenames_once(pattern, name='list_files')
queue = tf.train.string_input_producer(
filenames,
num_epochs=NUM_EPOCHS,
shuffle=True,
name='queue')
reader = tf.WholeFileReader()
filename, content = reader.read(queue, name='read_image')
filename = tf.Print(
filename,
data=[filename],
message='loading: ')
filename_split = tf.string_split([filename], delimiter='/')
label_id = tf.string_to_number(tf.substr(filename_split.values[1],
0, 1), out_type=tf.int32)
label = tf.one_hot(
label_id-1,
5,
on_value=1.0,
off_value=0.0,
dtype=tf.float32)
img_tensor = tf.image.decode_png(
content,
dtype=tf.uint8,
channels=3,
name='img_decode')
# Preprocess the image, Performs random transformations
# Random flip
img_tensor_flip = tf.image.random_flip_left_right(img_tensor)
# Random brightness
img_tensor_bri = tf.image.random_brightness(img_tensor_flip,
max_delta=0.2)
# Per-image scaling
img_tensor_std = tf.image.per_image_standardization(img_tensor_bri)
min_after_dequeue = 1000
capacity = min_after_dequeue + 3 * BATCH_SIZE
example_batch, label_batch = tf.train.shuffle_batch(
[img_tensor_std, label],
batch_size=BATCH_SIZE,
shapes=[(IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS), (NUM_CLASS)],
capacity=capacity,
min_after_dequeue=min_after_dequeue,
name='train_shuffle')
return example_batch, label_batch
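# Usage sketch for read_images. match_filenames_once and string_input_producer
# (with num_epochs) create local variables, and the readers run on queue runner
# threads, so all of the following setup is required (the directory is hypothetical).
images_op, labels_op = read_images('data/train')
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    images, labels = sess.run([images_op, labels_op])  # (BATCH_SIZE, H, W, 3), (BATCH_SIZE, 5)
    coord.request_stop()
    coord.join(threads)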
# `images` is a 4-D tensor with the shape:
# [n_batch, img_height, img_width, n_channel]
def get_infer_iterator(src_dataset,
src_vocab_table,
batch_size,
eos,
src_max_len=None):
src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32)
src_dataset = src_dataset.map(lambda src: tf.string_split([src]).values)
if src_max_len:
src_dataset = src_dataset.map(lambda src: src[:src_max_len])
# Convert the word strings to ids
src_dataset = src_dataset.map(
lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32))
# Add in the word counts.
src_dataset = src_dataset.map(lambda src: (src, tf.size(src)))
def batching_func(x):
return x.padded_batch(
batch_size,
# The entry is the source line rows;
# this has unknown-length vectors. The last entry is
# the source row size; this is a scalar.
padded_shapes=(
tf.TensorShape([None]), # src
tf.TensorShape([])), # src_len
# Pad the source sequences with eos tokens.
# (Though notice we don't generally need to do this since
# later on we will be masking out calculations past the true sequence.
padding_values=(
src_eos_id, # src
0)) # src_len -- unused
batched_dataset = batching_func(src_dataset)
batched_iter = batched_dataset.make_initializable_iterator()
(src_ids, src_seq_len) = batched_iter.get_next()
return BatchedInput(
initializer=batched_iter.initializer,
source=src_ids,
target_input=None,
target_output=None,
source_sequence_length=src_seq_len,
target_sequence_length=None)
def process_line_as_2d_input_with_ep(the_str):
"""
NOTES:
1) I likely won't be using this, opting to instead use the onehot implementation
"""
with tf.name_scope("process_data_2d"):
# with tf.device("/cpu:0"):
# A tensor referenced when getting indices of characters for the the_values array
mapping_strings = tf.constant(["0", "1", "K", "Q", "R", "B", "N", "P", "C", "k", "q", "r", "b", "n", "p", "c"])
the_values = tf.constant(
[[0, 0, 0, 0, 0, 0, 0, 0], # 0
[0, 0, 0, 0, 0, 0, 1, 0], # 1
[1, 0, 0, 0, 0, 0, 0, 0], # K
[0, 1, 0, 0, 0, 0, 0, 0], # Q
[0, 0, 1, 0, 0, 0, 0, 0], # R
[0, 0, 0, 1, 0, 0, 0, 0], # B
[0, 0, 0, 0, 1, 0, 0, 0], # N
[0, 0, 0, 0, 0, 1, 0, 0], # P
[0, 0, 0, 0, 0, 0, 0, 1], # C
[-1, 0, 0, 0, 0, 0, 0, 0], # k
[0, -1, 0, 0, 0, 0, 0, 0], # q
[0, 0, -1, 0, 0, 0, 0, 0], # r
[0, 0, 0, -1, 0, 0, 0, 0], # b
[0, 0, 0, 0, -1, 0, 0, 0], # n
[0, 0, 0, 0, 0, -1, 0, 0], # p
[0, 0, 0, 0, 0, 0, 0, -1], # c
], dtype=tf.float32)
# Create the table for getting indices (for the_values) from the information about the board
the_table = tf.contrib.lookup.index_table_from_tensor(mapping=mapping_strings, name="index_lookup_table")
data = tf.reshape(
# Get the values at the given indices
tf.gather(
the_values,
# Get an array of indices corresponding to the array of characters
the_table.lookup(
# Split the string into an array of characters
tf.string_split(
[the_str],
delimiter="").values)),
[3, 64, 8])
return data