def testUniquesAnalyzerWithHighFrequencyThresholdAndOOVBuckets(self):
  """string_to_int with top_k=1 keeps one word; the rest go to 3 OOV buckets."""

  def preprocessing_fn(inputs):
    # Tokenize each sentence, then map every token to a vocabulary id.
    tokens = tf.string_split(inputs['a'])
    return {
        'index1': tft.string_to_int(
            tokens, default_value=-99, top_k=1, num_oov_buckets=3),
    }

  input_data = [{'a': sentence} for sentence in (
      'hello hello world world',
      'hello tarkus toccata',
      'hello goodbye foo',
  )]
  input_metadata = dataset_metadata.DatasetMetadata({
      'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation()),
  })
  # The full generated vocab (ordered by frequency, then value) should be:
  # ["hello", "world", "goodbye", "foo", "tarkus", "toccata"]. Applying
  # top_k=1 leaves only ["hello"]; every other token is assigned to one of
  # the three OOV buckets. The specific OOV ids below depend on the hash of
  # each word, so this test will break if the hashing changes.
  expected_data = [
      {'index1': [0, 0, 2, 2]},
      {'index1': [0, 3, 1]},
      {'index1': [0, 2, 1]},
  ]
  expected_metadata = dataset_metadata.DatasetMetadata({
      'index1': sch.ColumnSchema(
          sch.IntDomain(tf.int64, 0, 3, True, 'vocab_string_to_int_uniques'),
          [None], sch.ListColumnRepresentation()),
  })
  self.assertAnalyzeAndTransformResults(
      input_data, input_metadata, preprocessing_fn, expected_data,
      expected_metadata)