def bucketize(x, num_buckets, epsilon=None, name=None):
  """Returns a bucketized column, with a bucket index assigned to each input.

  Args:
    x: A numeric input `Tensor` whose values should be mapped to buckets.
    num_buckets: Values in the input `x` are divided into approximately
      equal-sized buckets, where the number of buckets is `num_buckets`.
    epsilon: (Optional) Error tolerance, typically a small fraction close to
      zero. If a value is not specified by the caller, a suitable value is
      computed based on experimental results. For `num_buckets` less than 100,
      the value of 0.01 is chosen to handle a dataset of up to ~1 trillion
      input data values. If `num_buckets` is larger, then epsilon is set to
      (1/`num_buckets`) to enforce a stricter error tolerance, because more
      buckets will result in a smaller range for each bucket, and so we want
      the boundaries to be less fuzzy. See analyzers.quantiles() for details.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` of the same shape as `x`, with each element in the
    returned tensor representing the bucketized value. Bucketized value is
    in the range [0, num_buckets).

  Raises:
    TypeError: If `num_buckets` is not an int.
    ValueError: If `num_buckets` is less than 1.
  """
  with tf.name_scope(name, 'bucketize'):
    if not isinstance(num_buckets, int):
      # Fix: apply %-formatting; previously the format arg was passed as a
      # second exception-constructor argument, so the message never rendered.
      raise TypeError('num_buckets must be an int, got %s' % type(num_buckets))
    if num_buckets < 1:
      raise ValueError('Invalid num_buckets %d' % num_buckets)
    if epsilon is None:
      # See explanation in args documentation for epsilon: 0.01 suffices for
      # <100 buckets; larger bucket counts need proportionally tighter error.
      epsilon = min(1.0 / num_buckets, 0.01)
    bucket_boundaries = analyzers.quantiles(x, num_buckets, epsilon)
    buckets = quantile_ops.bucketize_with_input_boundaries(
        x,
        boundaries=bucket_boundaries,
        name='assign_buckets')
    # Convert to int64 because int32 is not compatible with tf.Example parser.
    # See _TF_EXAMPLE_ALLOWED_TYPES in FixedColumnRepresentation()
    # in tf_metadata/dataset_schema.py
    return tf.to_int64(buckets)