def make_preprocessing_fn(frequency_threshold):
    """Builds a preprocessing function closed over a vocabulary threshold.

    NOTE(review): earlier documentation described this as preprocessing for
    "criteo", but the columns read below (`subreddit`, `author`,
    `comment_body`, ...) do not look like criteo columns -- confirm which
    dataset this belongs to.

    Args:
      frequency_threshold: Forwarded as `frequency_threshold` to every
        `tft.string_to_int` call made by the returned function, i.e. when
        generating vocabularies for the categorical and text features.

    Returns:
      A preprocessing function taking a dictionary of input columns and
      returning a dictionary of transformed columns.
    """

    def preprocessing_fn(inputs):
        """Transforms the input columns into model-ready columns.

        Reads the keys `score`, `toplevel`, `subreddit`, `author`,
        `comment_body` and `comment_parent_body` from `inputs`.

        Args:
          inputs: dictionary of input `tensorflow_transform.Column`.

        Returns:
          A dictionary of `tensorflow_transform.Column` with the keys
          `score`, `toplevel`, `subreddit_id`, `author_bow`,
          `comment_body_bow` and `comment_parent_body_bow`.
        """
        # TODO(b/35001605) Make this "passthrough" more DRY.
        # `score` and `toplevel` are passed through as-is; `subreddit` is
        # mapped to an integer id through a generated vocabulary.
        dense_columns = {
            'score': inputs['score'],
            'toplevel': inputs['toplevel'],
            'subreddit_id': tft.string_to_int(
                inputs['subreddit'],
                frequency_threshold=frequency_threshold),
        }

        # TODO(b/35318962): Obviate the need for this workaround on Dense
        # features.
        # FeatureColumns expect shape (batch_size, 1), not just (batch_size),
        # so every dense column gains a trailing axis. Only the columns
        # collected so far are dense; everything added afterwards is sparse
        # and must not go through this step.
        transformed = {}
        for key, column in dense_columns.items():
            transformed[key] = tft.map(
                lambda x: tf.expand_dims(x, -1), column)

        text_features = ('author', 'comment_body', 'comment_parent_body')
        for feature in text_features:
            # Split each string into tokens, then map the tokens to integer
            # ids with the same frequency threshold as above.
            tokens = tft.map(tf.string_split, inputs[feature])
            # TODO(b/33467613) Translate these to bag-of-words style sparse
            # features.
            transformed[feature + '_bow'] = tft.string_to_int(
                tokens, frequency_threshold=frequency_threshold)

        return transformed

    return preprocessing_fn
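# Web-page navigation residue from the page this snippet was copied from
# (not part of the module): "Comment list", "Table of contents".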