import tensorflow as tf


def _to_term_frequency(x, vocab_size):
"""Creates a SparseTensor of term frequency for every doc/term pair.
Args:
x : a SparseTensor of int64 representing string indices in vocab.
vocab_size: An int - the count of vocab used to turn the string into int64s
including any OOV buckets.
Returns:
a SparseTensor with the count of times a term appears in a document at
indices <doc_index_in_batch>, <term_index_in_vocab>,
with size (num_docs_in_batch, vocab_size).
"""
  # Construct an intermediary sparse tensor with indices
  # [<doc>, <term_index_in_doc>, <vocab_id>] and tf.ones values.
  split_indices = tf.to_int64(
      tf.split(x.indices, axis=1, num_or_size_splits=2))
  expanded_values = tf.to_int64(tf.expand_dims(x.values, 1))
  next_index = tf.concat(
      [split_indices[0], split_indices[1], expanded_values], axis=1)
  next_values = tf.ones_like(x.values)
  vocab_size_as_tensor = tf.constant([vocab_size], dtype=tf.int64)
  next_shape = tf.concat(
      [x.dense_shape, vocab_size_as_tensor], 0)
  next_tensor = tf.SparseTensor(
      indices=tf.to_int64(next_index),
      values=next_values,
      dense_shape=next_shape)
  # Take the intermediary tensor and reduce over the term_index_in_doc
  # dimension. This produces a tensor with indices [<doc_id>, <term_id>],
  # values [count_of_term_in_doc] and shape batch x vocab_size.
  term_count_per_doc = tf.sparse_reduce_sum_sparse(next_tensor, 1)
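  # Count the total number of terms in each document by summing a ones-valued
  # copy of x over its term dimension, yielding a dense vector of doc sizes.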
  dense_doc_sizes = tf.to_double(tf.sparse_reduce_sum(tf.SparseTensor(
      indices=x.indices,
      values=tf.ones_like(x.values),
      dense_shape=x.dense_shape), 1))
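  # For each (doc, term) entry, gather the size of the document it belongs to.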
  gather_indices = term_count_per_doc.indices[:, 0]
  gathered_doc_sizes = tf.gather(dense_doc_sizes, gather_indices)
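  # Divide each term count by its document's size to get the term frequency.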
  term_frequency = (tf.to_double(term_count_per_doc.values) /
                    tf.to_double(gathered_doc_sizes))
  return tf.SparseTensor(
      indices=term_count_per_doc.indices,
      values=term_frequency,
      dense_shape=term_count_per_doc.dense_shape)
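

# A minimal usage sketch (not part of the library code), assuming TF 1.x graph
# mode, since tf.to_int64 and tf.sparse_reduce_sum_sparse above are TF 1.x APIs.
# Two documents: doc 0 contains vocab ids [0, 0, 1]; doc 1 contains [2].
if __name__ == '__main__':
  example_docs = tf.SparseTensor(
      indices=[[0, 0], [0, 1], [0, 2], [1, 0]],
      values=tf.constant([0, 0, 1, 2], dtype=tf.int64),
      dense_shape=[2, 3])
  term_freq = _to_term_frequency(example_docs, vocab_size=4)
  with tf.Session() as sess:
    # Expected values: doc 0 -> term 0: 2/3, term 1: 1/3; doc 1 -> term 2: 1.0,
    # in a SparseTensor of dense_shape (2, 4).
    print(sess.run(term_freq))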