def _to_tfidf(term_frequency, reduced_term_freq, corpus_size, smooth):
    """Weights term frequencies by their inverse document frequency (tf-idf).

    Args:
        term_frequency: The `SparseTensor` output of _to_term_frequency.
        reduced_term_freq: A `Tensor` of shape (vocabSize,) that represents
            the count of the number of documents with each term.
        corpus_size: A scalar count of the number of documents in the corpus.
        smooth: A bool indicating if the idf value should be smoothed. See
            tfidf_weights documentation for details.

    Returns:
        A `SparseTensor` with
        indices=<doc_index_in_batch>, <term_index_in_vocab>,
        values=term frequency * inverse document frequency,
        and shape=(batch, vocab_size)
    """
    num_docs = tf.to_double(corpus_size)
    doc_freq = tf.to_double(reduced_term_freq)

    # The idf tensor has one entry per vocabulary term.
    if smooth:
        # Adding 1 to both counts keeps the ratio finite when a term has a
        # document count of zero.
        idf = tf.log((num_docs + 1.0) / (1.0 + doc_freq)) + 1
    else:
        idf = tf.log(num_docs / doc_freq) + 1

    # Flatten to rank 1 rather than squeezing: tf.squeeze would collapse a
    # single-term vocabulary to a scalar, which tf.gather cannot index.
    flat_idf = tf.reshape(idf, [-1])

    # Column 1 of the sparse indices is the term's position in the vocabulary.
    term_ids = term_frequency.indices[:, 1]
    gathered_idfs = tf.gather(flat_idf, term_ids)

    tf_values = tf.to_float(term_frequency.values)
    tfidf_values = tf_values * tf.to_float(gathered_idfs)
    return tf.SparseTensor(
        indices=term_frequency.indices,
        values=tfidf_values,
        dense_shape=term_frequency.dense_shape)
# Comment list
# Table of contents