def tfidf(x, vocab_size, smooth=True, name=None):
    """Maps the terms in x to their term frequency * inverse document frequency.

    The inverse document frequency of a term is calculated as
    1 + log((corpus size + 1) / (document frequency of term + 1)) by default.

    Example usage:
      example strings: [["I", "like", "pie", "pie", "pie"], ["yum", "yum", "pie"]]
      in: SparseTensor(indices=[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
                                [1, 0], [1, 1], [1, 2]],
                       values=[1, 2, 0, 0, 0, 3, 3, 0])
      out: SparseTensor(indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
                        values=[1, 2, 0, 3, 0])
           SparseTensor(indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
                        values=[(1/5)*(log(3/2)+1), (1/5)*(log(3/2)+1), (3/5),
                                (2/3)*(log(3/2)+1), (1/3)])

    NOTE that the first doc's duplicate "pie" strings have been combined to
    one output, as have the second doc's duplicate "yum" strings.

    Args:
      x: A `SparseTensor` representing int64 values (most likely that are the
        result of calling string_to_int on a tokenized string).
      vocab_size: An int - the count of vocab used to turn the string into
        int64s including any OOV buckets.
      smooth: A bool indicating if the inverse document frequency should be
        smoothed. If True, which is the default, then the idf is calculated as
        1 + log((corpus size + 1) / (document frequency of term + 1)).
        Otherwise, the idf is
        1 + log((corpus size) / (document frequency of term)), which could
        result in a division by zero error.
      name: (Optional) A name for this operation.

    Returns:
      Two `SparseTensor`s with indices [index_in_batch, index_in_bag_of_words].
      The first has values vocab_index, which is taken from input `x`.
      The second has values tfidf_weight.
    """

    def _to_vocab_range(x):
        """Enforces that the vocab_ids in x are positive."""
        # tf.mod folds any out-of-range / negative id into [0, vocab_size).
        return tf.SparseTensor(
            indices=x.indices,
            values=tf.mod(x.values, vocab_size),
            dense_shape=x.dense_shape)

    with tf.name_scope(name, 'tfidf'):
        cleaned_input = _to_vocab_range(x)
        # Per-document term frequencies, [batch, vocab].
        term_frequencies = _to_term_frequency(cleaned_input, vocab_size)
        # Document frequency of each term, used for the idf denominator.
        count_docs_with_term_column = _count_docs_with_term(term_frequencies)
        # Expand dims to get around the min_tensor_rank checks.
        # NOTE(review): assumes tf.shape(cleaned_input)[0] yields the batch
        # (corpus) size for this SparseTensor — confirm against callers.
        sizes = tf.expand_dims(tf.shape(cleaned_input)[0], 0)
        # [batch, vocab] - tfidf
        tfidfs = _to_tfidf(term_frequencies,
                           analyzers.sum(count_docs_with_term_column,
                                         reduce_instance_dims=False),
                           analyzers.sum(sizes),
                           smooth)
        return _split_tfidfs_to_outputs(tfidfs)