def __init__(self, corpus=None, id2word=None, dictionary=None,
             wlocal=utils.identity, wglobal=df2idf, normalize=True):
    """
    Compute tf-idf by multiplying a local component (term frequency) with a
    global component (inverse document frequency), and normalizing
    the resulting documents to unit length.

    Formula for the unnormalized weight of term `i` in document `j` in a
    corpus of D documents::

        weight_{i,j} = frequency_{i,j} * log_2(D / document_freq_{i})

    or, more generally::

        weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document_freq_{i}, D)

    so you can plug in your own custom `wlocal` and `wglobal` functions.

    Parameters:

    `corpus`: optional corpus; when given (and `dictionary` is not), it is
    handed to `self.initialize()`.

    `id2word`: stored on the instance unchanged.

    `dictionary`: if specified, it must be a `corpora.Dictionary` object
    and it will be used to directly construct the inverse document frequency
    mapping (then `corpus`, if specified, is ignored with a warning).

    `wlocal`: local weighting function. Default is identity (other options:
    math.sqrt, math.log1p, ...).

    `wglobal`: global weighting function. Default is
    `log_2(total_docs / doc_freq)`, giving the formula above.

    `normalize`: dictates how the final transformed vectors will be
    normalized. `normalize=True` means set to unit length (default); `False`
    means don't normalize. You can also set `normalize` to your own function
    that accepts and returns a sparse vector.

    If neither `corpus` nor `dictionary` is given, the statistics
    (`num_docs`, `num_nnz`, `dfs`, `idfs`) are all left as None; presumably
    the model will then be initialized in some other way.
    """
    self.normalize = normalize
    self.id2word = id2word
    self.wlocal, self.wglobal = wlocal, wglobal

    # Every statistic starts out as None. `dfs` is included so that reading
    # it on an uninitialized model yields None instead of raising
    # AttributeError, consistent with the other three attributes.
    self.num_docs, self.num_nnz = None, None
    self.dfs, self.idfs = None, None

    if dictionary is not None:
        # user supplied a Dictionary object, which already contains all the
        # statistics we need to construct the IDF mapping. we can skip the
        # step that goes through the corpus (= an optimization).
        if corpus is not None:
            logger.warning("constructor received both corpus and explicit "
                           "inverse document frequencies; ignoring the corpus")
        self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
        # copy, so that this model does not share the mapping object with
        # the dictionary
        self.dfs = dictionary.dfs.copy()
        self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
    elif corpus is not None:
        # no dictionary available: delegate to initialize(), which
        # presumably collects the statistics from the corpus itself
        self.initialize(corpus)
    # else: NOTE: everything is left uninitialized (None); presumably the
    # model will be initialized in some other way
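# (web-page residue, not part of the code) Comment list
# (web-page residue, not part of the code) Article table of contents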