tfidfmodel.py 文件源码

python
阅读 27 收藏 0 点赞 0 评论 0

项目:topical_word_embeddings 作者: thunlp 项目源码 文件源码
def __init__(self, corpus=None, id2word=None, dictionary=None,
                 wlocal=utils.identity, wglobal=df2idf, normalize=True):
        """
        Compute tf-idf by multiplying a local component (term frequency) with a
        global component (inverse document frequency), and normalizing
        the resulting documents to unit length. Formula for unnormalized weight
        of term `i` in document `j` in a corpus of D documents::

          weight_{i,j} = frequency_{i,j} * log_2(D / document_freq_{i})

        or, more generally::

          weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document_freq_{i}, D)

        so you can plug in your own custom `wlocal` and `wglobal` functions.

        Default for `wlocal` is identity (other options: math.sqrt, math.log1p, ...)
        and default for `wglobal` is `log_2(total_docs / doc_freq)`, giving the
        formula above.

        `normalize` dictates how the final transformed vectors will be normalized.
        `normalize=True` means set to unit length (default); `False` means don't
        normalize. You can also set `normalize` to your own function that accepts
        and returns a sparse vector.

        If `dictionary` is specified, it must be a `corpora.Dictionary` object
        and it will be used to directly construct the inverse document frequency
        mapping (then `corpus`, if specified, is ignored).
        """
        self.normalize = normalize
        self.id2word = id2word
        self.wlocal, self.wglobal = wlocal, wglobal
        self.num_docs, self.num_nnz, self.idfs = None, None, None
        if dictionary is not None:
            # user supplied a Dictionary object, which already contains all the
            # statistics we need to construct the IDF mapping. we can skip the
            # step that goes through the corpus (= an optimization).
            if corpus is not None:
                logger.warning("constructor received both corpus and explicit "
                               "inverse document frequencies; ignoring the corpus")
            self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
            self.dfs = dictionary.dfs.copy()
            self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
        elif corpus is not None:
            self.initialize(corpus)
        else:
            # NOTE: everything is left uninitialized; presumably the model will
            # be initialized in some other way
            pass
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号