hashdictionary.py 文件源码

python
阅读 20 收藏 0 点赞 0 评论 0

项目:topical_word_embeddings 作者: thunlp 项目源码 文件源码
def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=True):
        """
        By default, keep track of debug statistics and mappings. If you find yourself
        running out of memory (or are sure you don't need the debug info), set
        `debug=False`.
        """
        self.myhash = myhash # hash fnc: string->integer
        self.id_range = id_range # hash range: id = myhash(key) % id_range
        self.debug = debug

        # the following (potentially massive!) dictionaries are only formed if `debug` is True
        self.token2id = {}
        self.id2token = {} # reverse mapping int->set(words)
        self.dfs = {} # token_id -> how many documents this token_id appeared in
        self.dfs_debug = {} # token_string->how many documents this word appeared in

        self.num_docs = 0 # number of documents processed
        self.num_pos = 0 # total number of corpus positions
        self.num_nnz = 0 # total number of non-zeroes in the BOW matrix
        self.allow_update = True

        if documents is not None:
            self.add_documents(documents)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号