hashdictionary.py 文件源码-python代码片段

hashdictionary.py 文件源码

python

阅读 29 收藏 0 点赞 0 评论 0

项目：topical_word_embeddings 作者: thunlp 项目源码文件源码

def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=True):
        """
        By default, keep track of debug statistics and mappings. If you find yourself
        running out of memory (or are sure you don't need the debug info), set
        `debug=False`.
        """
        self.myhash = myhash # hash fnc: string->integer
        self.id_range = id_range # hash range: id = myhash(key) % id_range
        self.debug = debug

        # the following (potentially massive!) dictionaries are only formed if `debug` is True
        self.token2id = {}
        self.id2token = {} # reverse mapping int->set(words)
        self.dfs = {} # token_id -> how many documents this token_id appeared in
        self.dfs_debug = {} # token_string->how many documents this word appeared in

        self.num_docs = 0 # number of documents processed
        self.num_pos = 0 # total number of corpus positions
        self.num_nnz = 0 # total number of non-zeroes in the BOW matrix
        self.allow_update = True

        if documents is not None:
            self.add_documents(documents)