def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=True):
"""
By default, keep track of debug statistics and mappings. If you find yourself
running out of memory (or are sure you don't need the debug info), set
`debug=False`.
"""
self.myhash = myhash # hash fnc: string->integer
self.id_range = id_range # hash range: id = myhash(key) % id_range
self.debug = debug
# the following (potentially massive!) dictionaries are only formed if `debug` is True
self.token2id = {}
self.id2token = {} # reverse mapping int->set(words)
self.dfs = {} # token_id -> how many documents this token_id appeared in
self.dfs_debug = {} # token_string->how many documents this word appeared in
self.num_docs = 0 # number of documents processed
self.num_pos = 0 # total number of corpus positions
self.num_nnz = 0 # total number of non-zeroes in the BOW matrix
self.allow_update = True
if documents is not None:
self.add_documents(documents)
评论列表
文章目录