def create_dictionary(self):
    """
    Build a gensim-style Dictionary directly from the corpus and
    vocabulary data.

    The id/token mappings and the document and non-zero counts are copied
    from this corpus; document frequencies (``dfs``) and the total number
    of positions (``num_pos``) are accumulated in a single pass over the
    documents obtained by iterating ``self``.

    Each document is expected to be an iterable of ``(word, count)``
    pairs (presumably word ids -- confirm against the corpus iterator).
    Every pair adds one to that word's document frequency, so a word is
    assumed to occur at most once per document.

    Returns:
        The populated ``Dictionary`` instance.
    """
    dictionary = Dictionary()

    # Replace dfs with a defaultdict to avoid downstream KeyErrors:
    # UCI vocabularies may contain terms that are not used in the
    # document data.
    dictionary.dfs = defaultdict(int)

    # The forward mapping is shared with the corpus (not copied); the
    # reverse mapping is derived from it.
    dictionary.id2token = self.id2word
    dictionary.token2id = {
        token: token_id for token_id, token in iteritems(self.id2word)
    }

    dictionary.num_docs = self.num_docs
    dictionary.num_nnz = self.num_nnz

    for docno, doc in enumerate(self):
        if docno % 10000 == 0:
            # Hand the arguments to the logger so the message is only
            # formatted when INFO records are actually emitted.
            logger.info(
                'PROGRESS: processing document %i of %i',
                docno, self.num_docs,
            )

        for word, count in doc:
            dictionary.dfs[word] += 1
            dictionary.num_pos += count

    return dictionary
评论列表
文章目录