def describe(self, fileids=None, categories=None):
"""
Performs a single pass of the corpus and returns a dictionary with a
variety of metrics concerning the state of the corpus.
"""
# Structures to perform counting.
counts = nltk.FreqDist()
tokens = nltk.FreqDist()
started = time.time()
# Perform single pass over paragraphs, tokenize and count
for para in self.paras(fileids, categories):
counts['paras'] += 1
for sent in self._sent_tokenizer.tokenize(para):
counts['sents'] += 1
for word in self._word_tokenizer.tokenize(sent):
counts['words'] += 1
tokens[word] += 1
# Compute the number of files and categories in the corpus
n_fileids = len(self._resolve(fileids, categories) or self.fileids())
n_topics = len(self.categories(self._resolve(fileids, categories)))
# Return data structure with information
return {
'files': n_fileids,
'topics': n_topics,
'paras': counts['paras'],
'sents': counts['sents'],
'words': counts['words'],
'vocab': len(tokens),
'lexdiv': float(counts['words']) / float(len(tokens)),
'ppdoc': float(counts['paras']) / float(n_fileids),
'sppar': float(counts['sents']) / float(counts['paras']),
'secs': time.time() - started,
}
评论列表
文章目录