def docs(self, fileids=None, categories=None):
    """
    Yield the parsed JSON document for every selected file in the corpus.

    Documents are produced lazily by a plain Python generator: each file is
    opened, decoded with ``json.load``, and closed again before its document
    is handed to the caller, so at most one file handle is open and one
    document is loaded per iteration step.

    Note: the nltk ``CorpusView`` and ``concat`` helpers were tried here
    first but did not give memory safe iteration; the generator did a
    better job of closing file handles and of not loading all data at once.
    Re-implementing this as a corpus view remains possible future work.

    ``fileids`` and ``categories`` are forwarded unchanged to
    ``self._resolve``, whose result is then passed to ``self.abspaths``.
    """
    # Resolve the fileids and the categories
    fileids = self._resolve(fileids, categories)

    # Each entry from abspaths is unpacked as (path, encoding, fileid);
    # the fileid itself is not needed to load the document.
    for path, enc, _ in self.abspaths(fileids, True, True):
        with codecs.open(path, 'r', encoding=enc) as f:
            doc = json.load(f)
        # Yield only after the ``with`` block has closed the file, so the
        # handle is not held open while the consumer processes the
        # document (or if the consumer abandons the generator early).
        yield doc
# Comment list
# Table of contents