def collection_stats():
    """Print summary statistics for the ``reuters`` corpus object.

    Prints, in order: the total number of file ids, the number of ids that
    start with ``"train"``, the number that start with ``"test"``, the number
    of categories, the word sequence of the first document in the ``"acq"``
    category, and that document's raw text.

    ``reuters`` is resolved from the enclosing module scope; it is presumably
    NLTK's Reuters corpus reader (``nltk.corpus.reuters``) -- confirm against
    the file's imports.

    Returns:
        None. All results are written to standard output.
    """
    # All document ids in the corpus.
    documents_stat = reuters.fileids()
    print(len(documents_stat), "documents")

    # Split ids by their "train"/"test" prefix.
    train_docs_stat = [doc for doc in documents_stat if doc.startswith("train")]
    print(len(train_docs_stat), "total training documents")

    test_docs_stat = [doc for doc in documents_stat if doc.startswith("test")]
    print(len(test_docs_stat), "total test documents")

    # All categories known to the corpus.
    categories = reuters.categories()
    print(len(categories), "categories")

    # Documents belonging to a single category.
    category_docs = reuters.fileids("acq")

    # Inspect the first document of that category: its words, then raw text.
    document_id = category_docs[0]
    document_words = reuters.words(document_id)
    print(document_words)
    print(reuters.raw(document_id))
# [web page residue] Comment list
# [web page residue] Table of contents