def __init__(self, root, fileids=DOC_PATTERN, tags=None,
word_tokenizer=WordPunctTokenizer(),
sent_tokenizer=nltk.data.LazyLoader(
'tokenizers/punkt/english.pickle'),
encoding='utf8', **kwargs):
"""
Initialize the corpus reader. Categorization arguments
(``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
the ``CategorizedCorpusReader`` constructor. The remaining arguments
are passed to the ``CorpusReader`` constructor.
"""
# Add the default category pattern if not passed into the class.
if not any(key.startswith('cat_') for key in kwargs.keys()):
kwargs['cat_pattern'] = CAT_PATTERN
CategorizedCorpusReader.__init__(self, kwargs)
CorpusReader.__init__(self, root, fileids, encoding)
self._word_tokenizer = word_tokenizer
self._sent_tokenizer = sent_tokenizer
self._good_tags = tags or self.TAGS
评论列表
文章目录