def __init__(self, root, fileids=None,
             word_tokenizer=None,
             encoding='utf8'):
    """
    Set up the reader and reject empty corpus files up front.

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words. When omitted
        (or ``None``), a new ``TweetTokenizer()`` is created for this
        reader.
    :param encoding: The encoding forwarded to ``CorpusReader.__init__``.
    :raises ValueError: If a corpus file that is not a
        ``ZipFilePathPointer`` has a size of zero bytes.
    """
    CorpusReader.__init__(self, root, fileids, encoding)

    # Check that all user-created corpus files are non-empty.
    for path in self.abspaths(self._fileids):
        # Zip path pointers are skipped; only the remaining paths are
        # size-checked with os.path.getsize.
        if isinstance(path, ZipFilePathPointer):
            continue
        if os.path.getsize(path) == 0:
            raise ValueError("File {} is empty".format(path))

    # Build the default tokenizer per instance rather than once at
    # function-definition time, so readers never share a single default
    # tokenizer object.
    if word_tokenizer is None:
        word_tokenizer = TweetTokenizer()
    self._word_tokenizer = word_tokenizer
# NOTE: web-page residue, not part of the code: "Comment list" / "Table of contents".