def words(self, fileids=None, categories=None):
    """
    Lazily yield every word token from the selected sentences.

    Each sentence produced by ``self.sents(fileids, categories)`` is passed
    to ``self._word_tokenizer.tokenize`` and the resulting tokens are
    yielded one at a time, in the order the tokenizer returns them.

    NOTE(review): the earlier docstring stated that this method uses
    BeautifulSoup to parse HTML content. No parsing is done in this
    method itself, so it presumably happens inside ``self.sents`` --
    confirm against that method.

    :param fileids: forwarded unchanged to ``self.sents``.
    :param categories: forwarded unchanged to ``self.sents``.
    :return: a generator over the tokens of every sentence.
    """
    for sentence in self.sents(fileids, categories):
        # Delegate to the tokenizer's result rather than re-yielding
        # each token by hand.
        yield from self._word_tokenizer.tokenize(sentence)
评论列表
文章目录