def _preprocess(self, text):
    """Split *text* into sentences and reduce each one to stemmed tokens.

    The sentences produced by ``sent_tokenize`` are stored on
    ``self._sents``. Each sentence is then lower-cased, tokenized with
    ``word_tokenize``, filtered against ``self._stopwords`` and the
    remaining tokens are stemmed with a ``PorterStemmer``.

    Returns a list with one entry per sentence; each entry is the list of
    stemmed, non-stop-word tokens of that sentence, in their original order.
    """
    porter = PorterStemmer()
    # The sentence list is stored on the instance as well as processed below.
    self._sents = sent_tokenize(text)

    processed = []
    for sentence in self._sents:
        stems = []
        for token in word_tokenize(sentence.lower()):
            # Stop-word check uses the lower-cased token, before stemming.
            if token in self._stopwords:
                continue
            stems.append(porter.stem(token))
        processed.append(stems)
    return processed
# (Web-page residue, not code: "Comment list" / "Article table of contents")