import nltk
from nltk.stem.lancaster import LancasterStemmer

def getTokens(self, removeStopwords=True):
    """ Tokenizes the text into lowercased, stemmed words (punctuation is
    dropped), optionally removes stopwords, and records each token's
    character span in self.spans. """
    # Custom regex: a letter followed by word characters, with an optional
    # internal apostrophe (e.g. "don't"); single-letter tokens are skipped.
    tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z]\w+'?\w*")
    spans = list(tokenizer.span_tokenize(self.text))
    # Record the end offset of the last span, i.e. the effective text length.
    self.length = spans[-1][-1] if spans else 0
    tokens = tokenizer.tokenize(self.text)
    tokens = [token.lower() for token in tokens]  # normalize case
    stemmer = LancasterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    if not removeStopwords:
        self.spans = spans
        return tokens
    tokenSpans = list(zip(tokens, spans))  # pair each token with its span
    # Stem the stopwords too, so they match the already-stemmed tokens
    # (comparing stemmed tokens against raw stopwords would miss many).
    stopwords = {stemmer.stem(w) for w in nltk.corpus.stopwords.words('english')}
    tokenSpans = [ts for ts in tokenSpans if ts[0] not in stopwords]
    self.spans = [span for token, span in tokenSpans]  # unzip: spans
    return [token for token, span in tokenSpans]       # unzip: tokens
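
# --- Usage sketch (not from the original source) ---------------------------
# A minimal sketch of how the method above might be called. It assumes the
# method is attached to a class exposing a `text` attribute; the `Text`
# wrapper below is hypothetical, and the NLTK stopwords corpus must be
# available (run nltk.download('stopwords') once beforehand).

class Text:
    getTokens = getTokens  # attach the function defined above as a method

    def __init__(self, raw):
        self.text = raw

doc = Text("The sun also rises, and then the sun goes down again.")
print(doc.getTokens())  # stemmed content words, stopwords removed
print(doc.spans)        # character offsets of the surviving tokens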