import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


def tokenize(text):
"""
Tokenizes sequences of text and stems the tokens.
:param text: String to tokenize
:return: List with stemmed tokens
"""
    # Split on whitespace, then strip everything except letters and
    # apostrophes so contractions like "aren't" survive the first pass.
    tokens = nltk.WhitespaceTokenizer().tokenize(text)
    tokens = list(set(re.sub(r"[^a-zA-Z']", "", token) for token in tokens))

    # Drop English stopwords. Building the set once avoids re-reading the
    # corpus for every token; note the check is case-sensitive as written,
    # so capitalized stopwords like "The" pass through.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Strip the remaining apostrophes and deduplicate again.
    tokens = list(set(re.sub(r"[^a-zA-Z]", "", token) for token in tokens))

    # Stem each token, skipping any that reduce to the empty string.
    stemmer = SnowballStemmer("english")
    stems = []
    for token in tokens:
        stem = stemmer.stem(token)
        if stem != "":
            stems.append(stem)
    return stems
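
# A minimal usage sketch. It assumes the NLTK "stopwords" corpus has been
# downloaded (e.g. via nltk.download('stopwords')); the sample sentence is
# illustrative and not from the original post.
if __name__ == "__main__":
    sample = "The cats aren't running; they were chased by dogs!"
    print(tokenize(sample))
    # Output order is not guaranteed because of the set() deduplication;
    # it contains stems such as 'cat', 'run', 'chase', and 'dog'.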