import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

cachedStopWords = set(stopwords.words('english'))  # build the stopword set once
stemmer = PorterStemmer()  # reuse one stemmer instead of creating one per token

def tokenize(text):
    # Lowercase, drop stopwords, stem, then keep alphabetic tokens of length >= min_length.
    min_length = 3
    words = [word.lower() for word in word_tokenize(text)]
    words = [word for word in words if word not in cachedStopWords]
    tokens = [stemmer.stem(token) for token in words]
    p = re.compile('[a-zA-Z]+')
    return [token for token in tokens if p.match(token) and len(token) >= min_length]
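
A minimal usage sketch, assuming the NLTK 'punkt' tokenizer and 'stopwords' corpus have already been downloaded; the sample sentence and the printed result are illustrative, not from the source:

sample = "The striped bats were hanging on their feet"
print(tokenize(sample))  # roughly: ['stripe', 'bat', 'hang', 'feet']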