import re

from nltk.stem import WordNetLemmatizer

# single shared lemmatizer instance used by tokenizer()
lemmatizer = WordNetLemmatizer()


def tokenizer(document):
    """
    input: a string
    output: a list of strings
    converts a string into tokens by performing the following steps:
    1. eliminates non-alphabetical characters
    2. converts to lower case
    3. splits into tokens
    4. lemmatizes each token using nltk.stem.WordNetLemmatizer
    """
    # keep letters only; replace every other character with a space
    text = re.sub('[^a-zA-Z]', ' ', document)
    # lowercase, then split on whitespace
    tokens = text.lower().split()
    # reduce each token to its lemma (defaults to noun POS, e.g. "cats" -> "cat")
    tokens = [lemmatizer.lemmatize(tkn) for tkn in tokens]
    return tokens
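

# A minimal usage sketch, not from the original post: it assumes the WordNet
# corpus has already been fetched once (e.g. via nltk.download('wordnet')),
# and the sample sentence plus expected output below are illustrative only.
if __name__ == '__main__':
    tokens = tokenizer("The 2 cats were chasing mice across the fields!")
    print(tokens)
    # expected (lemmatize defaults to noun POS, so verb forms stay as-is):
    # ['the', 'cat', 'were', 'chasing', 'mouse', 'across', 'the', 'field']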