def tokenize(text):
    """Yield the stemmed, lower-cased word tokens of ``text``.

    The text is lower-cased and split with ``nltk.word_tokenize``; every
    token that is not pure punctuation is reduced to its stem with the
    English Snowball stemmer and yielded in order of appearance.

    This is a generator, so no work happens until it is iterated.

    Args:
        text: The string to tokenize.

    Yields:
        The stem of each token that contains at least one character
        outside ``string.punctuation``.
    """
    stemmer = nltk.stem.SnowballStemmer('english')
    punctuation = frozenset(string.punctuation)

    for token in nltk.word_tokenize(text.lower()):
        # Check every character instead of ``token in string.punctuation``:
        # that is a substring test, so multi-character punctuation tokens
        # such as "..." or "--" are not contiguous slices of
        # string.punctuation and would otherwise be yielded as words.
        if all(char in punctuation for char in token):
            continue
        yield stemmer.stem(token)


# The corpus object