import string

import nltk
from nltk import stem
from nltk.corpus import stopwords


def tokenizeDocument(document):
    # replace punctuation with spaces (otherwise we end up with a bunch of empty tokens)
    translate_table = {ord(char): " " for char in string.punctuation}
    document = document.translate(translate_table)
    # split the document into word tokens
    tokenized_doc = nltk.word_tokenize(document)
    # reduce each token to its stem
    snowball = stem.snowball.EnglishStemmer()
    tokenized_doc = [snowball.stem(word) for word in tokenized_doc]
    # remove stop words; build the set once so membership checks are O(1)
    # (note: stemming first means a stem like "veri" no longer matches "very";
    # swap these two steps if stop words should be filtered on the raw tokens)
    stop_words = set(stopwords.words('english'))
    tokenized_doc = [word for word in tokenized_doc if word not in stop_words]
    return tokenized_doc
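
# Usage sketch (illustrative, not from the original post). nltk.word_tokenize
# and stopwords.words need the NLTK data packages, downloadable once with
# nltk.download('punkt') and nltk.download('stopwords').
sample = "The cats are running quickly, aren't they?"
print(tokenizeDocument(sample))  # e.g. ['cat', 'run', 'quick']
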
# given the dictionary, return an array of all the tokenized comments
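# A possible implementation sketch for the step described above; the function
# name and the assumption that the dictionary maps an id to a raw comment
# string are mine, not the original author's:
def tokenizeComments(comment_dict):
    # run every comment through the same pipeline as above
    return [tokenizeDocument(comment) for comment in comment_dict.values()]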