import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Assumption: `stop_words` and `stemmer` were never defined in the original snippet;
# the « » guillemets suggest Russian text, so Russian resources are used here.
stop_words = set(stopwords.words("russian"))
stemmer = SnowballStemmer("russian")

def tokenize_me(file_text):
    # Apply NLTK word tokenization
    tokens = nltk.word_tokenize(file_text)
    # Drop tokens that are pure punctuation
    tokens = [i for i in tokens if i not in string.punctuation]
    # Drop stop words
    tokens = [i for i in tokens if i not in stop_words]
    # Strip guillemet quote marks attached to words
    tokens = [i.replace("«", "").replace("»", "") for i in tokens]
    # Stem each token and return the deduplicated set
    tokens = [stemmer.stem(i) for i in tokens]
    return set(tokens)
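
# A minimal usage sketch, assuming the required NLTK data has been fetched
# beforehand with nltk.download("punkt") and nltk.download("stopwords"):
if __name__ == "__main__":
    sample = "Компания «Пример» сегодня выпустила новый продукт."
    print(tokenize_me(sample))
    # Prints an unordered set of stemmed tokens, e.g. {'компан', 'выпуст', ...};
    # the exact stems depend on the Snowball stemmer's rules.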