def clean_text(raw_text, filtered_word_types):
    """Normalize raw text into a space-joined string of stemmed tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, and each token is passed
    through a ``PorterStemmer``.  When ``filtered_word_types`` is not
    ``None``, the stemmed tokens are tagged with ``nltk.pos_tag`` and every
    token whose tag is found in ``filtered_word_types`` is dropped.

    Args:
        raw_text: Text to clean.
        filtered_word_types: Container of tags (as produced by
            ``nltk.pos_tag``) to exclude, or ``None`` to skip tag-based
            filtering entirely.

    Returns:
        The remaining stemmed tokens joined by single spaces.
    """
    # Anything that is not an ASCII letter acts as a token separator.
    alpha_only = re.sub("[^a-zA-Z]", " ", raw_text)
    stem = PorterStemmer().stem
    tokens = [stem(token) for token in alpha_only.lower().split()]

    # No tag filter requested: the stemmed tokens are the final result.
    if filtered_word_types is None:
        return " ".join(tokens)

    # NOTE(review): tagging runs on the already-stemmed tokens rather than on
    # the original words; confirm this ordering is intended.
    kept = []
    for token, tag in nltk.pos_tag(tokens):
        if tag in filtered_word_types:
            continue
        kept.append(token)
    return " ".join(kept)
# 评论列表 (comment list)
# 文章目录 (table of contents)