import re

import nltk
from nltk.stem.snowball import SnowballStemmer

# The original snippet references `stemmer` without defining it;
# a SnowballStemmer is assumed here.
stemmer = SnowballStemmer("english")

# Domain-specific tokens to drop before stemming.
DROPPED_TOKENS = {'intern', 'student', 'and'}


def tokenize_and_stem(text):
    """
    First tokenize by sentence, then by word, so that punctuation
    is caught as its own token.
    """
    tokens = [word
              for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    # Keep only tokens containing at least one letter (drops numeric
    # tokens and raw punctuation), then remove the stop words above.
    filtered_tokens = [token for token in tokens
                       if re.search('[a-zA-Z]', token)
                       and token not in DROPPED_TOKENS]
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
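
A minimal usage sketch, assuming NLTK's sentence tokenizer models are available (e.g. via nltk.download('punkt')); the sample sentence is illustrative:

# Assumes the punkt models are installed: nltk.download('punkt')
print(tokenize_and_stem("The intern and the student ran tests in 2024."))
# -> ['the', 'the', 'ran', 'test', 'in']
# 'intern', 'and', and 'student' are removed by the stop-word filter;
# '2024' and '.' are removed for containing no letters.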