def get_sentence_tokens(text):
    """Tokenize a text (e.g. a review) sentence by sentence.

    The text is split into sentences with ``sent_tokenize``; each sentence
    is then split into tokens with ``word_tokenize``. Tokens that are
    empty after stripping whitespace, or that appear in the ``stopwords``
    collection defined elsewhere in this file, are discarded.

    :param text: the raw text to tokenize.
    :return: a list with one entry per sentence, each entry being the list
        of tokens kept for that sentence (possibly empty).
    """
    # NOTE: stemming (PorterStemmer) was tried by the original author and
    # reportedly made no difference, so tokens are returned unstemmed.
    return [
        [
            word
            for word in word_tokenize(sentence)
            if word.strip() != '' and word not in stopwords
        ]
        for sentence in sent_tokenize(text)
    ]
# (web page residue) Comment list
# (web page residue) Table of contents