def text_to_sentences(self, text, tokenizer, remove_stopwords=False):
    """Split *text* into sentences and tokenize each into words.

    Parameters
    ----------
    text : str or bytes
        The review/document text. Byte strings are decoded as UTF-8.
    tokenizer : object or None
        An NLTK sentence tokenizer (e.g. a Punkt tokenizer). If provided,
        its ``tokenize`` method is used to split sentences; otherwise
        ``nltk.tokenize.sent_tokenize`` is used as a fallback.
    remove_stopwords : bool, optional
        Accepted for interface compatibility; stopword removal is not
        currently implemented (the original call is commented out upstream).

    Returns
    -------
    list of list of str
        One inner list of word tokens per non-empty sentence.
    """
    from nltk.tokenize import sent_tokenize, wordpunct_tokenize

    # Only decode when we actually hold bytes; calling .decode() on an
    # already-decoded string raises (py3) or double-decodes (py2).
    if isinstance(text, bytes):
        text = text.decode("utf8")

    # 1. Split the paragraph into sentences, preferring the caller-supplied
    #    tokenizer (the original function ignored this parameter).
    if tokenizer is not None:
        raw_sentences = tokenizer.tokenize(text.strip())
    else:
        raw_sentences = sent_tokenize(text.strip())

    # 2. Tokenize every non-empty sentence into words.
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(wordpunct_tokenize(raw_sentence))

    # Each sentence is a list of word tokens, so this is a list of lists.
    return sentences
# Stray scraped-page text (kept, commented out, so the module parses):
# 评论列表  ("comment list")
# 文章目录  ("table of contents")