def text_to_sentences(self, text, tokenizer, remove_stopwords=False):
    """Split *text* into sentences and tokenize each into words.

    Parameters
    ----------
    text : str or bytes
        The review/document text. Byte strings are decoded as UTF-8.
    tokenizer : object or None
        An NLTK sentence tokenizer (e.g. a Punkt tokenizer). If provided,
        its ``tokenize`` method is used to split sentences; otherwise
        ``nltk.tokenize.sent_tokenize`` is used as a fallback.
    remove_stopwords : bool, optional
        Accepted for interface compatibility; stopword removal is not
        currently implemented (the original call is commented out upstream).

    Returns
    -------
    list of list of str
        One inner list of word tokens per non-empty sentence.
    """
    from nltk.tokenize import sent_tokenize, wordpunct_tokenize

    # Only decode when we actually hold bytes; calling .decode() on an
    # already-decoded string raises (py3) or double-decodes (py2).
    if isinstance(text, bytes):
        text = text.decode("utf8")

    # 1. Split the paragraph into sentences, preferring the caller-supplied
    #    tokenizer (the original function ignored this parameter).
    if tokenizer is not None:
        raw_sentences = tokenizer.tokenize(text.strip())
    else:
        raw_sentences = sent_tokenize(text.strip())

    # 2. Tokenize every non-empty sentence into words.
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(wordpunct_tokenize(raw_sentence))

    # Each sentence is a list of word tokens, so this is a list of lists.
    return sentences
# Stray scraped-page text (kept, commented out, so the module parses):
# 评论列表  ("comment list")
# 文章目录  ("table of contents")