PhraseMaker.py 文件源码-python代码片段

def create_phrase(self, phrase_str):
        """Build a ``Phrase`` from a raw text string.

        input: "phrase_str" - the text to convert; it is passed directly to
            nltk.word_tokenize.
        output: a ``Phrase`` constructed from the resulting list of tokens.
        purpose: tokenize, POS-tag and named-entity-chunk the string with
            nltk, merge/flatten the chunk tree through this object's helper
            methods, then override the tag of any token whose text is a key
            of ``self._keywords``.
        """
        words = nltk.word_tokenize(phrase_str)
        tagged_words = nltk.pos_tag(words)
        ne_chunk_tree = nltk.ne_chunk(tagged_words)

        # Index ranges of tokens that should be merged into a single token
        # (presumably multi-word proper nouns, judging by the helper's name).
        merge_tokens = self._find_multi_token_nnp(ne_chunk_tree)
        ne_chunk_list = self._merge_tokens_and_flatten(ne_chunk_tree, merge_tokens)

        tokens = []  # list of tagged tuples
        for token in ne_chunk_list:
            # isinstance (not an exact type comparison) so that Tree
            # subclasses are also routed to the tree conversion instead of
            # being indexed as if they were (word, tag) tuples.
            if isinstance(token, nltk.tree.Tree):
                tokens.append(self._tree_to_tuple(token))
            elif token[0] in self._keywords:
                # Known keyword: keep the word, replace its tag with the
                # value stored in self._keywords.
                tokens.append((token[0], self._keywords[token[0]]))
            else:
                tokens.append(token)

        return Phrase(tokens)

    #input: "ne_chunk_tree" - nltk tree of tuples and/or trees containing nltk tokens, "merge_tokens" - a list of int tuples
    #output: list of tuples/trees containing nltk tokens
    #purpose: merge tokens in ne_chunk_tree using the index ranges listed in the merge_tokens input argument, then flatten ne_chunk_tree from an nltk tree to a list