def create_phrase(self, phrase_str):
    """Build a ``Phrase`` from a raw string using NLTK.

    The string is word-tokenized, part-of-speech tagged and named-entity
    chunked with NLTK. The resulting chunk tree is handed to
    ``self._find_multi_token_nnp`` and then, together with that result, to
    ``self._merge_tokens_and_flatten``, which yields the sequence of items
    (tagged tuples and/or NLTK trees) processed below:

    * items that are NLTK trees are converted with ``self._tree_to_tuple``;
    * any other item whose first element is found in ``self._keywords`` is
      replaced by a ``(word, self._keywords[word])`` pair;
    * everything else is kept unchanged.

    Args:
        phrase_str: The text to convert.

    Returns:
        A ``Phrase`` constructed from the resulting list of tokens.
    """
    tagged_words = nltk.pos_tag(nltk.word_tokenize(phrase_str))
    ne_chunk_tree = nltk.ne_chunk(tagged_words)

    merge_tokens = self._find_multi_token_nnp(ne_chunk_tree)
    ne_chunk_list = self._merge_tokens_and_flatten(ne_chunk_tree, merge_tokens)

    tokens = []  # list of tagged tuples
    for token in ne_chunk_list:
        # isinstance rather than an exact type comparison, so that Tree
        # subclasses are converted too instead of being treated as tuples.
        if isinstance(token, nltk.tree.Tree):
            tokens.append(self._tree_to_tuple(token))
            continue

        word = token[0]
        if word in self._keywords:
            # Known keyword: override the tag with the configured value.
            token = (word, self._keywords[word])
        tokens.append(token)

    return Phrase(tokens)
#input: "ne_chunk_tree" - nltk tree of tuples and/or trees containing nltk tokens, "merge_tokens" - a list of int tuples
#output: list of tuples/trees containing nltk tokens
#purpose: merge tokens in ne_chunk_tree using the index ranges listed in the merge_tokens input argument, then flatten ne_chunk_tree from an nltk tree to a list
评论列表
文章目录