def label_nounphrases(self):
    """Label noun phrases in the output from POS chunking.

    Chunks ``self.pos`` with a small regular-expression grammar, assigns an
    integer id to every distinct noun phrase and tags each token with the
    id of the phrase it belongs to.

    ``self.pos`` is passed straight to ``nltk.RegexpParser.parse``;
    presumably it is a list of ``(word, tag)`` pairs - confirm against the
    code that populates it.

    Returns:
        A ``(flat_list, mapping_dict)`` tuple.  ``flat_list`` holds one
        ``(word, tag, np_id)`` triple per token, where ``np_id`` is ``""``
        for tokens outside any noun phrase.  ``mapping_dict`` maps each
        registered noun-phrase string (determiners and possessive pronouns
        stripped) to its id; ids start at 1.
    """
    # Raw string: ``\$`` must reach the chunk parser verbatim, and in a
    # non-raw literal it is an invalid escape sequence.
    grammar = r'''
        NP: {<DT|PRP\$> <VBG> <NN.*>+}
            {<DT|PRP\$> <NN.*> <POS> <JJ>* <NN.*>+}
            {<DT|PRP\$>? <JJ>* <NN.*>+ }
    '''
    chunker = nltk.RegexpParser(grammar)
    ptree = nltk.tree.ParentedTree.convert(chunker.parse(self.pos))
    np_subtrees = ptree.subtrees(filter=lambda t: t.label() == 'NP')

    # Tags dropped from the phrase text so that "the dog", "his dog" and
    # "dog" all share one key.  (The previous test, ``!= ("DT" or "PRP$")``,
    # only ever compared against "DT".)
    skipped_tags = ("DT", "PRP$")

    # mapping_dict: phrase text -> id; pos_to_np: child index in parent -> id.
    mapping_dict = {}
    pos_to_np = {}
    for subtree in np_subtrees:
        np_string = " ".join(
            leaf[0] for leaf in subtree.leaves()
            if leaf[1] not in skipped_tags
        )
        np_id = mapping_dict.get(np_string)
        if np_id is None:
            # Reuse the id of the first registered phrase for which
            # ends_with(np_string, known) holds; such a phrase is not
            # registered under its own text.
            matched = next(
                (known for known in mapping_dict
                 if ends_with(np_string, known)),
                None,
            )
            if matched is not None:
                np_id = mapping_dict[matched]
            else:
                np_id = len(mapping_dict) + 1
                mapping_dict[np_string] = np_id
        # NOTE(review): parent_index() is later matched against positions
        # among the root's children, which assumes every NP chunk is a
        # direct child of the root - confirm if the grammar ever nests.
        pos_to_np[subtree.parent_index()] = np_id

    # Flatten the tree back into tokens, attaching the phrase id (if any).
    flat_list = []
    for index, node in enumerate(ptree):
        np_label = pos_to_np.get(index, "")
        if isinstance(node, nltk.tree.Tree):
            # Chunked node: every leaf inherits the chunk's label.
            flat_list.extend(
                (leaf[0], leaf[1], np_label) for leaf in node.leaves()
            )
        else:
            # Unchunked token sitting directly under the root.
            flat_list.append((node[0], node[1], np_label))
    return (flat_list, mapping_dict)
评论列表
文章目录