claim.py 文件源码

python
阅读 37 收藏 0 点赞 0 评论 0

项目:patentdata 作者: benhoyle 项目源码 文件源码
def label_nounphrases(self):
        """Label noun phrases in the output from POS chunking.

        Chunks ``self.pos`` with a regular-expression grammar, assigns an
        integer id to each distinct noun phrase, and tags every token with
        the id of the noun phrase it belongs to.

        ``self.pos`` is assumed to be a sequence of ``(word, tag)`` pairs as
        accepted by ``nltk.RegexpParser.parse`` - TODO confirm against the
        code that populates it.

        Returns:
            A ``(flat_list, mapping_dict)`` tuple. ``flat_list`` contains one
            ``(word, tag, np_id)`` triple per token, in original order, where
            ``np_id`` is ``""`` for tokens outside any noun phrase.
            ``mapping_dict`` maps each noun-phrase string (with determiners
            and possessive pronouns removed) to its id; ids start at 1.
        """
        # Raw string: the grammar contains ``\$``, which is an invalid escape
        # sequence in a normal string literal.
        grammar = r'''
            NP: {<DT|PRP\$> <VBG> <NN.*>+}
                {<DT|PRP\$> <NN.*> <POS> <JJ>* <NN.*>+}
                {<DT|PRP\$>? <JJ>* <NN.*>+ }
            '''

        cp = nltk.RegexpParser(grammar)
        result = cp.parse(self.pos)
        ptree = nltk.tree.ParentedTree.convert(result)
        subtrees = ptree.subtrees(filter=lambda x: x.label() == 'NP')

        # Tags dropped from the noun-phrase key so that e.g. "a widget",
        # "the widget" and "its widget" all map to the same entry.
        ignored_tags = ("DT", "PRP$")

        # mapping_dict: noun-phrase string -> id.
        # pos_to_np: index of the NP chunk within its parent -> id.
        mapping_dict = {}
        pos_to_np = {}
        for st in subtrees:
            np_string = " ".join(
                leaf[0] for leaf in st.leaves()
                if leaf[1] not in ignored_tags
            )
            np_id = mapping_dict.get(np_string)
            if np_id is None:
                # Not seen verbatim: reuse the id of the first known phrase
                # that ends_with() matches against this one, otherwise
                # allocate the next id.
                # NOTE(review): ends_with is defined elsewhere; its exact
                # matching rule is not visible from this block.
                matched = next(
                    (
                        known for known in mapping_dict
                        if ends_with(np_string, known)
                    ),
                    None,
                )
                if matched is not None:
                    np_id = mapping_dict[matched]
                else:
                    np_id = len(mapping_dict) + 1
                    mapping_dict[np_string] = np_id
            pos_to_np[st.parent_index()] = np_id

        # Flatten the chunk tree back into a token list, attaching the id of
        # the enclosing noun phrase (or "" when the token is not in one).
        flat_list = []
        for i, node in enumerate(ptree):
            np_label = pos_to_np.get(i, "")
            if isinstance(node, nltk.tree.Tree):
                # Chunk: unpack each leaf and add the label as a triple.
                flat_list.extend(
                    (leaf[0], leaf[1], np_label) for leaf in node.leaves()
                )
            else:
                # Bare (word, tag) token sitting directly under the root.
                flat_list.append((node[0], node[1], np_label))
        return (flat_list, mapping_dict)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号