def extract_chunks(sent, chunkGram=r"""Chunk: {<JJ|NN.*>*<NNP>+<JJ|NN.*|IN>*<NN.*>}"""):
    """Extract proper-noun-anchored noun chunks from a sentence.

    The default grammar requires at least one proper noun (NNP) inside the
    chunk, since a bare common-noun phrase (or a single NNP on its own) is
    probably not enough information to identify a data source. Possessives
    are deliberately not matched by the default grammar.

    Parameters:
        sent: str — raw sentence text; it is tokenized and POS-tagged here.
        chunkGram: str — NLTK ``RegexpParser`` grammar defining the
            ``Chunk`` rule; override to change what counts as a chunk.

    Returns:
        tuple[Tree, list[str]]: the full chunked parse tree and the list of
        chunk surface strings (leaf tokens joined by single spaces), or
        ``None`` if tagging/parsing raised (the error is printed instead).
    """
    try:
        # NOTE(review): pos_tag / word_tokenize / RegexpParser are assumed
        # to be imported from nltk elsewhere in this file — confirm.
        tagged = pos_tag(word_tokenize(sent))
        chunkParser = RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        # Join each 'Chunk' subtree's leaf tokens into one space-separated
        # string (equivalent to the old concat-then-strip loop).
        chunks = [
            " ".join(leaf[0] for leaf in subtree.leaves())
            for subtree in chunked.subtrees(filter=lambda t: t.label() == "Chunk")
        ]
        return chunked, chunks
    except Exception as e:
        # Best-effort contract: report the failure and implicitly return
        # None, matching the original behavior callers may depend on.
        print(str(e))
# (removed non-code scraping artifacts that trailed this file:
#  "评论列表" = "comment list", "文章目录" = "article table of contents")