def extract_chunks(sent, chunkGram=r"""Chunk: {<JJ|NN.*>*<NNP>+<JJ|NN.*|IN>*<NN.*>}"""):
    """Extract proper-noun-anchored noun chunks from a sentence.

    The default grammar requires at least one proper noun (NNP) inside the
    chunk, since a bare common-noun phrase (or a single NNP on its own) is
    probably not enough information to identify a data source. Possessives
    are deliberately not matched by the default grammar.

    Parameters:
        sent: str — raw sentence text; it is tokenized and POS-tagged here.
        chunkGram: str — NLTK ``RegexpParser`` grammar defining the
            ``Chunk`` rule; override to change what counts as a chunk.

    Returns:
        tuple[Tree, list[str]]: the full chunked parse tree and the list of
        chunk surface strings (leaf tokens joined by single spaces), or
        ``None`` if tagging/parsing raised (the error is printed instead).
    """
    try:
        # NOTE(review): pos_tag / word_tokenize / RegexpParser are assumed
        # to be imported from nltk elsewhere in this file — confirm.
        tagged = pos_tag(word_tokenize(sent))
        chunkParser = RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        # Join each 'Chunk' subtree's leaf tokens into one space-separated
        # string (equivalent to the old concat-then-strip loop).
        chunks = [
            " ".join(leaf[0] for leaf in subtree.leaves())
            for subtree in chunked.subtrees(filter=lambda t: t.label() == "Chunk")
        ]
        return chunked, chunks
    except Exception as e:
        # Best-effort contract: report the failure and implicitly return
        # None, matching the original behavior callers may depend on.
        print(str(e))
# (removed non-code scraping artifacts that trailed this file:
#  "评论列表" = "comment list", "文章目录" = "article table of contents")