def extract_candidate_chunks(sents, grammar=GRAMMAR, tagged=False, **kwargs):
    """
    Extract candidate key phrases from sentences using a chunk grammar.

    Parameters
    ----------
    sents : iterable
        Sentences; raw strings unless ``tagged=True``, in which case each
        element is already a list of ``(word, pos)`` tuples.
    grammar : str
        Chunk grammar handed to ``RegexpParser``.
    tagged : bool
        Pass ``True`` when the sentences are already tokenized and POS-tagged.
    **kwargs
        Forwarded verbatim to ``Normalizer``.

    Yields
    ------
    str
        Lower-cased candidate phrases that survive stopword/punctuation
        normalization.
    """
    normalizer = Normalizer(**kwargs)
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag raw sentences on the fly.
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))
        # Nothing to parse for an empty sentence.
        if not sent:
            continue

        # Parse with the chunker, then flatten the tree into
        # (word, pos, IOB-chunk) triples.
        iob_triples = tree2conlltags(chunker.parse(sent))

        # Group consecutive in-chunk tokens ('O' marks outside-chunk) and
        # join each run into a single lower-cased phrase.
        # NOTE: Python 3 removed tuple-parameter lambdas (PEP 3113), so we
        # index the triple instead of unpacking it in the lambda signature.
        candidates = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(iob_triples, lambda triple: triple[2] != 'O')
            if key
        ]

        # Yield candidates that are not filtered out by stopwords/punctuation.
        for candidate in normalizer.normalize(candidates):
            yield candidate
# (removed stray page-navigation text — "comment list" / "article table of
#  contents" — captured during web scraping; it was a Python syntax error)