def extract_candidate_phrases(sents, grammar=GRAMMAR, tagged=False):
    """Yield lower-cased candidate phrases chunked out of each sentence.

    Args:
        sents: Iterable of sentences. When ``tagged`` is false each item is
            passed to ``nltk.word_tokenize`` and then ``nltk.pos_tag``;
            otherwise each item is handed to ``normalize`` unchanged.
        grammar: Chunk grammar given to ``RegexpParser``.
        tagged: Set to true when the sentences are already tokenized and
            tagged, so the tokenize/tag step is skipped.

    Yields:
        One string per run of consecutive terms whose chunk tag is not
        ``'O'``: the words of the run joined with single spaces and
        lower-cased. Because the grouping only tests for ``'O'``, terms
        from back-to-back chunks end up in the same phrase.
    """

    def _inside_chunk(term):
        # The last field of each term is its chunk tag; 'O' marks a term
        # that lies outside every chunk.
        return term[-1] != 'O'

    # One parser built from the grammar is reused for every sentence.
    parser = RegexpParser(grammar)

    for sentence in sents:
        # Tokenize and tag only when the caller has not already done so.
        if tagged:
            tokens = sentence
        else:
            tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

        # NOTE(review): normalize is defined elsewhere; presumably it cleans
        # or filters the tagged tokens -- confirm. A falsy result means
        # there is nothing to parse for this sentence.
        tokens = normalize(tokens)
        if not tokens:
            continue

        # Parse the sentence and flatten the parse tree into a sequence of
        # (word, pos, chunk) triples.
        triples = tree2conlltags(parser.parse(tokens))

        # Collect every phrase of the sentence before yielding any of them.
        found = []
        for is_chunk, run in groupby(triples, _inside_chunk):
            if not is_chunk:
                continue
            words = [word for word, _pos, _chunk in run]
            found.append(" ".join(words).lower())

        for candidate in found:
            yield candidate
评论列表
文章目录