def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
''' This function will extract text of a specific POS sequence rather than just Noun Phrase '''
import itertools, nltk, string
# exclude candidates that are stop words or entirely punctuation
punct = set(string.punctuation)
stop_words = set(nltk.corpus.stopwords.words('english'))
# tokenize, POS-tag, and chunk using regular expressions
chunker = nltk.chunk.regexp.RegexpParser(grammar)
tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
for tagged_sent in tagged_sents))
# join constituent chunk words into a single chunked phrase
candidates = [' '.join(word for word, pos, chunk in group)
for key, group in itertools.groupby(all_chunks, lambda (word,pos,chunk): chunk != 'O') if key]
return [cand for cand in candidates
if cand not in stop_words and not all(char in punct for char in cand)]
key_extractor.py 文件源码
python
阅读 28
收藏 0
点赞 0
评论 0
评论列表
文章目录