def get_chunks(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}'):
    """Extract chunk phrases from each sentence using a regexp chunk grammar.

    Each sentence is word-tokenized, POS-tagged and parsed with an NLTK
    ``RegexpParser`` built from *grammar*. The parse is converted to
    ``(word, pos, iob)`` triples; every maximal run of consecutive tokens
    whose IOB tag is not ``'O'`` becomes one phrase. Phrase words are
    lower-cased, words present in the module-level ``stopword_list`` are
    dropped, and the remaining words are joined with single spaces.

    Note that directly adjacent chunks are merged into a single phrase,
    because runs are grouped only on "inside a chunk" vs. "outside".

    Args:
        sentences: Iterable of sentence strings.
        grammar: Chunk grammar handed to ``RegexpParser``.

    Returns:
        A list with one entry per input sentence; each entry is the list of
        phrase strings found in that sentence, in order of appearance.
    """
    tokenized_sents = [nltk.word_tokenize(sentence) for sentence in sentences]
    if not tokenized_sents:
        # Nothing to tag; avoid touching the tagger at all.
        return []

    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    # Tag all sentences in one batch call instead of once per sentence.
    tagged_sents = nltk.pos_tag_sents(tokenized_sents)

    all_chunks = []
    for tagged_sent in tagged_sents:
        # Flat list of (word, pos, iob_tag) triples for this sentence.
        wtc_sent = nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))

        valid_chunks = []
        # Index-based key: tuple-unpacking lambdas are a SyntaxError on
        # Python 3 (PEP 3113).
        for in_chunk, wtc_group in itertools.groupby(
                wtc_sent, key=lambda wtc: wtc[2] != 'O'):
            if not in_chunk:
                continue
            valid_chunks.append(
                ' '.join(word.lower()
                         for word, _pos, _iob in wtc_group
                         if word.lower() not in stopword_list))
        all_chunks.append(valid_chunks)
    return all_chunks
keyphrase_extraction.py 文件源码
python
阅读 19
收藏 0
点赞 0
评论 0
评论列表
文章目录