keyphrase_extraction.py 文件源码-python代码片段

keyphrase_extraction.py 文件源码

python

阅读 29 收藏 0 点赞 0 评论 0

项目：text-analytics-with-python 作者: dipanjanS 项目源码文件源码

def get_chunks(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}'):
    """Extract lower-cased, stop-word-filtered chunk phrases per sentence.

    Each sentence is tokenized, POS-tagged, parsed with an NLTK
    ``RegexpParser`` built from ``grammar`` and converted to
    ``(word, pos, chunk_tag)`` triples. Consecutive tokens whose chunk tag
    is not ``'O'`` are joined into one space-separated phrase.

    Args:
        sentences: Iterable of sentence strings.
        grammar: Chunk grammar handed to ``RegexpParser``. The default
            matches an optional determiner, any number of adjectives and
            one or more nouns.

    Returns:
        A list with one entry per input sentence; each entry is the list
        of phrases found in that sentence, in order of appearance.

    Notes:
        * Grouping only distinguishes "inside a chunk" from "outside a
          chunk", so two chunks that directly follow each other without an
          ``'O'`` token in between end up in the same phrase.
        * Words found in the module-level ``stopword_list`` are dropped
          after lower-casing. A chunk consisting solely of such words
          yields an empty string.
    """

    def in_chunk(wtc):
        # wtc is a (word, pos, chunk_tag) triple. Index access replaces the
        # Python-2-only tuple-parameter lambda, which is a SyntaxError on
        # Python 3.
        return wtc[2] != 'O'

    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    all_chunks = []

    for sentence in sentences:
        tagged_sents = nltk.pos_tag_sents([nltk.word_tokenize(sentence)])

        # Flatten the parsed tree(s) into one stream of
        # (word, pos, chunk_tag) triples.
        wtc_tokens = itertools.chain.from_iterable(
            nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in tagged_sents
        )

        valid_chunks = []
        for is_chunk, wtc_group in itertools.groupby(wtc_tokens, in_chunk):
            if not is_chunk:
                # Run of tokens outside any chunk: nothing to emit.
                continue
            valid_chunks.append(
                ' '.join(
                    word.lower()
                    for word, _pos, _chunk_tag in wtc_group
                    if word.lower() not in stopword_list
                )
            )

        all_chunks.append(valid_chunks)

    return all_chunks