def extract_candidate_chunks(sents, grammar=GRAMMAR, tagged=False, **kwargs):
    """
    Extract candidate key phrases from sentences using a chunk grammar.

    Parameters
    ----------
    sents : iterable
        Sentences; raw strings unless ``tagged=True``, in which case each
        element is already a list of ``(word, pos)`` tuples.
    grammar : str
        Chunk grammar handed to ``RegexpParser``.
    tagged : bool
        Pass ``True`` when the sentences are already tokenized and POS-tagged.
    **kwargs
        Forwarded verbatim to ``Normalizer``.

    Yields
    ------
    str
        Lower-cased candidate phrases that survive stopword/punctuation
        normalization.
    """
    normalizer = Normalizer(**kwargs)
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag raw sentences on the fly.
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))
        # Nothing to parse for an empty sentence.
        if not sent:
            continue

        # Parse with the chunker, then flatten the tree into
        # (word, pos, IOB-chunk) triples.
        iob_triples = tree2conlltags(chunker.parse(sent))

        # Group consecutive in-chunk tokens ('O' marks outside-chunk) and
        # join each run into a single lower-cased phrase.
        # NOTE: Python 3 removed tuple-parameter lambdas (PEP 3113), so we
        # index the triple instead of unpacking it in the lambda signature.
        candidates = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(iob_triples, lambda triple: triple[2] != 'O')
            if key
        ]

        # Yield candidates that are not filtered out by stopwords/punctuation.
        for candidate in normalizer.normalize(candidates):
            yield candidate
# (removed stray page-navigation text — "comment list" / "article table of
#  contents" — captured during web scraping; it was a Python syntax error)