import itertools
import string

import nltk


def extract_chunks(text_string, max_words=3, lemmatize=False):
    # Candidate keyphrase grammar: any number of adjectives followed by one or
    # more nouns, optionally preceded by another adjective/noun group plus a
    # preposition (matches e.g. "natural language processing" or "extraction of keyphrases")
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
    # Build a chunker from the grammar regex
    chunker = nltk.RegexpParser(grammar)
    # Split the text into sentences, tokenize each sentence into words,
    # and POS-tag every word
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text_string))
    # Chunk each tagged sentence with the grammar and flatten the output into
    # a single list of (word, POS tag, IOB tag) triples
    all_chunks = list(itertools.chain.from_iterable(
        nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
        for tagged_sent in tagged_sents))
    # Join consecutive words that fall inside a chunk (IOB tag != 'O') into phrases
    candidates = [' '.join(w[0] for w in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda l: l[2] != 'O')
                  if key]
    # Filter by maximum keyphrase length
    candidates = list(filter(lambda l: len(l.split()) <= max_words, candidates))
    # Filter out phrases that are stopwords or consist entirely of punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    candidates = list(filter(lambda l: l not in stop_words
                             and not all(c in punct for c in l), candidates))
    # Optionally lemmatize every word of each candidate phrase
    if lemmatize:
        lemmatize_word = nltk.stem.WordNetLemmatizer().lemmatize
        candidates = [' '.join(lemmatize_word(w) for w in x.split()) for x in candidates]
    return candidates
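
# A minimal usage sketch (not from the original post): the sample text and the
# output shown below are illustrative assumptions. It assumes the NLTK data
# packages 'punkt', 'averaged_perceptron_tagger', 'stopwords', and 'wordnet'
# have already been fetched with nltk.download().
if __name__ == '__main__':
    text = ("Automatic keyphrase extraction selects the most important "
            "noun phrases from a document.")
    print(extract_chunks(text, max_words=3, lemmatize=True))
    # Typically prints something like:
    # ['automatic keyphrase extraction', 'important noun phrase', 'document']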