import itertools
import string

import nltk


def extract_chunks(text_string, max_words=3, lemmatize=False):
    # Candidate keyphrase grammar: any number of adjectives followed by one or
    # more nouns, optionally preceded by another adjective/noun group plus a
    # preposition (matches e.g. "natural language processing" or "extraction of keyphrases")
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
    # Build a chunker from the grammar regex
    chunker = nltk.RegexpParser(grammar)
    # Split the text into sentences, tokenize each sentence into words,
    # and POS-tag every word
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text_string))
    # Chunk each tagged sentence with the grammar and flatten the output into
    # a single list of (word, POS tag, IOB tag) triples
    all_chunks = list(itertools.chain.from_iterable(
        nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
        for tagged_sent in tagged_sents))
    # Join consecutive words that fall inside a chunk (IOB tag != 'O') into phrases
    candidates = [' '.join(w[0] for w in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda l: l[2] != 'O')
                  if key]
    # Filter by maximum keyphrase length
    candidates = list(filter(lambda l: len(l.split()) <= max_words, candidates))
    # Filter out phrases that are stopwords or consist entirely of punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    candidates = list(filter(lambda l: l not in stop_words
                             and not all(c in punct for c in l), candidates))
    # Optionally lemmatize every word of each candidate phrase
    if lemmatize:
        lemmatize_word = nltk.stem.WordNetLemmatizer().lemmatize
        candidates = [' '.join(lemmatize_word(w) for w in x.split()) for x in candidates]
    return candidates
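
# A minimal usage sketch (not from the original post): the sample text and the
# output shown below are illustrative assumptions. It assumes the NLTK data
# packages 'punkt', 'averaged_perceptron_tagger', 'stopwords', and 'wordnet'
# have already been fetched with nltk.download().
if __name__ == '__main__':
    text = ("Automatic keyphrase extraction selects the most important "
            "noun phrases from a document.")
    print(extract_chunks(text, max_words=3, lemmatize=True))
    # Typically prints something like:
    # ['automatic keyphrase extraction', 'important noun phrase', 'document']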