def noun_phrases_as_tokens(text):
    '''Generate a bag of lists of unnormalized tokens representing noun
    phrases from ``text``.

    This is built around python's nltk library for getting Noun
    Phrases (NPs). This is all documented in the NLTK Book
    http://www.nltk.org/book/ch03.html and blog posts that cite the
    book.

    :param text: free text to scan for noun phrases
    :rtype: list of lists of strings
    '''
    ## Guard the empty / whitespace-only case before doing any NLP work.
    if not text.strip():
        return []

    ## Tokenizer pattern from the NLTK Book.  NOTE: every group is
    ## non-capturing ``(?:...)`` because nltk.regexp_tokenize is backed
    ## by re.findall, and with capturing groups findall returns the
    ## group contents (often empty strings) instead of the full match.
    sentence_re = r'''(?x)          # set flag to allow verbose regexps
          [A-Z](?:\.[A-Z])+\.?     # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*             # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?       # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                   # ellipsis
        | [][.,;"'?():_`-]         # these are separate tokens; "-" is last so
                                   # it is literal -- the earlier ":-_" was an
                                   # accidental range covering ";<=>?@A-Z[\]^"
    '''
    ## From Su Nam Kim paper:
    ## http://www.comp.nus.edu.sg/~kanmy/papers/10.1007_s10579-012-9210-3.pdf
    grammar = r'''
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    '''
    chunker = nltk.RegexpParser(grammar)
    toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    tree = chunker.parse(postoks)

    ## Words filtered out of every phrase: NLTK's English stopword list
    ## plus the project-specific dossier stopwords.
    stops = stopwords.words('english')
    stops += dossier_stopwords()

    ## These next three functions are standard uses of NLTK illustrated by
    ## http://alexbowe.com/au-naturale/
    ## https://gist.github.com/alexbowe/879414
    def leaves(tree):
        '''Finds NP (nounphrase) leaf nodes of a chunk tree.'''
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            yield subtree.leaves()

    def acceptable_word(word):
        '''Checks conditions for acceptable word: length, stopword.'''
        return 2 <= len(word) <= 40 and word.lower() not in stops

    def get_terms(tree):
        ## Each NP chunk becomes one list of its surviving token strings;
        ## the POS tag half of each (word, tag) leaf pair is discarded.
        for leaf in leaves(tree):
            yield [w for w, t in leaf if acceptable_word(w)]

    return list(get_terms(tree))
## (removed stray scraped-page footer text -- "评论列表" / "文章目录" -- that
## was not valid Python)