import nltk


def nltk_parse_clause(sentence):
    """
    Natural Language Toolkit: code_cascaded_chunker
    http://www.nltk.org/book/ch07.html#code-cascaded-chunker
    """
    grammar = r"""
    NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
    PP: {<IN><NP>}               # Chunk prepositions followed by NP
    VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
    CLAUSE: {<NP><VP>}           # Chunk an NP followed by a VP
    """
    cp = nltk.RegexpParser(grammar)
    return cp.parse(sentence)
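A minimal usage sketch, using the sample tagged sentence from the NLTK book chapter linked above:

sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
            ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
print(nltk_parse_clause(sentence))
# One pass over the cascade yields a tree like:
# (S (NP Mary/NN) saw/VBD
#    (CLAUSE (NP the/DT cat/NN) (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))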
import itertools
import string


def extract_chunks(text_string, max_words=3, lemmatize=False):
    # Optionally: adjectives and nouns followed by a preposition, then (always)
    # any number of adjectives followed by at least one noun
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
    # Build a chunker from the grammar regex
    chunker = nltk.RegexpParser(grammar)
    # POS-tag every word of every sentence
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                      for sent in nltk.sent_tokenize(text_string))
    # Chunk the sentences with the grammar; output is in IOB format
    all_chunks = list(itertools.chain.from_iterable(
        nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
        for tagged_sent in tagged_sents))
    # Join adjacent non-O tokens into candidate phrases, based on the IOB tags
    candidates = [' '.join(w[0] for w in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda l: l[2] != 'O')
                  if key]
    # Filter by maximum keyphrase length (the original hard-coded 3 here instead
    # of using the max_words parameter)
    candidates = [c for c in candidates if len(c.split()) <= max_words]
    # Filter phrases consisting of punctuation or stopwords
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    candidates = [c for c in candidates
                  if c not in stop_words and not all(ch in punct for ch in c)]
    # Lemmatize
    if lemmatize:
        lemmatize_word = nltk.stem.WordNetLemmatizer().lemmatize
        candidates = [lemmatize_word(x) for x in candidates]
    return candidates
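A hedged usage sketch with made-up input (requires the NLTK punkt, tagger, and stopwords data to be downloaded):

text = "Supervised keyphrase extraction uses part-of-speech patterns over noun phrases."
print(extract_chunks(text, max_words=3))
# Output depends on the tagger; expect short noun-phrase candidates
# such as 'keyphrase extraction'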
from nltk import pos_tag, word_tokenize, RegexpParser


def extract_chunks(sent, chunkGram=r"""Chunk: {<JJ|NN.*>*<NNP>+<JJ|NN.*|IN>*<NN.*>}"""):
    # At least one proper noun (NNP) should be included in the noun chunk: a single
    # NNP alone is probably not enough information to identify a data source.
    # It may actually be better if possessives aren't included.
    try:
        tagged = pos_tag(word_tokenize(sent))
        chunkParser = RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        chunks = []
        for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
            chunk = ' '.join(leaf[0] for leaf in subtree.leaves())
            chunks.append(chunk)
        return chunked, chunks
    except Exception as e:
        print(str(e))
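A hedged usage sketch with a made-up sentence (note the function returns None when it only prints an exception, in which case the unpacking below would fail):

tree, chunks = extract_chunks("The figures come from the World Bank Development Indicators.")
print(chunks)  # e.g. ['World Bank Development Indicators']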
def setup_extractor(self):
    self.splitter = PunktSentenceSplitter(self.language)
    grammar = self.grammars.get(self.language)
    if grammar:
        self.parser = RegexpParser(grammar)
    else:
        raise ValueError(
            "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % (
                self.language, self.grammars.keys())
        )
    # Normalize each lemma's match tokens to a lowercased set
    # (the original's iteritems() was Python 2; items() works in both)
    for lemma, match_tokens in self.lemma_to_token.items():
        self.lemma_to_token[lemma] = set(match.lower() for match in match_tokens)
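This is a method that reads state from its host class (PunktSentenceSplitter appears to be the project's own splitter, not an NLTK class). A hedged sketch of the minimum attributes it expects:

# Illustrative placeholders only; the host project's real grammars and
# lemma table are not shown in this snippet.
class ExtractorStub:
    def __init__(self):
        self.language = 'en'
        self.grammars = {'en': r'NP: {<DT>?<JJ>*<NN.*>+}'}
        self.lemma_to_token = {'play': ['Played', 'playing', 'plays']}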
# extract_ReVerb_patterns_PT.py, from project information-extraction-PT (author: davidsbatista)
def main():
    # ReVerb-style relation pattern over universal POS tags
    verb = "<ADV>*<AUX>*<VERB><PART>*<ADV>*"
    word = "<NOUN|ADJ|ADV|DET|ADP>"
    preposition = "<ADP|ADJ>"
    rel_pattern = "( %s (%s* (%s)+ )? )+ " % (verb, word, preposition)
    grammar_long = '''REL_PHRASE: {%s}''' % rel_pattern
    print(grammar_long)
    reverb_pattern = nltk.RegexpParser(grammar_long)
    # test_patterns(reverb_pattern)
    process_chave(reverb_pattern)
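process_chave() is project-specific and not shown here; a hedged sketch of applying the same pattern to a hand-tagged sentence (the tags below are an assumed universal-POS tagging, not real tagger output):

verb = "<ADV>*<AUX>*<VERB><PART>*<ADV>*"
word = "<NOUN|ADJ|ADV|DET|ADP>"
preposition = "<ADP|ADJ>"
reverb_pattern = nltk.RegexpParser('REL_PHRASE: {( %s (%s* (%s)+ )? )+ }'
                                   % (verb, word, preposition))
# Hypothetical tagging of "A empresa foi fundada por dois engenheiros"
tagged = [("A", "DET"), ("empresa", "NOUN"), ("foi", "AUX"), ("fundada", "VERB"),
          ("por", "ADP"), ("dois", "NUM"), ("engenheiros", "NOUN")]
print(reverb_pattern.parse(tagged))
# REL_PHRASE should cover "foi fundada por"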
def determine_entities(self):
    """ Determine noun entities within a patent claim.
    self.pos: list of (token, tag) tuples from the NLTK POS tagger. """
    # Define a grammar for chunking
    grammar = '''
        NP: {<DT|PRP\$> <VBG> <NN.*>+}
            {<DT|PRP\$> <NN.*> <POS> <JJ>* <NN.*>+}
            {<DT|PRP\$>? <JJ>* <NN.*>+ }
        '''
    cp = nltk.RegexpParser(grammar)
    # Alternatively, store the result as a claim-object property; another option is
    # to split the claim into features/clauses, run over those and re-correlate
    return cp.parse(self.pos)
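Since this is a method reading self.pos, here is a standalone hedged sketch of the same grammar on a claim-like sentence:

pos = nltk.pos_tag(nltk.word_tokenize("A device comprising a processing unit and a memory."))
print(nltk.RegexpParser('''
    NP: {<DT|PRP\$> <VBG> <NN.*>+}
        {<DT|PRP\$> <NN.*> <POS> <JJ>* <NN.*>+}
        {<DT|PRP\$>? <JJ>* <NN.*>+ }
    ''').parse(pos))
# Tagger-dependent; expect NP chunks such as "A device" and "a memory"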
import logging

logger = logging.getLogger(__name__)


def find_chunk(sent, chunk_rule=None):
    if not chunk_rule:
        chunk_rule = 'QWORD: <W.*><V.*><DT>*{<.*>*?<N.*>+}'
    logger.debug(chunk_rule)
    label = chunk_rule.split(':')[0].strip()
    cp = nltk.RegexpParser(chunk_rule)
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == label:
            # Return the first matching chunk as a plain string
            return ' '.join(a[0] for a in subtree)
    return None
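A hedged sketch: the default rule chunks the noun(s) that follow a wh-word, a verb, and an optional determiner, as in a question:

sent = nltk.pos_tag(nltk.word_tokenize("What is the capital of France"))
print(find_chunk(sent))  # e.g. 'capital'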
def find_chunk(sent, chunk_rule=None):
    if not chunk_rule:
        chunk_rule = 'HCHUNK: <W.*><.*>*?{<N.*>+}'
    label = chunk_rule.split(':')[0].strip()
    cp = nltk.RegexpParser(chunk_rule)
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == label:
            chunk = ' '.join(a[0] for a in subtree)
            print(chunk)
            return chunk
    return None
##this is required only once
from collections import OrderedDict

from nltk import RegexpParser
from nltk.tree import Tree


def get_parse_info(parsestr, stemmer, language, stoplist):
    hash_token_pos = OrderedDict()
    if language == 'german':
        grammar = r"""
            NBAR:
                {<N.*|ADJ.*>*<N.*>}  # Nouns and adjectives, terminated with nouns
            VP:
                {<V.*>}              # Verbs
            NP:
                {<NBAR>}
                {<NBAR><APPR><NBAR>} # NBARs connected with in/of/etc...
            """
    if language == 'english':
        # Taken from the Su Nam Kim paper...
        grammar = r"""
            NBAR:
                {<NN.*|JJ>*<NN.*>}   # Nouns and adjectives, terminated with nouns
            VP:
                {<V.*>}              # Verbs
            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}   # NBARs connected with in/of/etc...
            """
    chunker = RegexpParser(grammar)
    postoks = []
    # Height-2 subtrees of the parse are the (POS (word)) preterminals
    for i in Tree.fromstring(parsestr).subtrees():
        if i.height() == 2:
            word, pos = i[0], i.label()
            hash_token_pos[stemmer.stem(word)] = word + u"::" + pos
            postoks.append((word, pos))
    chunk_tree = chunker.parse(postoks)
    # get_terms() is defined elsewhere in the project
    phrases = get_terms(chunk_tree, stemmer, stoplist)
    phrase_list = [' '.join(term) for term in phrases if term]
    return hash_token_pos, phrase_list
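A runnable hedged sketch of the leaf walk above (the bracketed parse string is a made-up example of the format Tree.fromstring() expects):

from nltk.tree import Tree

t = Tree.fromstring("(S (NP (DT the) (NN chunk) (NN parser)) (VP (VBZ works)))")
print([(st[0], st.label()) for st in t.subtrees() if st.height() == 2])
# [('the', 'DT'), ('chunk', 'NN'), ('parser', 'NN'), ('works', 'VBZ')]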
def drawNamedEntityTree(self, text):
    tokenized_text = self.tokenizer.tokenize(text)
    tagged_text = self.tagWords(tokenized_text)
    grammar = "ENT: {<PESSOA>*}"
    cp = RegexpParser(grammar)
    res = cp.parse(tagged_text)
    res.draw()

# Tokenize sentences into words. Returns a list of the words that form the text.
def fetch_all_organizations(resume_text):
    organizations = set()
    tokenized_sentences = nltk.sent_tokenize(resume_text)
    # Custom grammar with NLTK
    # NP - Noun Phrase
    # NN - Noun
    # NNP - Proper Noun
    # V - Verb
    # JJ - Adjective
    # In a sentence tagged NN NNP V NN NN JJ NN, the noun phrases fetched are:
    #   NP: NN NNP
    #   NP: NN NN
    #   NP: NN
    # E.g. "Application Developer at Delta Force"
    #   => ["Application Developer", "Delta Force"]
    grammar = r"""NP: {<NN|NNP>+}"""
    parser = nltk.RegexpParser(grammar)
    avoid_organizations = utilities.get_avoid_organizations()
    for sentence in tokenized_sentences:
        # Tag all parts of speech in the tokenized sentence
        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))
        # Then chunk with the custom grammar;
        # np_chunks is an instance of nltk.tree.Tree
        np_chunks = parser.parse(tagged_words)
        noun_phrases = []
        for np_chunk in np_chunks:
            if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP':
                # For an 'NP' chunk, build a space-separated string of all
                # leaves under the 'NP' tree
                noun_phrase = ' '.join(org for org, tag in np_chunk.leaves())
                noun_phrases.append(noun_phrase)
        # Use the named-entity chunker to get all the organizations
        chunks = nltk.ne_chunk(tagged_words)
        for chunk in chunks:
            if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION':
                (organization, tag) = chunk[0]
                # If the recognized organization appears inside a noun phrase, there is a
                # high chance the noun phrase contains the full employer name: e.g. "Delta
                # Force" is added even if only "Delta" is recognized as an organization,
                # because "Delta Force" is a noun phrase
                for noun_phrase in noun_phrases:
                    if organization in noun_phrase and organization not in avoid_organizations:
                        organizations.add(noun_phrase.capitalize())
    return organizations
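A standalone hedged sketch of the NP-chunk and ne_chunk interplay relied on above (utilities.get_avoid_organizations() is project-specific, so it is left out):

tagged = nltk.pos_tag(nltk.word_tokenize("Application Developer at Delta Force"))
print(nltk.RegexpParser(r"NP: {<NN|NNP>+}").parse(tagged))
print(nltk.ne_chunk(tagged))  # may recognize (ORGANIZATION Delta/NNP Force/NNP)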
def label_nounphrases(self):
    """ Label noun phrases in the output from POS chunking. """
    grammar = '''
        NP: {<DT|PRP\$> <VBG> <NN.*>+}
            {<DT|PRP\$> <NN.*> <POS> <JJ>* <NN.*>+}
            {<DT|PRP\$>? <JJ>* <NN.*>+ }
        '''
    cp = nltk.RegexpParser(grammar)
    result = cp.parse(self.pos)
    ptree = nltk.tree.ParentedTree.convert(result)
    subtrees = ptree.subtrees(filter=lambda x: x.label() == 'NP')
    # Build up a mapping dict: if a phrase is not in the dict, add a new
    # entry with id+1; if it is, label it using the existing key
    mapping_dict = {}
    pos_to_np = {}
    for st in subtrees:
        np_string = " ".join(
            leaf[0] for leaf in st.leaves()
            # The original tested leaf[1] != ("DT" or "PRP$"), which only
            # excludes "DT"; this excludes both determiners and possessives
            if leaf[1] not in ("DT", "PRP$")
        )
        np_id = mapping_dict.get(np_string, None)
        if not np_id:
            # Reuse the id of an existing phrase this one ends with
            nps = [i[0] for i in mapping_dict.items()]
            ends_with_list = [np for np in nps if ends_with(np_string, np)]
            if ends_with_list:
                np_id = mapping_dict[ends_with_list[0]]
            else:
                np_id = len(mapping_dict) + 1
            mapping_dict[np_string] = np_id
        pos_to_np[st.parent_index()] = np_id
    # Label the tree with entities
    flat_list = []
    for i in range(0, len(ptree)):
        if isinstance(ptree[i], nltk.tree.Tree):
            for leaf in ptree[i].leaves():
                # Unpack the leaf and add the label as a triple
                flat_list.append((leaf[0], leaf[1], pos_to_np.get(i, "")))
        else:
            flat_list.append(
                (ptree[i][0], ptree[i][1], pos_to_np.get(i, ""))
            )
    return (flat_list, mapping_dict)
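ends_with() is a project helper not shown in this snippet; a hypothetical compatible definition:

def ends_with(phrase, suffix):
    # Hypothetical: True if `phrase` ends with the words of `suffix`
    words, tail = phrase.split(), suffix.split()
    return len(words) >= len(tail) and words[-len(tail):] == tail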
from nltk.corpus import stopwords


def noun_phrases_as_tokens(text):
    '''Generate a bag of lists of unnormalized tokens representing noun
    phrases from ``text``.

    This is built around python's nltk library for getting Noun
    Phrases (NPs). This is all documented in the NLTK Book
    http://www.nltk.org/book/ch03.html and blog posts that cite the
    book.

    :rtype: list of lists of strings
    '''
    ## from the NLTK Book:
    sentence_re = r'''(?x)          # set flag to allow verbose regexps
          ([A-Z])(\.[A-Z])+\.?      # abbreviations, e.g. U.S.A.
        | \w+(-\w+)*                # words with optional internal hyphens
        | \$?\d+(\.\d+)?%?          # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                    # ellipsis
        | [][.,;"'?():-_`]          # these are separate tokens
    '''
    ## From the Su Nam Kim paper:
    ## http://www.comp.nus.edu.sg/~kanmy/papers/10.1007_s10579-012-9210-3.pdf
    grammar = r'''
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and adjectives, terminated with nouns
        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # NBARs connected with in/of/etc...
    '''
    if len(text.strip()) == 0:
        return []
    chunker = nltk.RegexpParser(grammar)
    toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    tree = chunker.parse(postoks)
    stops = stopwords.words('english')
    stops += dossier_stopwords()  # project-specific extra stopwords

    ## These next functions are standard uses of NLTK illustrated by
    ## http://alexbowe.com/au-naturale/
    ## https://gist.github.com/alexbowe/879414
    def leaves(tree):
        '''Find NP (noun phrase) leaf nodes of a chunk tree.'''
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            yield subtree.leaves()

    def acceptable_word(word):
        '''Check conditions for an acceptable word: length, stopword.'''
        return 2 <= len(word) <= 40 and word.lower() not in stops

    def get_terms(tree):
        for leaf in leaves(tree):
            yield [w for w, t in leaf if acceptable_word(w)]

    return list(get_terms(tree))
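A hedged usage sketch; dossier_stopwords() comes from the host project, so stub it out to try the function standalone:

dossier_stopwords = lambda: []  # stand-in for the project's extra stopword list
print(noun_phrases_as_tokens("The quick brown fox jumped over the lazy dog."))
# Tagger-dependent, e.g. [['quick', 'brown', 'fox'], ['lazy', 'dog']]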