def extract_candidate_phrases(sents, grammar=GRAMMAR, tagged=False):
    # Create the chunker that uses our grammar
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.word_tokenize(sent))

        # Parse the sentence, converting the parse tree into a tagged sequence
        sent = normalize(sent)
        if not sent: continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract phrases and rejoin them with space
        phrases = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            ) if key
        ]

        for phrase in phrases:
            yield phrase
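# Hedged usage sketch for extract_candidate_phrases(). GRAMMAR and normalize()
# are defined elsewhere in the original module (before the def above); the
# values below are illustrative stand-ins, not the project's actual definitions.
import nltk
from itertools import groupby
from nltk import RegexpParser
from nltk.chunk import tree2conlltags

GRAMMAR = r'KT: {<JJ>* <NN.*>+}'   # assumed simple keyphrase grammar

def normalize(tagged_sent):
    # Stand-in: the real helper presumably cleans or filters the tagged tokens.
    return tagged_sent

sents = ["Natural language processing enables automatic keyphrase extraction."]
print(list(extract_candidate_phrases(sents, grammar=GRAMMAR)))
# e.g. ['natural language processing', 'automatic keyphrase extraction']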
def measure_pattern_time_v2(iteration_number, size, pattern):
    gw = execnet.makegateway("popen//python=python2.7")
    channel = gw.remote_exec("""
        from nltk.corpus import brown
        words = brown.words()[:%s]
        text = ' '.join(words)

        from pattern.en import parsetree
        text_tree = parsetree(text,
            tokenize=True,     # Split punctuation marks from words?
            tags=True,         # Parse part-of-speech tags? (NN, JJ, ...)
            chunks=False,      # Parse chunks? (NP, VP, PNP, ...)
            relations=False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
            lemmata=False,     # Parse lemmata? (ate => eat)
            encoding='utf-8',  # Input string encoding.
            tagset=None)       # Penn Treebank II (default) or UNIVERSAL.

        from pattern.search import search

        def measure_pattern_search():
            global pattern_search_result  # Make measure_pattern_search able to modify the value
            pattern_search_result = search("%s", text_tree)

        from timeit import Timer
        pattern_search_time = Timer(measure_pattern_search)

        def pattern_search_timeit():
            runtimes = [pattern_search_time.timeit(number=1) for i in range(0, %s)]
            average = sum(runtimes) / len(runtimes)
            # return ''.join(['timeit: #runs=', str(%s), ' ; average=', str(average), ' ; min=', str(min(runtimes))])
            return [runtimes, average, min(runtimes)]

        channel.send(pattern_search_timeit())
    """ % (size, pattern, iteration_number, iteration_number))
    channel.send([])
    return channel.receive()
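# Hedged usage sketch: requires `execnet` locally plus a Python 2.7 interpreter
# with `pattern` and the NLTK Brown corpus installed, since the timed search
# runs in that subprocess. "JJ NN" is only an example pattern.search() query.
import execnet

runtimes, average, fastest = measure_pattern_time_v2(5, 1000, "JJ NN")
print("runs=%d average=%.4fs min=%.4fs" % (len(runtimes), average, fastest))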
def pyrata2conll(dictList, **kwargs):
    """
    See 3.1 Reading IOB Format and the CoNLL 2000 Corpus http://www.nltk.org/book/ch07.html
    The output can be used with
    nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()
    """
    if 'raw' in kwargs:
        rawFeatureName = kwargs['raw']
    if 'pos' in kwargs:
        posFeatureName = kwargs['pos']
    if 'chunk' in kwargs:
        chunkFeatureName = kwargs['chunk']

    # Build one CoNLL line per token (str has no append(), so concatenate)
    text = ''
    for e in dictList:
        text += ' '.join([e[rawFeatureName], e[posFeatureName], e[chunkFeatureName]]) + '\n'
    return text
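# Illustrative usage: dictList is assumed to be a list of token dicts (as
# produced by pyrata-style pipelines); the keyword arguments name which keys
# hold the word form, the POS tag and the chunk tag.
tokens = [
    {'raw': 'The',    'pos': 'DT',  'chunk': 'B-NP'},
    {'raw': 'cat',    'pos': 'NN',  'chunk': 'I-NP'},
    {'raw': 'sleeps', 'pos': 'VBZ', 'chunk': 'O'},
]
print(pyrata2conll(tokens, raw='raw', pos='pos', chunk='chunk'))
# The DT B-NP
# cat NN I-NP
# sleeps VBZ O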
# extend a given dictList
# merge dictList
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Run all the tests
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
def extract_candidate_chunks(sents, grammar=GRAMMAR, tagged=False, **kwargs):
    """
    Extracts key chunks based on a grammar for a list of tokenized sentences.
    If the sentences are already tokenized and tagged, pass in: tagged=True
    """
    normalizer = Normalizer(**kwargs)
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Parse with the chunker if we have a tagged sentence
        if not sent: continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract candidate phrases from our parsed chunks
        # (tuple-unpacking lambdas are Python 2 only, so index the chunk tag)
        chunks = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            ) if key
        ]

        # Yield candidates that are not filtered by stopwords and punctuation.
        for chunk in normalizer.normalize(chunks):
            yield chunk
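# Hedged sketch: Normalizer and GRAMMAR belong to the surrounding project, so
# this stand-in only drops candidates made entirely of stopwords or
# punctuation, which is an assumption about what the real class filters.
import string
from nltk.corpus import stopwords

class Normalizer(object):
    def __init__(self, language='english', **kwargs):
        self.stopwords = set(stopwords.words(language)) | set(string.punctuation)

    def normalize(self, chunks):
        return [c for c in chunks
                if not all(tok in self.stopwords for tok in c.split())]

docs = ["Candidate keyphrase extraction relies on part-of-speech patterns."]
for chunk in extract_candidate_chunks(docs, grammar=r'KT: {<JJ>* <NN.*>+}'):
    print(chunk)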