def __init__(self, chunk_struct, debug_level=1):
    """
    Build a ``ChunkString`` that encodes the chunking of
    ``chunk_struct``.

    A slice of the top-level children of ``chunk_struct`` is stored
    as the pieces, and the tag of every piece (see ``_tag``) is
    written into an encoded string of the form ``<tag1><tag2>...``.

    :type chunk_struct: Tree
    :param chunk_struct: The chunk structure to be further chunked.
    :type debug_level: int
    :param debug_level: The level of debugging which should be
        applied to transformations on the ``ChunkString``.  The
        valid levels are:
            - 0: no checks
            - 1: full check on to_chunkstruct
            - 2: full check on to_chunkstruct and cursory check after
               each transformation.
            - 3: full check on to_chunkstruct and full check after
               each transformation.
        We recommend you use at least level 1.  You should
        probably use level 3 if you use any non-standard
        subclasses of ``RegexpChunkRule``.
    """
    self._root_label = chunk_struct.label()
    pieces = chunk_struct[:]
    self._pieces = pieces
    # One ``<tag>`` group per top-level piece, in order.
    encoded_tags = '><'.join([self._tag(piece) for piece in pieces])
    self._str = '<' + encoded_tags + '>'
    self._debug = debug_level
python类Tree()的实例源码
def parse(self, chunk_struct, trace=None):
    """
    Run every stage of this chunk parser over the input.

    The parsers in ``self._stages`` are applied in order, each one
    receiving the output of the previous one, and the whole sequence
    of stages is repeated ``self._loop`` times.

    :type chunk_struct: Tree
    :param chunk_struct: the chunk structure to be (further) chunked
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    :return: the chunked output (the result of the last stage).
    :rtype: Tree
    """
    # Fall back to the instance-level trace setting when none is given.
    level = self._trace if trace is None else trace
    result = chunk_struct
    for _ in range(self._loop):
        for stage in self._stages:
            result = stage.parse(result, trace=level)
    return result
regexp.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 29
收藏 0
点赞 0
评论 0
def parse(self, chunk_struct, trace=None):
    """
    Run every stage of this chunk parser over the input.

    The parsers in ``self._stages`` are applied in order, each one
    receiving the output of the previous one, and the whole sequence
    of stages is repeated ``self._loop`` times.

    :type chunk_struct: Tree
    :param chunk_struct: the chunk structure to be (further) chunked
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    :return: the chunked output (the result of the last stage).
    :rtype: Tree
    """
    # Fall back to the instance-level trace setting when none is given.
    level = self._trace if trace is None else trace
    result = chunk_struct
    for _ in range(self._loop):
        for stage in self._stages:
            result = stage.parse(result, trace=level)
    return result
def parse(self, chunk_struct, trace=None):
    """
    Run every stage of this chunk parser over the input.

    The parsers in ``self._stages`` are applied in order, each one
    receiving the output of the previous one, and the whole sequence
    of stages is repeated ``self._loop`` times.

    :type chunk_struct: Tree
    :param chunk_struct: the chunk structure to be (further) chunked
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    :return: the chunked output (the result of the last stage).
    :rtype: Tree
    """
    # Fall back to the instance-level trace setting when none is given.
    level = self._trace if trace is None else trace
    result = chunk_struct
    for _ in range(self._loop):
        for stage in self._stages:
            result = stage.parse(result, trace=level)
    return result
def to_nltk_tree_general(node, attr_list=("dep_", "pos_"), level=99999):
    """Transforms a SpaCy dependency tree into an NLTK tree, with selected SpaCy node attributes
    joined by "/" to form each NLTK node label.

    Args:
        node: The starting node from the tree in which the transformation will occur.
        attr_list: Which attributes from the SpaCy nodes will be included in the NLTK node label.
        level: The maximum depth of the tree.

    Returns:
        A NLTK Tree (nltk.tree) when the node has children and the depth limit is not exhausted;
        otherwise just the label string of the node.
    """
    label = "/".join([getattr(node, attr_name) for attr_name in attr_list])

    # Depth budget exhausted, or a leaf: the node is represented by its label alone.
    if level == 0 or node.n_lefts + node.n_rights <= 0:
        return label

    subtrees = [to_nltk_tree_general(child, attr_list, level - 1) for child in node.children]
    return Tree(label, subtrees)
def get_node_representation(tetre_format, token):
    """Given a format and a SpaCy node (spacy.token), returns this node's representation as an NLTK tree (nltk.tree).

    The root of the returned tree is labelled with the token's ``pos_``. Each direct child is rendered through
    ``to_nltk_tree_general`` with ``level=0``, i.e. as a label only.

    Args:
        tetre_format: Comma-separated names of the node attributes that make up each child's label.
        token: The SpaCy node itself (spacy.token).

    Returns:
        A NLTK Tree (nltk.tree)
    """
    attribute_names = tetre_format.split(",")
    root_label = token.pos_

    if token.n_lefts + token.n_rights > 0:
        child_labels = [to_nltk_tree_general(child, attr_list=attribute_names, level=0)
                        for child in token.children]
    else:
        child_labels = []

    return Tree(root_label, child_labels)
def nltk_tree_to_qtree(tree):
    """Transforms a NLTK Tree into a QTREE, a bracketed string representation of a tree.

    For details, please see: http://www.ling.upenn.edu/advice/latex/qtree/qtreenotes.pdf

    Args:
        tree: The NLTK Tree (nltk.tree), or a leaf value which is rendered via ``str``.

    Returns:
        A string with the QTREE representation of the NLTK Tree (nltk.tree).
    """
    if isinstance(tree, Tree):
        inner = " " + tree.label() + " "
        if len(tree) > 0:
            # Children are emitted in sorted order, not in their original order.
            inner += " ".join(nltk_tree_to_qtree(child) for child in sorted(tree))
    else:
        inner = " " + str(tree) + " "

    return " [ " + inner + " ] "
def rightBinarize(tr):
    """Binarize ``tr`` in place, nesting the extra children to the right.

    The first child is kept; all remaining children are folded into
    nested ``X`` nodes, so ``(c0 c1 c2 c3)`` becomes
    ``(c0 (X c1 (X c2 c3)))``.

    Nodes with fewer than three children are already binary (or smaller)
    and are left untouched.  Without this guard a single-child node would
    get its only child appended a second time, and an empty node would
    raise ``IndexError``.

    Args:
        tr: the tree node (list-like) to binarize; modified in place.
    """
    if len(tr) < 3:
        return

    children = list(tr)
    folded = children[-1]
    # Wrap from the second-to-last child back to the second child.
    for child in reversed(children[1:-1]):
        # NOTE(review): relies on nltk.Tree accepting a bracketed string
        # here, as the original code did -- confirm against the NLTK version in use.
        wrapper = nltk.Tree("(X)")
        wrapper.append(child)
        wrapper.append(folded)
        folded = wrapper

    # Keep only the first child, then attach the folded remainder.
    while len(tr) > 1:
        tr.pop()
    tr.append(folded)
def leftBinarize(tr):
    """Binarize ``tr`` in place, nesting the extra children to the left.

    The last child is kept; all preceding children are folded into
    nested ``X`` nodes, so ``(c0 c1 c2 c3)`` becomes
    ``((X (X c0 c1) c2) c3)``.

    Nodes with fewer than three children are already binary (or smaller)
    and are left untouched.  Without this guard a single-child node would
    get its only child inserted a second time, and an empty node would
    raise ``IndexError``.

    Args:
        tr: the tree node (list-like) to binarize; modified in place.
    """
    if len(tr) < 3:
        return

    children = list(tr)
    folded = children[0]
    # Wrap from the second child up to the second-to-last child.
    for child in children[1:-1]:
        # NOTE(review): relies on nltk.Tree accepting a bracketed string
        # here, as the original code did -- confirm against the NLTK version in use.
        wrapper = nltk.Tree("(X)")
        wrapper.append(folded)
        wrapper.append(child)
        folded = wrapper

    # Keep only the last child, then put the folded remainder in front of it.
    while len(tr) > 1:
        tr.pop(0)
    tr.insert(0, folded)
def vvBinarize(tr):
    """Binarize ``tr`` in place around its verb child.

    The index of the last child whose ``node`` label is in ``vvTags`` is
    located.  That child and everything after it are moved into a new
    ``X`` node, which is left-binarized; the ``X`` node replaces those
    children in ``tr``, and ``tr`` is then right-binarized.

    If no child has a label in ``vvTags``, a message is written to
    stderr, ``output(tr)`` is called, and ``tr`` is left unchanged.

    Args:
        tr: the tree node to binarize; every child must expose ``.node``.
    """
    children = list(tr)

    verb_at = None
    for position, child in enumerate(children):
        if child.node in vvTags:
            verb_at = position

    if verb_at is None:
        print >> sys.stderr, "no vv in the children!!!",
        output(tr)
        return

    # Group the verb and everything to its right under one new node.
    verb_group = nltk.Tree("(X)")
    for child in children[verb_at:]:
        verb_group.append(child)
    leftBinarize(verb_group)

    # Replace the grouped children with the new node.
    while len(tr) > verb_at:
        tr.pop()
    tr.append(verb_group)
    rightBinarize(tr)
def parse(self, chunk_struct, trace=None):
    """
    Run every stage of this chunk parser over the input.

    The parsers in ``self._stages`` are applied in order, each one
    receiving the output of the previous one, and the whole sequence
    of stages is repeated ``self._loop`` times.

    :type chunk_struct: Tree
    :param chunk_struct: the chunk structure to be (further) chunked
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    :return: the chunked output (the result of the last stage).
    :rtype: Tree
    """
    # Fall back to the instance-level trace setting when none is given.
    level = self._trace if trace is None else trace
    result = chunk_struct
    for _ in range(self._loop):
        for stage in self._stages:
            result = stage.parse(result, trace=level)
    return result
def match_rules_context(tree, rules, parent_context=None):
    """Recursively matches a Tree structure with rules and returns context

    Templates are tried in the iteration order of ``rules``.  For the
    first template that matches, every child rule set is matched against
    the sub-structure captured under its key; the function returns as
    soon as that template has been processed (successfully or not).

    Args:
        tree (Tree): Parsed tree structure
        rules (dict): See match_rules
        parent_context (dict): Context of parent call.  It is copied,
            never modified.  Defaults to an empty context.

    Returns:
        dict: Context matched dictionary of matched rules or
        None if no match
    """
    # A fresh dict per call replaces the former shared mutable default.
    base_context = {} if parent_context is None else parent_context

    for template, match_rules in rules.items():
        context = base_context.copy()
        if not match_template(tree, template, context):
            continue

        for key, child_rules in match_rules.items():
            child_context = match_rules_context(context[key], child_rules, context)
            if not child_context:
                # A required child failed to match: the whole match fails.
                return None
            context.update(child_context)
        return context

    return None
def match_rules_context_multi(tree, rules, parent_context=None):
    """Recursively matches a Tree structure with rules and returns all matching contexts

    Unlike a first-match search, every template in ``rules`` is tried.
    A matching template without child rules contributes its context
    directly; otherwise the lists of contexts produced for each child
    rule set are combined through ``cross_context``.

    Args:
        tree (Tree): Parsed tree structure
        rules (dict): See match_rules
        parent_context (dict): Context of parent call.  It is copied,
            never modified.  Defaults to an empty context.

    Returns:
        list: The contexts collected from all matching templates; empty
        if nothing matched
    """
    # A fresh dict per call replaces the former shared mutable default.
    base_context = {} if parent_context is None else parent_context

    all_contexts = []
    for template, match_rules in rules.items():
        context = base_context.copy()
        if not match_template(tree, template, context):
            continue

        if not match_rules:
            all_contexts.append(context)
            continue

        # One list of candidate contexts per child rule set.
        child_contextss = [
            match_rules_context_multi(context[key], child_rules, context)
            for key, child_rules in match_rules.items()
        ]
        all_contexts += cross_context(child_contextss)

    return all_contexts
def match_template(tree, template, args=None):
    """Check if match string matches Tree structure

    The template is split on whitespace and tokenized with
    ``get_tokens`` before being compared through ``match_tokens``.

    Args:
        tree (Tree): Parsed Tree structure of a sentence
        template (str): String template to match. Example: "( S ( NP ) )"
        args (dict): Optional dictionary that receives the values
            collected during a successful match; untouched on failure.

    Returns:
        bool: If they match or not
    """
    tokens = get_tokens(template.split())
    captured = {}

    if not match_tokens(tree, tokens, captured):
        return False

    # Only publish the captured values once the whole template matched.
    if args is not None:
        for name, value in captured.items():
            args[name] = value
    logger.debug('MATCHED: {0}'.format(template))
    return True
def get_object(tree):
    """Get the object in the tree object.

    Subtrees labelled ``DT`` or ``POS`` are dropped, which is meant to
    remove unnecessary letters and words such as::

        the
        a/an
        's

    Args:
        tree (Tree): Parsed tree structure, or a leaf value

    Returns:
        Resulting string of tree ``(Ex: "red car")``; a non-Tree input
        is returned unchanged
    """
    if not isinstance(tree, Tree):
        return tree

    if tree.label() in ('DT', 'POS'):
        return ''

    fragments = [get_object(child) for child in tree]
    # Skip the empty results produced by dropped subtrees.
    return ' '.join(fragment for fragment in fragments if fragment)
def parse(self, chunk_struct, trace=None):
    """
    Run every stage of this chunk parser over the input.

    The parsers in ``self._stages`` are applied in order, each one
    receiving the output of the previous one, and the whole sequence
    of stages is repeated ``self._loop`` times.

    :type chunk_struct: Tree
    :param chunk_struct: the chunk structure to be (further) chunked
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    :return: the chunked output (the result of the last stage).
    :rtype: Tree
    """
    # Fall back to the instance-level trace setting when none is given.
    level = self._trace if trace is None else trace
    result = chunk_struct
    for _ in range(self._loop):
        for stage in self._stages:
            result = stage.parse(result, trace=level)
    return result
def get_continuous_chunks(self, text):
    """Extract the distinct named entities of ``text``, in order of first appearance.

    The text is tokenized, POS-tagged and passed to ``nltk.ne_chunk``.
    Consecutive named-entity subtrees are merged into a single
    space-separated entity string; a run ends at the first element that
    is not a subtree, or at the end of the text.

    :param text: the raw text to analyse
    :return: list of unique named-entity strings
    """
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    # The trailing ``None`` sentinel is not a Tree, so it flushes a named
    # entity that ends the text; previously such an entity was dropped.
    for node in list(chunked) + [None]:
        if isinstance(node, nltk.Tree):
            current_chunk.append(" ".join([token for token, pos in node.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []

    return continuous_chunk
def parse(self, chunk_struct, trace=None):
    """
    Run every stage of this chunk parser over the input.

    The parsers in ``self._stages`` are applied in order, each one
    receiving the output of the previous one, and the whole sequence
    of stages is repeated ``self._loop`` times.

    :type chunk_struct: Tree
    :param chunk_struct: the chunk structure to be (further) chunked
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    :return: the chunked output (the result of the last stage).
    :rtype: Tree
    """
    # Fall back to the instance-level trace setting when none is given.
    level = self._trace if trace is None else trace
    result = chunk_struct
    for _ in range(self._loop):
        for stage in self._stages:
            result = stage.parse(result, trace=level)
    return result
def _to_nltk_format(self):
    """Convert this node and its descendants into nested ``nltk.Tree`` objects.

    The result is a tree labelled with ``self.parent_relation`` whose
    single child is a tree labelled with ``self.pos``; that inner tree
    holds ``self.word`` followed by the converted children.
    """
    from nltk import Tree
    converted_children = [child._to_nltk_format() for child in self.children]
    pos_node = Tree(self.pos, [self.word] + converted_children)
    return Tree(self.parent_relation, [pos_node])
# from nltk import Tree
# label = "({0}) {1} ({2})".format(self.parent_relation,self.word,self.pos)
# if not self.children:
# return label
# return Tree(label,[c._to_nltk_format() for c in self.children])
# Feature functions, should conform to naming _(PREDICATE/ARGUMENT)_FEATURE_(feature_name)
# and return a tuple of (value,span)
#return the head of the
def find_tree_matches(tree, pat):
    """
    Collect the matches of a pattern at a node and at all of its descendants.

    The node itself is tested first, then each child subtree in order
    (pre-order traversal).

    @type tree: DepTree
    @param tree: tree in which to search for matches
    @type pat: nltk.Tree
    @param pat: a pattern to match against tree
    @rtype: list [unification of pat]
    @return: all possible unification of pat in tree
    """
    unification = tree.match(pat)
    matches = [unification] if unification else []
    for subtree in tree.children:
        matches += find_tree_matches(subtree, pat)
    return matches
def parse(self, chunk_struct, trace=None):
    """
    Run every stage of this chunk parser over the input.

    The parsers in ``self._stages`` are applied in order, each one
    receiving the output of the previous one, and the whole sequence
    of stages is repeated ``self._loop`` times.

    :type chunk_struct: Tree
    :param chunk_struct: the chunk structure to be (further) chunked
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    :return: the chunked output (the result of the last stage).
    :rtype: Tree
    """
    # Fall back to the instance-level trace setting when none is given.
    level = self._trace if trace is None else trace
    result = chunk_struct
    for _ in range(self._loop):
        for stage in self._stages:
            result = stage.parse(result, trace=level)
    return result
def parse(self, chunk_struct, trace=None):
    """
    Run every stage of this chunk parser over the input.

    The parsers in ``self._stages`` are applied in order, each one
    receiving the output of the previous one, and the whole sequence
    of stages is repeated ``self._loop`` times.

    :type chunk_struct: Tree
    :param chunk_struct: the chunk structure to be (further) chunked
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    :return: the chunked output (the result of the last stage).
    :rtype: Tree
    """
    # Fall back to the instance-level trace setting when none is given.
    level = self._trace if trace is None else trace
    result = chunk_struct
    for _ in range(self._loop):
        for stage in self._stages:
            result = stage.parse(result, trace=level)
    return result
def bft(tree):
    """ Perform a breadth-first traversal of a tree.
    Return the nodes in a list in level-order.  Only children that are
    ``nltk.Tree`` instances are visited; other children are skipped.

    Args:
        tree: a tree node
    Returns:
        lst: a list of tree nodes in left-to-right level-order
    """
    visited = []
    pending = Queue.Queue()
    pending.put(tree)

    while not pending.empty():
        current = pending.get()
        visited.append(current)
        # Schedule the subtree children for the next level.
        subtrees = [child for child in current if isinstance(child, nltk.Tree)]
        for subtree in subtrees:
            pending.put(subtree)

    return visited
def traverse_tree(tree, pro):
    """ Traverse a tree in a left-to-right, breadth-first manner,
    proposing any NP encountered as an antecedent. Returns the
    tree and the position of the first possible antecedent, or
    ``(None, None)`` when no node qualifies.

    Args:
        tree: the tree being searched
        pro: the pronoun being resolved (string)
    """
    # Breadth-first search starting from the root of the tree.
    pending = Queue.Queue()
    pending.put(tree)

    while not pending.empty():
        candidate = pending.get()

        # A node whose label contains "NP" and that passes ``match``
        # is returned as the potential antecedent.
        is_noun_phrase = "NP" in candidate.label()
        if is_noun_phrase and match(tree, get_pos(tree, candidate), pro):
            return tree, get_pos(tree, candidate)

        for child in candidate:
            if isinstance(child, nltk.Tree):
                pending.put(child)

    # The search was exhausted without finding an antecedent.
    return None, None
def calc(param):
    """Resolve a pronoun against the parse trees stored in a file.

    Every line of the file is parsed with ``Tree.fromstring``.  The
    pronoun is located in the last tree, and the last element of that
    position is dropped before the resolver is called.

    Args:
        param: indexable; ``param[1]`` is the file name and ``param[2]``
            is the pronoun to resolve.

    Returns:
        A ``(tree, tree[pos])`` tuple as proposed by ``hobbs`` (personal
        pronouns) or ``resolve_reflexive`` (reflexive pronouns).  ``None``
        when the pronoun is in neither list.
    """
    personal_pronouns = ["He", "he", "Him", "him", "She", "she", "Her",
                         "her", "It", "it", "They", "they"]
    reflexive_pronouns = ["Himself", "himself", "Herself", "herself",
                          "Itself", "itself", "Themselves", "themselves"]
    fname = param[1]
    pro = param[2]

    with open(fname) as handle:
        sents = handle.readlines()
    trees = [Tree.fromstring(sentence) for sentence in sents]

    pos = get_pos(trees[-1], pro)
    pos = pos[:-1]

    # Pick the resolver that fits the kind of pronoun.
    if pro in personal_pronouns:
        resolver = hobbs
    elif pro in reflexive_pronouns:
        resolver = resolve_reflexive
    else:
        return None

    tree, pos = resolver(trees, pos)
    return tree, tree[pos]
def create_xsvversion_of_tree(t):
    """Wrap ``t`` in a new ``S`` node headed by the subtree found by
    ``find_deepleftfirst_verb``.

    When that helper returns a subtree, ``t`` is relabelled ``"S1"`` (in
    place) and a new tree ``S -> [sub_tree, t]`` is returned.  Otherwise
    ``t`` is returned unchanged.
    """
    _, sub_tree = find_deepleftfirst_verb(t)
    if sub_tree is None:
        return t

    t._label = "S1"
    return Tree("S", [sub_tree, t])
def __init__(self, chunk_struct, debug_level=1):
    """
    Build a ``ChunkString`` that encodes the chunking of
    ``chunk_struct``.

    A slice of the top-level children of ``chunk_struct`` is stored
    as the pieces, and the tag of every piece (see ``_tag``) is
    written into an encoded string of the form ``<tag1><tag2>...``.

    :type chunk_struct: Tree
    :param chunk_struct: The chunk structure to be further chunked.
    :type debug_level: int
    :param debug_level: The level of debugging which should be
        applied to transformations on the ``ChunkString``.  The
        valid levels are:
            - 0: no checks
            - 1: full check on to_chunkstruct
            - 2: full check on to_chunkstruct and cursory check after
               each transformation.
            - 3: full check on to_chunkstruct and full check after
               each transformation.
        We recommend you use at least level 1.  You should
        probably use level 3 if you use any non-standard
        subclasses of ``RegexpChunkRule``.
    """
    self._root_label = chunk_struct.label()
    pieces = chunk_struct[:]
    self._pieces = pieces
    # One ``<tag>`` group per top-level piece, in order.
    encoded_tags = '><'.join([self._tag(piece) for piece in pieces])
    self._str = '<' + encoded_tags + '>'
    self._debug = debug_level
def _tag(self, tok):
    """
    Return the tag that represents ``tok`` in the encoded string.

    :param tok: a tagged token (a tuple whose second element is the
        tag) or a ``Tree`` (whose label is used as the tag).
    :raise ValueError: If ``tok`` is neither a tuple nor a ``Tree``.
    """
    if isinstance(tok, tuple):
        return tok[1]
    if isinstance(tok, Tree):
        return tok.label()
    raise ValueError('chunk structures must contain tagged '
                     'tokens or trees')
def to_chunkstruct(self, chunk_label='CHUNK'):
    """
    Return the chunk structure encoded by this ``ChunkString``.

    The encoded string is split on braces, so the resulting segments
    alternate between "outside a chunk" (even positions) and "inside
    a chunk" (odd positions).  Each segment consumes as many stored
    pieces as it contains ``<`` characters.

    :param chunk_label: the label given to every chunk subtree.
    :rtype: Tree
    :raise ValueError: If a transformation has generated an
        invalid chunkstring.
    """
    if self._debug > 0:
        self._verify(self._str, 1)

    pieces = []
    start = 0
    for position, segment in enumerate(re.split('[{}]', self._str)):
        # Each '<' marks one token belonging to this segment.
        token_count = segment.count('<')
        tokens = self._pieces[start:start + token_count]

        if position % 2:
            # Odd segments lie between '{' and '}': wrap them in a chunk.
            pieces.append(Tree(chunk_label, tokens))
        else:
            pieces += tokens

        start += token_count

    return Tree(self._root_label, pieces)
def parse(self, chunk_struct, trace=None):
    """
    Chunk the input by applying this parser's rules to a
    ``ChunkString`` built from it.

    An empty input prints a warning and yields an empty tree.  An
    input without a ``label()`` method is first wrapped in a ``Tree``
    labelled with this parser's root label.

    :type chunk_struct: Tree
    :param chunk_struct: the chunk structure to be (further) chunked
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    :rtype: Tree
    :return: a chunk structure that encodes the chunks in a given
        tagged sentence.  A chunk is a non-overlapping linguistic
        group, such as a noun phrase.  The set of chunks
        identified in the chunk structure depends on the rules
        used to define this ``RegexpChunkParser``.
    """
    if len(chunk_struct) == 0:
        print('Warning: parsing empty text')
        return Tree(self._root_label, [])

    # Plain sequences have no label(); wrap them so they behave like a Tree.
    try:
        chunk_struct.label()
    except AttributeError:
        chunk_struct = Tree(self._root_label, chunk_struct)

    # Fall back to the constructor's trace level when none is given.
    trace_level = self._trace if trace is None else trace

    chunkstr = ChunkString(chunk_struct)

    # Apply the sequence of rules to the chunkstring.
    if not trace_level:
        self._notrace_apply(chunkstr)
    else:
        self._trace_apply(chunkstr, trace_level > 1)

    # Decode the transformed chunkstring back into a chunk structure.
    return chunkstr.to_chunkstruct(self._chunk_label)