def __init__(self, chunk_struct, debug_level=1):
    """
    Build a ``ChunkString`` that encodes the chunking of ``chunk_struct``.

    The children of ``chunk_struct`` are copied into ``self._pieces`` and
    their tags are rendered as a single string of the form
    ``<T1><T2>...``, stored in ``self._str``.

    :type chunk_struct: Tree
    :param chunk_struct: The chunk structure to be further chunked.
    :type debug_level: int
    :param debug_level: The level of debugging which should be
        applied to transformations on the ``ChunkString``.  The
        valid levels are:

        - 0: no checks
        - 1: full check on to_chunkstruct
        - 2: full check on to_chunkstruct and cursory check after
          each transformation.
        - 3: full check on to_chunkstruct and full check after
          each transformation.

        At least level 1 is recommended; level 3 is advisable when
        non-standard subclasses of ``RegexpChunkRule`` are in use.
    """
    self._root_label = chunk_struct.label()
    # Slice-copy the children so later work does not alias the caller's tree.
    pieces = chunk_struct[:]
    self._pieces = pieces
    # One "<tag>" per piece; an empty structure still yields "<>".
    self._str = '<%s>' % '><'.join(map(self._tag, pieces))
    self._debug = debug_level
python类Tree()的实例源码
regexp.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 20
收藏 0
点赞 0
评论 0
regexp.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 20
收藏 0
点赞 0
评论 0
def _tag(self, tok):
    """
    Return the tag that represents ``tok`` in the chunk string.

    :param tok: A child of a chunk structure: either a tagged token
        (a tuple whose second item is the tag) or a ``Tree``.
    :return: ``tok[1]`` for a tuple, ``tok.label()`` for a ``Tree``.
    :raise ValueError: If ``tok`` is neither a tuple nor a ``Tree``.
    """
    if isinstance(tok, tuple):
        return tok[1]
    if isinstance(tok, Tree):
        return tok.label()
    raise ValueError('chunk structures must contain tagged '
                     'tokens or trees')
regexp.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 18
收藏 0
点赞 0
评论 0
def to_chunkstruct(self, chunk_label='CHUNK'):
    """
    Return the chunk structure encoded by this ``ChunkString``.

    :param chunk_label: Label given to every chunk subtree that is
        created (default ``'CHUNK'``).
    :rtype: Tree
    :raise ValueError: If a transformation has generated an
        invalid chunkstring (presumably raised by ``_verify``, which
        is defined outside this block).
    """
    if self._debug > 0:
        self._verify(self._str, 1)

    # Splitting on braces gives alternating segments: even positions lie
    # outside any chunk, odd positions lie inside one.
    children = []
    start = 0
    for position, segment in enumerate(re.split('[{}]', self._str)):
        # Every '<' opens one tag, i.e. accounts for one stored piece.
        stop = start + segment.count('<')
        tokens = self._pieces[start:stop]
        if position % 2:
            children.append(Tree(chunk_label, tokens))
        else:
            children.extend(tokens)
        start = stop

    return Tree(self._root_label, children)
regexp.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 37
收藏 0
点赞 0
评论 0
def parse(self, chunk_struct, trace=None):
    """
    Apply this parser's rules to ``chunk_struct`` and return the result.

    :type chunk_struct: Tree
    :param chunk_struct: the chunk structure to be (further) chunked
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    :rtype: Tree
    :return: a chunk structure that encodes the chunks in a given
        tagged sentence.  A chunk is a non-overlapping linguistic
        group, such as a noun phrase.  The set of chunks
        identified in the chunk structure depends on the rules
        used to define this ``RegexpChunkParser``.
    """
    if not len(chunk_struct):
        print('Warning: parsing empty text')
        return Tree(self._root_label, [])

    # Input without a ``label`` method (e.g. a plain token list) is
    # wrapped in a tree carrying the parser's root label.
    try:
        chunk_struct.label()
    except AttributeError:
        chunk_struct = Tree(self._root_label, chunk_struct)

    # Fall back to the trace level stored on the parser.
    level = self._trace if trace is None else trace

    encoded = ChunkString(chunk_struct)
    if level:
        self._trace_apply(encoded, level > 1)
    else:
        self._notrace_apply(encoded)

    return encoded.to_chunkstruct(self._chunk_label)
def __init__(self, chunk_struct, debug_level=1):
    """
    Build a ``ChunkString`` that encodes the chunking of ``chunk_struct``.

    The children of ``chunk_struct`` are copied into ``self._pieces`` and
    their tags are rendered as a single string of the form
    ``<T1><T2>...``, stored in ``self._str``.

    :type chunk_struct: Tree
    :param chunk_struct: The chunk structure to be further chunked.
    :type debug_level: int
    :param debug_level: The level of debugging which should be
        applied to transformations on the ``ChunkString``.  The
        valid levels are:

        - 0: no checks
        - 1: full check on to_chunkstruct
        - 2: full check on to_chunkstruct and cursory check after
          each transformation.
        - 3: full check on to_chunkstruct and full check after
          each transformation.

        At least level 1 is recommended; level 3 is advisable when
        non-standard subclasses of ``RegexpChunkRule`` are in use.
    """
    self._root_label = chunk_struct.label()
    # Slice-copy the children so later work does not alias the caller's tree.
    pieces = chunk_struct[:]
    self._pieces = pieces
    # One "<tag>" per piece; an empty structure still yields "<>".
    self._str = '<%s>' % '><'.join(map(self._tag, pieces))
    self._debug = debug_level
def _tag(self, tok):
    """
    Return the tag that represents ``tok`` in the chunk string.

    :param tok: A child of a chunk structure: either a tagged token
        (a tuple whose second item is the tag) or a ``Tree``.
    :return: ``tok[1]`` for a tuple, ``tok.label()`` for a ``Tree``.
    :raise ValueError: If ``tok`` is neither a tuple nor a ``Tree``.
    """
    if isinstance(tok, tuple):
        return tok[1]
    if isinstance(tok, Tree):
        return tok.label()
    raise ValueError('chunk structures must contain tagged '
                     'tokens or trees')
def to_chunkstruct(self, chunk_label='CHUNK'):
    """
    Return the chunk structure encoded by this ``ChunkString``.

    :param chunk_label: Label given to every chunk subtree that is
        created (default ``'CHUNK'``).
    :rtype: Tree
    :raise ValueError: If a transformation has generated an
        invalid chunkstring (presumably raised by ``_verify``, which
        is defined outside this block).
    """
    if self._debug > 0:
        self._verify(self._str, 1)

    # Splitting on braces gives alternating segments: even positions lie
    # outside any chunk, odd positions lie inside one.
    children = []
    start = 0
    for position, segment in enumerate(re.split('[{}]', self._str)):
        # Every '<' opens one tag, i.e. accounts for one stored piece.
        stop = start + segment.count('<')
        tokens = self._pieces[start:stop]
        if position % 2:
            children.append(Tree(chunk_label, tokens))
        else:
            children.extend(tokens)
        start = stop

    return Tree(self._root_label, children)
def parse(self, chunk_struct, trace=None):
    """
    Apply this parser's rules to ``chunk_struct`` and return the result.

    :type chunk_struct: Tree
    :param chunk_struct: the chunk structure to be (further) chunked
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    :rtype: Tree
    :return: a chunk structure that encodes the chunks in a given
        tagged sentence.  A chunk is a non-overlapping linguistic
        group, such as a noun phrase.  The set of chunks
        identified in the chunk structure depends on the rules
        used to define this ``RegexpChunkParser``.
    """
    if not len(chunk_struct):
        print('Warning: parsing empty text')
        return Tree(self._root_label, [])

    # Input without a ``label`` method (e.g. a plain token list) is
    # wrapped in a tree carrying the parser's root label.
    try:
        chunk_struct.label()
    except AttributeError:
        chunk_struct = Tree(self._root_label, chunk_struct)

    # Fall back to the trace level stored on the parser.
    level = self._trace if trace is None else trace

    encoded = ChunkString(chunk_struct)
    if level:
        self._trace_apply(encoded, level > 1)
    else:
        self._notrace_apply(encoded)

    return encoded.to_chunkstruct(self._chunk_label)
def to_nltk_tree(node):
    """Convert a spaCy dependency subtree into an NLTK tree.

    Every node is labelled ``dep_/orth_/pos_``, built from the attributes of
    the same names on the node.

    Args:
        node: The node at which the conversion starts.

    Returns:
        An NLTK ``Tree`` when ``node.n_lefts + node.n_rights`` is positive;
        otherwise just the label string, which serves as a leaf.
    """
    is_leaf = node.n_lefts + node.n_rights <= 0
    label = "/".join((node.dep_, node.orth_, node.pos_))
    if is_leaf:
        return label
    subtrees = [to_nltk_tree(child) for child in node.children]
    return Tree(label, subtrees)
def isLegalTree(line, i):
    """
    Abort the program if ``line`` cannot be turned into a tree.

    ``line`` is passed to both ``nltk.Tree`` and ``nltk.ParentedTree``.  If
    either raises ``ValueError``, a diagnostic naming item ``i`` and the
    offending text is written to standard error and the process exits with
    status 1.  On success nothing is returned (``None``), despite the
    boolean-sounding name.

    NOTE(review): constructing a tree from a single string looks like the
    NLTK 2 API -- confirm which NLTK version this targets before porting.

    :param line: text of one tree.
    :param i: index or identifier of ``line``, used only in the message.
    :raise SystemExit: with status 1 when ``line`` is rejected.
    """
    try:
        # Only the side effect (raising on bad input) matters here.
        nltk.Tree(line)
        nltk.ParentedTree(line)
    except ValueError:
        # ``write`` works on both Python 2 and 3, unlike ``print >>``.
        sys.stderr.write("illegal tree!!! #" + str(i) + "\n")
        sys.stderr.write("%s\n" % (line,))
        # ``sys.exit`` is always available; bare ``exit`` comes from ``site``.
        sys.exit(1)
def binarize(line, lan = "en"):
"""
Binarize the tree encoded in ``line`` and return it as one line of text.

The tree built from ``line`` is walked with an explicit stack.  Nodes with
more than two children are handed to ``rightBinarize``, ``leftBinarize`` or
``vvBinarize`` (all defined outside this block), chosen by the node label
and, for ``VP`` nodes, by ``lan``.

NOTE(review): indentation was lost in this copy of the code, so the nesting
of the final ``else``, the ``for child`` loop and the trailing ``continue``
cannot be read off the source -- confirm against the original file.
NOTE(review): ``.node`` and a string-returning ``pprint()`` look like the
NLTK 2 ``Tree`` API -- confirm the NLTK version this targets.

:param line: text of a single tree, passed straight to ``nltk.Tree``.
:param lan: language switch, ``'en'`` (default) or ``'ch'``.
:return: the ``pprint()`` text of the tree with every run of whitespace
collapsed to a single space, followed by a newline.
:raise AssertionError: if ``lan`` is neither ``'en'`` nor ``'ch'``.
"""
assert lan in ['en', 'ch'], "illegal language (en or ch): %s" % lan
root = nltk.Tree(line)
# Explicit stack instead of recursion; starts with the root node.
stack = [root]
while stack:
curNode = stack.pop()
# Only nodes with more than two children are passed to a binarizer.
if len(curNode) > 2:
if curNode.node == 'NP':
rightBinarize(curNode)
elif curNode.node == 'VP':
if lan == 'en':
vvBinarize(curNode)
elif lan == 'ch':
# vvTags is defined outside this block; the first/last child's label
# decides the binarization direction.
if curNode[0].node in vvTags:
leftBinarize(curNode)
elif curNode[-1].node in vvTags:
rightBinarize(curNode)
else:
vvBinarize(curNode)
# The helpers' return values are discarded, so they presumably modify
# curNode in place -- TODO confirm.
for child in curNode:
#print >> sys.stderr, child
# Only children with height() > 2 are queued for further processing.
if child.height() > 2:
stack.append(child)
continue
# split()/join collapses pprint()'s multi-line layout onto one line.
return ' '.join(root.pprint().split()) + '\n'
def __init__(self, chunk_struct, debug_level=1):
    """
    Build a ``ChunkString`` that encodes the chunking of ``chunk_struct``.

    The children of ``chunk_struct`` are copied into ``self._pieces`` and
    their tags are rendered as a single string of the form
    ``<T1><T2>...``, stored in ``self._str``.

    :type chunk_struct: Tree
    :param chunk_struct: The chunk structure to be further chunked.
    :type debug_level: int
    :param debug_level: The level of debugging which should be
        applied to transformations on the ``ChunkString``.  The
        valid levels are:

        - 0: no checks
        - 1: full check on to_chunkstruct
        - 2: full check on to_chunkstruct and cursory check after
          each transformation.
        - 3: full check on to_chunkstruct and full check after
          each transformation.

        At least level 1 is recommended; level 3 is advisable when
        non-standard subclasses of ``RegexpChunkRule`` are in use.
    """
    self._root_label = chunk_struct.label()
    # Slice-copy the children so later work does not alias the caller's tree.
    pieces = chunk_struct[:]
    self._pieces = pieces
    # One "<tag>" per piece; an empty structure still yields "<>".
    self._str = '<%s>' % '><'.join(map(self._tag, pieces))
    self._debug = debug_level
def _tag(self, tok):
    """
    Return the tag that represents ``tok`` in the chunk string.

    :param tok: A child of a chunk structure: either a tagged token
        (a tuple whose second item is the tag) or a ``Tree``.
    :return: ``tok[1]`` for a tuple, ``tok.label()`` for a ``Tree``.
    :raise ValueError: If ``tok`` is neither a tuple nor a ``Tree``.
    """
    if isinstance(tok, tuple):
        return tok[1]
    if isinstance(tok, Tree):
        return tok.label()
    raise ValueError('chunk structures must contain tagged '
                     'tokens or trees')
def to_chunkstruct(self, chunk_label='CHUNK'):
    """
    Return the chunk structure encoded by this ``ChunkString``.

    :param chunk_label: Label given to every chunk subtree that is
        created (default ``'CHUNK'``).
    :rtype: Tree
    :raise ValueError: If a transformation has generated an
        invalid chunkstring (presumably raised by ``_verify``, which
        is defined outside this block).
    """
    if self._debug > 0:
        self._verify(self._str, 1)

    # Splitting on braces gives alternating segments: even positions lie
    # outside any chunk, odd positions lie inside one.
    children = []
    start = 0
    for position, segment in enumerate(re.split('[{}]', self._str)):
        # Every '<' opens one tag, i.e. accounts for one stored piece.
        stop = start + segment.count('<')
        tokens = self._pieces[start:stop]
        if position % 2:
            children.append(Tree(chunk_label, tokens))
        else:
            children.extend(tokens)
        start = stop

    return Tree(self._root_label, children)
def parse(self, chunk_struct, trace=None):
    """
    Apply this parser's rules to ``chunk_struct`` and return the result.

    :type chunk_struct: Tree
    :param chunk_struct: the chunk structure to be (further) chunked
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    :rtype: Tree
    :return: a chunk structure that encodes the chunks in a given
        tagged sentence.  A chunk is a non-overlapping linguistic
        group, such as a noun phrase.  The set of chunks
        identified in the chunk structure depends on the rules
        used to define this ``RegexpChunkParser``.
    """
    if not len(chunk_struct):
        print('Warning: parsing empty text')
        return Tree(self._root_label, [])

    # Input without a ``label`` method (e.g. a plain token list) is
    # wrapped in a tree carrying the parser's root label.
    try:
        chunk_struct.label()
    except AttributeError:
        chunk_struct = Tree(self._root_label, chunk_struct)

    # Fall back to the trace level stored on the parser.
    level = self._trace if trace is None else trace

    encoded = ChunkString(chunk_struct)
    if level:
        self._trace_apply(encoded, level > 1)
    else:
        self._notrace_apply(encoded)

    return encoded.to_chunkstruct(self._chunk_label)
def _tag(self, tok):
    """
    Return the tag that represents ``tok`` in the chunk string.

    :param tok: A child of a chunk structure: either a tagged token
        (a tuple whose second item is the tag) or a ``Tree``.
    :return: ``tok[1]`` for a tuple, ``tok.label()`` for a ``Tree``.
    :raise ValueError: If ``tok`` is neither a tuple nor a ``Tree``.
    """
    if isinstance(tok, tuple):
        return tok[1]
    if isinstance(tok, Tree):
        return tok.label()
    raise ValueError('chunk structures must contain tagged '
                     'tokens or trees')
def to_chunkstruct(self, chunk_label='CHUNK'):
    """
    Return the chunk structure encoded by this ``ChunkString``.

    :param chunk_label: Label given to every chunk subtree that is
        created (default ``'CHUNK'``).
    :rtype: Tree
    :raise ValueError: If a transformation has generated an
        invalid chunkstring (presumably raised by ``_verify``, which
        is defined outside this block).
    """
    if self._debug > 0:
        self._verify(self._str, 1)

    # Splitting on braces gives alternating segments: even positions lie
    # outside any chunk, odd positions lie inside one.
    children = []
    start = 0
    for position, segment in enumerate(re.split('[{}]', self._str)):
        # Every '<' opens one tag, i.e. accounts for one stored piece.
        stop = start + segment.count('<')
        tokens = self._pieces[start:stop]
        if position % 2:
            children.append(Tree(chunk_label, tokens))
        else:
            children.extend(tokens)
        start = stop

    return Tree(self._root_label, children)
def parse(self, chunk_struct, trace=None):
    """
    Apply this parser's rules to ``chunk_struct`` and return the result.

    :type chunk_struct: Tree
    :param chunk_struct: the chunk structure to be (further) chunked
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    :rtype: Tree
    :return: a chunk structure that encodes the chunks in a given
        tagged sentence.  A chunk is a non-overlapping linguistic
        group, such as a noun phrase.  The set of chunks
        identified in the chunk structure depends on the rules
        used to define this ``RegexpChunkParser``.
    """
    if not len(chunk_struct):
        print('Warning: parsing empty text')
        return Tree(self._root_label, [])

    # Input without a ``label`` method (e.g. a plain token list) is
    # wrapped in a tree carrying the parser's root label.
    try:
        chunk_struct.label()
    except AttributeError:
        chunk_struct = Tree(self._root_label, chunk_struct)

    # Fall back to the trace level stored on the parser.
    level = self._trace if trace is None else trace

    encoded = ChunkString(chunk_struct)
    if level:
        self._trace_apply(encoded, level > 1)
    else:
        self._notrace_apply(encoded)

    return encoded.to_chunkstruct(self._chunk_label)
def __init__(self, chunk_struct, debug_level=1):
    """
    Build a ``ChunkString`` that encodes the chunking of ``chunk_struct``.

    The children of ``chunk_struct`` are copied into ``self._pieces`` and
    their tags are rendered as a single string of the form
    ``<T1><T2>...``, stored in ``self._str``.

    :type chunk_struct: Tree
    :param chunk_struct: The chunk structure to be further chunked.
    :type debug_level: int
    :param debug_level: The level of debugging which should be
        applied to transformations on the ``ChunkString``.  The
        valid levels are:

        - 0: no checks
        - 1: full check on to_chunkstruct
        - 2: full check on to_chunkstruct and cursory check after
          each transformation.
        - 3: full check on to_chunkstruct and full check after
          each transformation.

        At least level 1 is recommended; level 3 is advisable when
        non-standard subclasses of ``RegexpChunkRule`` are in use.
    """
    self._root_label = chunk_struct.label()
    # Slice-copy the children so later work does not alias the caller's tree.
    pieces = chunk_struct[:]
    self._pieces = pieces
    # One "<tag>" per piece; an empty structure still yields "<>".
    self._str = '<%s>' % '><'.join(map(self._tag, pieces))
    self._debug = debug_level
def _tag(self, tok):
    """
    Return the tag that represents ``tok`` in the chunk string.

    :param tok: A child of a chunk structure: either a tagged token
        (a tuple whose second item is the tag) or a ``Tree``.
    :return: ``tok[1]`` for a tuple, ``tok.label()`` for a ``Tree``.
    :raise ValueError: If ``tok`` is neither a tuple nor a ``Tree``.
    """
    if isinstance(tok, tuple):
        return tok[1]
    if isinstance(tok, Tree):
        return tok.label()
    raise ValueError('chunk structures must contain tagged '
                     'tokens or trees')
def tree2conlltags(t):
    """
    Convert a tree to the CoNLL IOB tag format.

    Children of ``t`` that have a ``label`` method are treated as chunks:
    their first token is tagged ``B-<label>`` and the rest ``I-<label>``.
    Children without one are tagged ``O``.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    :return: A list of 3-tuples ``(word, tag, IOB-tag)``.
    :raise ValueError: If a chunk itself contains a ``Tree``.
    """
    rows = []
    for node in t:
        try:
            chunk_type = node.label()
            for offset, token in enumerate(node):
                if isinstance(token, Tree):
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                # Only the first token of a chunk begins it.
                marker = "I-" if offset else "B-"
                rows.append((token[0], token[1], marker + chunk_type))
        except AttributeError:
            # No ``label`` method: a plain (word, tag) token outside any chunk.
            rows.append((node[0], node[1], "O"))
    return rows
def ieer_headlines():
    """
    Print the document number and headline of the first 20 IEER documents.

    Documents are taken in the order produced by iterating
    ``ieer.fileids()`` and, within each file, ``ieer.parsed_docs``.
    Output goes to standard output; nothing is returned.
    """
    from itertools import islice
    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    # Lazy generator: reading stops once 20 documents have been seen,
    # instead of collecting headlines for the whole corpus first.
    headlines = (
        (doc.docno, doc.headline)
        for fileid in ieer.fileids()
        for doc in ieer.parsed_docs(fileid)
    )
    for entry in islice(headlines, 20):
        print()
        print("%s:\n%s" % entry)
#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
#############################################
util.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 26
收藏 0
点赞 0
评论 0
def tree2conlltags(t):
    """
    Convert a tree to the CoNLL IOB tag format.

    Children of ``t`` that have a ``label`` method are treated as chunks:
    their first token is tagged ``B-<label>`` and the rest ``I-<label>``.
    Children without one are tagged ``O``.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    :return: A list of 3-tuples ``(word, tag, IOB-tag)``.
    :raise ValueError: If a chunk itself contains a ``Tree``.
    """
    rows = []
    for node in t:
        try:
            chunk_type = node.label()
            for offset, token in enumerate(node):
                if isinstance(token, Tree):
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                # Only the first token of a chunk begins it.
                marker = "I-" if offset else "B-"
                rows.append((token[0], token[1], marker + chunk_type))
        except AttributeError:
            # No ``label`` method: a plain (word, tag) token outside any chunk.
            rows.append((node[0], node[1], "O"))
    return rows
relextract.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def ieer_headlines():
    """
    Print the document number and headline of the first 20 IEER documents.

    Documents are taken in the order produced by iterating
    ``ieer.fileids()`` and, within each file, ``ieer.parsed_docs``.
    Output goes to standard output; nothing is returned.
    """
    from itertools import islice
    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    # Lazy generator: reading stops once 20 documents have been seen,
    # instead of collecting headlines for the whole corpus first.
    headlines = (
        (doc.docno, doc.headline)
        for fileid in ieer.fileids()
        for doc in ieer.parsed_docs(fileid)
    )
    for entry in islice(headlines, 20):
        print()
        print("%s:\n%s" % entry)
#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
#############################################
def isNegationWord(token):
    """
    Tell whether a parse-tree token is a negation word.

    The first child of ``token`` is lower-cased and stemmed with ``_stem_``.
    For tokens whose label starts with ``V`` or ``J`` the stem is compared
    with the stems of ``NEGATION_VERBS``; for every other label it is
    compared with ``NEGATION_ADVERBS`` as given.  ``_stem_`` and both
    collections are defined outside this block.

    :param token: expected to be an ``nltk.tree.ParentedTree``.
    :return: ``True``/``False`` for a ``ParentedTree``; ``False`` when its
        first child is not a text string; ``None`` (after printing a
        diagnostic) when ``token`` is not a ``ParentedTree``.
    """
    import nltk

    # ``unicode`` only exists on Python 2; on Python 3 ``str`` is the
    # text type, so the same check works on both.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    if not isinstance(token, nltk.tree.ParentedTree):
        # A single pre-formatted string prints identically on Python 2 and 3.
        print("something went terribly wrong with %s" % (token,))
        return None

    word = token[0]
    if not isinstance(word, text_type):
        return False
    stemmed = _stem_(word.lower())

    if token.label().startswith(("V", "J")):
        # Verb/adjective labels: compare stem against stemmed verbs.
        return any(stemmed == _stem_(verb) for verb in NEGATION_VERBS)
    return stemmed in NEGATION_ADVERBS
def findSentencePTreeToken(sentence, keyword):
    """
    Collect the parse-tree nodes whose leaf word matches ``keyword``.

    ``sentence`` is handed to ``proc.parse_doc``.  For every entry in the
    result's ``'sentences'`` list, the ``'parse'`` string is read as a
    ``ParentedTree``.  Each leaf whose ``_stem_`` equals
    ``_lemma_(keyword)`` contributes its parent node to the result.
    ``proc``, ``_stem_`` and ``_lemma_`` are defined outside this block.

    :param sentence: text to parse; may produce several sentences.
    :param keyword: word to look for.
    :rtype: list(ParentedTree)
    :return: parents of the matching leaves, in the order encountered;
        empty when nothing matches.
    """
    from nltk.tree import ParentedTree

    # NOTE(review): the keyword is lemmatised but leaves are stemmed --
    # confirm the two normalisations are meant to be compared.
    target = _lemma_(keyword)
    parsed = proc.parse_doc(sentence)

    matches = []
    for sentence_info in parsed['sentences']:
        ptree = ParentedTree.fromstring(sentence_info['parse'])
        for leaf_index, leaf in enumerate(ptree.leaves()):
            if _stem_(leaf) == target:
                # Dropping the last path step moves from the leaf to its parent.
                parent_position = ptree.leaf_treeposition(leaf_index)[:-1]
                matches.append(ptree[parent_position])
    return matches
def tree2conlltags(t):
    """
    Convert a tree to the CoNLL IOB tag format.

    Children of ``t`` that have a ``label`` method are treated as chunks:
    their first token is tagged ``B-<label>`` and the rest ``I-<label>``.
    Children without one are tagged ``O``.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    :return: A list of 3-tuples ``(word, tag, IOB-tag)``.
    :raise ValueError: If a chunk itself contains a ``Tree``.
    """
    rows = []
    for node in t:
        try:
            chunk_type = node.label()
            for offset, token in enumerate(node):
                if isinstance(token, Tree):
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                # Only the first token of a chunk begins it.
                marker = "I-" if offset else "B-"
                rows.append((token[0], token[1], marker + chunk_type))
        except AttributeError:
            # No ``label`` method: a plain (word, tag) token outside any chunk.
            rows.append((node[0], node[1], "O"))
    return rows
def ieer_headlines():
    """
    Print the document number and headline of the first 20 IEER documents.

    Documents are taken in the order produced by iterating
    ``ieer.fileids()`` and, within each file, ``ieer.parsed_docs``.
    Output goes to standard output; nothing is returned.
    """
    from itertools import islice
    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    # Lazy generator: reading stops once 20 documents have been seen,
    # instead of collecting headlines for the whole corpus first.
    headlines = (
        (doc.docno, doc.headline)
        for fileid in ieer.fileids()
        for doc in ieer.parsed_docs(fileid)
    )
    for entry in islice(headlines, 20):
        print()
        print("%s:\n%s" % entry)
#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
#############################################
def tree2conlltags(t):
    """
    Convert a tree to the CoNLL IOB tag format.

    Children of ``t`` that have a ``label`` method are treated as chunks:
    their first token is tagged ``B-<label>`` and the rest ``I-<label>``.
    Children without one are tagged ``O``.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    :return: A list of 3-tuples ``(word, tag, IOB-tag)``.
    :raise ValueError: If a chunk itself contains a ``Tree``.
    """
    rows = []
    for node in t:
        try:
            chunk_type = node.label()
            for offset, token in enumerate(node):
                if isinstance(token, Tree):
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                # Only the first token of a chunk begins it.
                marker = "I-" if offset else "B-"
                rows.append((token[0], token[1], marker + chunk_type))
        except AttributeError:
            # No ``label`` method: a plain (word, tag) token outside any chunk.
            rows.append((node[0], node[1], "O"))
    return rows
def ieer_headlines():
    """
    Print the document number and headline of the first 20 IEER documents.

    Documents are taken in the order produced by iterating
    ``ieer.fileids()`` and, within each file, ``ieer.parsed_docs``.
    Output goes to standard output; nothing is returned.
    """
    from itertools import islice
    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    # Lazy generator: reading stops once 20 documents have been seen,
    # instead of collecting headlines for the whole corpus first.
    headlines = (
        (doc.docno, doc.headline)
        for fileid in ieer.fileids()
        for doc in ieer.parsed_docs(fileid)
    )
    for entry in islice(headlines, 20):
        print()
        print("%s:\n%s" % entry)
#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
#############################################