def tree2conlltags(t):
"""
Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
Convert a tree to the CoNLL IOB tag format.
:param t: The tree to be converted.
:type t: Tree
:rtype: list(tuple)
"""
tags = []
for child in t:
try:
category = child.label()
prefix = "B-"
for contents in child:
if isinstance(contents, Tree):
raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
tags.append((contents[0], contents[1], prefix+category))
prefix = "I-"
except AttributeError:
tags.append((child[0], child[1], "O"))
return tags
python类Tree()的实例源码
def ieer_headlines():
from nltk.corpus import ieer
from nltk.tree import Tree
print("IEER: First 20 Headlines")
print("=" * 45)
trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
for tree in trees[:20]:
print()
print("%s:\n%s" % tree)
#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
#############################################
def tree2conlltags(t):
"""
Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
Convert a tree to the CoNLL IOB tag format.
:param t: The tree to be converted.
:type t: Tree
:rtype: list(tuple)
"""
tags = []
for child in t:
try:
category = child.label()
prefix = "B-"
for contents in child:
if isinstance(contents, Tree):
raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
tags.append((contents[0], contents[1], prefix+category))
prefix = "I-"
except AttributeError:
tags.append((child[0], child[1], "O"))
return tags
def ieer_headlines():
from nltk.corpus import ieer
from nltk.tree import Tree
print("IEER: First 20 Headlines")
print("=" * 45)
trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
for tree in trees[:20]:
print()
print("%s:\n%s" % tree)
#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
#############################################
def tree2conlltags(t):
"""
Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
Convert a tree to the CoNLL IOB tag format.
:param t: The tree to be converted.
:type t: Tree
:rtype: list(tuple)
"""
tags = []
for child in t:
try:
category = child.label()
prefix = "B-"
for contents in child:
if isinstance(contents, Tree):
raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
tags.append((contents[0], contents[1], prefix+category))
prefix = "I-"
except AttributeError:
tags.append((child[0], child[1], "O"))
return tags
def ieer_headlines():
from nltk.corpus import ieer
from nltk.tree import Tree
print("IEER: First 20 Headlines")
print("=" * 45)
trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
for tree in trees[:20]:
print()
print("%s:\n%s" % tree)
#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
#############################################
def tree2conlltags(t):
"""
Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
Convert a tree to the CoNLL IOB tag format.
:param t: The tree to be converted.
:type t: Tree
:rtype: list(tuple)
"""
tags = []
for child in t:
try:
category = child.label()
prefix = "B-"
for contents in child:
if isinstance(contents, Tree):
raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
tags.append((contents[0], contents[1], prefix+category))
prefix = "I-"
except AttributeError:
tags.append((child[0], child[1], "O"))
return tags
def ieer_headlines():
from nltk.corpus import ieer
from nltk.tree import Tree
print("IEER: First 20 Headlines")
print("=" * 45)
trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
for tree in trees[:20]:
print()
print("%s:\n%s" % tree)
#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
#############################################
def trees2label_sents(trees,only_root=False,pos_neg_label=False,remove_double_count_sentence=False):
#print 'trees2label_sents',flag_word_lower,flag_stemmer,flag_remove_double_count_sentence,only_root,pos_neg_label
#sys.exit()
lss=[]
for tree in trees:
lss_tmp=tree2label_sent(tree)
if pos_neg_label and lss_tmp[0][0] == 2 :
continue
if pos_neg_label :
lss_tmp2 = [ [1 if ls[0] > 2 else 0 ,ls[1]] for ls in lss_tmp]
else:
lss_tmp2 =lss_tmp
if len(lss_tmp2) > 0 and only_root:
lss.append(lss_tmp2[0])
elif len(lss_tmp2) > 0:
lss.extend(lss_tmp2)
if remove_double_count_sentence :
uss=label_sents2uni_sent(lss)
lss_new =[[np.mean([lss[id][0] for id in uss[s]]),lss[uss[s][0]][1] ] for s in uss ]
return lss_new
else:
return lss
def tree(self):
"""
Starting with the ``root`` node, build a dependency tree using the NLTK
``Tree`` constructor. Dependency labels are omitted.
"""
node = self.root
word = node['word']
deps = chain.from_iterable(node['deps'].values())
return Tree(word, [self._tree(dep) for dep in deps])
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
root_label='S', strict=False):
"""
Convert the CoNLL IOB format to a tree.
"""
tree = Tree(root_label, [])
for (word, postag, chunktag) in sentence:
if chunktag is None:
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as O
tree.append((word,postag))
elif chunktag.startswith('B-'):
tree.append(Tree(chunktag[2:], [(word,postag)]))
elif chunktag.startswith('I-'):
if (len(tree)==0 or not isinstance(tree[-1], Tree) or
tree[-1].label() != chunktag[2:]):
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as B-*
tree.append(Tree(chunktag[2:], [(word,postag)]))
else:
tree[-1].append((word,postag))
elif chunktag == 'O':
tree.append((word,postag))
else:
raise ValueError("Bad conll tag %r" % chunktag)
return tree
def _untag(self, tree):
for i, child in enumerate(tree):
if isinstance(child, Tree):
self._untag(child)
elif isinstance(child, tuple):
tree[i] = child[0]
else:
raise ValueError('expected child to be Tree or tuple')
return tree
def tree2semi_rel(tree):
"""
Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).
In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
identifies pairs whose first member is a list (possibly empty) of terminal
strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).
:param tree: a chunk tree
:return: a list of pairs (list(str), ``Tree``)
:rtype: list of tuple
"""
from nltk.tree import Tree
semi_rels = []
semi_rel = [[], None]
for dtr in tree:
if not isinstance(dtr, Tree):
semi_rel[0].append(dtr)
else:
# dtr is a Tree
semi_rel[1] = dtr
semi_rels.append(semi_rel)
semi_rel = [[], None]
return semi_rels
util.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 19
收藏 0
点赞 0
评论 0
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
root_label='S', strict=False):
"""
Convert the CoNLL IOB format to a tree.
"""
tree = Tree(root_label, [])
for (word, postag, chunktag) in sentence:
if chunktag is None:
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as O
tree.append((word,postag))
elif chunktag.startswith('B-'):
tree.append(Tree(chunktag[2:], [(word,postag)]))
elif chunktag.startswith('I-'):
if (len(tree)==0 or not isinstance(tree[-1], Tree) or
tree[-1].label() != chunktag[2:]):
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as B-*
tree.append(Tree(chunktag[2:], [(word,postag)]))
else:
tree[-1].append((word,postag))
elif chunktag == 'O':
tree.append((word,postag))
else:
raise ValueError("Bad conll tag %r" % chunktag)
return tree
chunked.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 20
收藏 0
点赞 0
评论 0
def _untag(self, tree):
for i, child in enumerate(tree):
if isinstance(child, Tree):
self._untag(child)
elif isinstance(child, tuple):
tree[i] = child[0]
else:
raise ValueError('expected child to be Tree or tuple')
return tree
relextract.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def tree2semi_rel(tree):
"""
Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).
In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
identifies pairs whose first member is a list (possibly empty) of terminal
strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).
:param tree: a chunk tree
:return: a list of pairs (list(str), ``Tree``)
:rtype: list of tuple
"""
from nltk.tree import Tree
semi_rels = []
semi_rel = [[], None]
for dtr in tree:
if not isinstance(dtr, Tree):
semi_rel[0].append(dtr)
else:
# dtr is a Tree
semi_rel[1] = dtr
semi_rels.append(semi_rel)
semi_rel = [[], None]
return semi_rels
def getLeaves(ptree):
import nltk
rs = []
if isinstance(ptree, nltk.tree.ParentedTree):
if len(ptree)>0:
if isinstance(ptree[0], unicode):
rs.append(ptree)
for node in ptree:
if isinstance(node, nltk.tree.ParentedTree):
if len(node)>0:
if isinstance(node[0], unicode):
rs.append(node)
else:
rs.extend(getLeaves(node))
return rs
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
root_label='S', strict=False):
"""
Convert the CoNLL IOB format to a tree.
"""
tree = Tree(root_label, [])
for (word, postag, chunktag) in sentence:
if chunktag is None:
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as O
tree.append((word,postag))
elif chunktag.startswith('B-'):
tree.append(Tree(chunktag[2:], [(word,postag)]))
elif chunktag.startswith('I-'):
if (len(tree)==0 or not isinstance(tree[-1], Tree) or
tree[-1].label() != chunktag[2:]):
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as B-*
tree.append(Tree(chunktag[2:], [(word,postag)]))
else:
tree[-1].append((word,postag))
elif chunktag == 'O':
tree.append((word,postag))
else:
raise ValueError("Bad conll tag %r" % chunktag)
return tree
def _untag(self, tree):
for i, child in enumerate(tree):
if isinstance(child, Tree):
self._untag(child)
elif isinstance(child, tuple):
tree[i] = child[0]
else:
raise ValueError('expected child to be Tree or tuple')
return tree
def tree2semi_rel(tree):
"""
Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).
In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
identifies pairs whose first member is a list (possibly empty) of terminal
strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).
:param tree: a chunk tree
:return: a list of pairs (list(str), ``Tree``)
:rtype: list of tuple
"""
from nltk.tree import Tree
semi_rels = []
semi_rel = [[], None]
for dtr in tree:
if not isinstance(dtr, Tree):
semi_rel[0].append(dtr)
else:
# dtr is a Tree
semi_rel[1] = dtr
semi_rels.append(semi_rel)
semi_rel = [[], None]
return semi_rels
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
root_label='S', strict=False):
"""
Convert the CoNLL IOB format to a tree.
"""
tree = Tree(root_label, [])
for (word, postag, chunktag) in sentence:
if chunktag is None:
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as O
tree.append((word,postag))
elif chunktag.startswith('B-'):
tree.append(Tree(chunktag[2:], [(word,postag)]))
elif chunktag.startswith('I-'):
if (len(tree)==0 or not isinstance(tree[-1], Tree) or
tree[-1].label() != chunktag[2:]):
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as B-*
tree.append(Tree(chunktag[2:], [(word,postag)]))
else:
tree[-1].append((word,postag))
elif chunktag == 'O':
tree.append((word,postag))
else:
raise ValueError("Bad conll tag {0!r}".format(chunktag))
return tree
def _untag(self, tree):
for i, child in enumerate(tree):
if isinstance(child, Tree):
self._untag(child)
elif isinstance(child, tuple):
tree[i] = child[0]
else:
raise ValueError('expected child to be Tree or tuple')
return tree
def tree2semi_rel(tree):
"""
Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).
In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
identifies pairs whose first member is a list (possibly empty) of terminal
strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).
:param tree: a chunk tree
:return: a list of pairs (list(str), ``Tree``)
:rtype: list of tuple
"""
from nltk.tree import Tree
semi_rels = []
semi_rel = [[], None]
for dtr in tree:
if not isinstance(dtr, Tree):
semi_rel[0].append(dtr)
else:
# dtr is a Tree
semi_rel[1] = dtr
semi_rels.append(semi_rel)
semi_rel = [[], None]
return semi_rels
AIDA_Check_and_Rewrite.py 文件源码
项目:Extracting-Core-Claims
作者: TomJansen25
项目源码
文件源码
阅读 16
收藏 0
点赞 0
评论 0
def check_if_atomic(sentence, parsed_sentence, tags):
counter = 0
atomic_check = re.compile("|".join(not_atomic_list))
tree = Tree('s', parsed_sentence)
for child in tree:
string = str(child)
if string.startswith("(S"):
counter += 1
if atomic_check.search(sentence_lower):
return False
elif counter > 1:
return False
else:
return True
def trainSVMTK(docs, pairs, dditype, model="svm_tk_classifier.model", excludesentences=[]):
if os.path.isfile("ddi_models/" + model):
os.remove("ddi_models/" + model)
if os.path.isfile("ddi_models/" + model + ".txt"):
os.remove("ddi_models/" + model + ".txt")
#docs = use_external_data(docs, excludesentences, dditype)
xerrors = 0
with open("ddi_models/" + model + ".txt", 'w') as train:
#print pairs
for p in pairs:
if dditype != "all" and pairs[p][relations.PAIR_DDI] and pairs[p][relations.PAIR_TYPE] != dditype:
continue
sid = relations.getSentenceID(p)
if sid not in excludesentences:
tree = pairs[p][relations.PAIR_DEP_TREE][:]
#print "tree1:", tree
#if len(docs[sid][ddi.SENTENCE_ENTITIES]) > 20:
#print line
# line = "1 |BT| (ROOT (NP (NN candidatedrug) (, ,) (NN candidatedrug))) |ET|"
# xerrors += 1
#else:
line = get_svm_train_line(tree, pairs[p], sid,
docs[sid][relations.SENTENCE_PAIRS][p])
if not pairs[p][relations.PAIR_DDI]:
line = '-' + line
elif pairs[p][relations.PAIR_TYPE] != dditype and dditype != "all":
line = '-' + line
train.write(line)
#print "tree errors:", xerrors
svmlightcall = Popen(["./svm-light-TK-1.2/svm-light-TK-1.2.1/svm_learn", "-t", "5",
"-L", "0.4", "-T", "2", "-S", "2", "-g", "10",
"-D", "0", "-C", "T", basedir + model + ".txt", basedir + model],
stdout = PIPE, stderr = PIPE)
res = svmlightcall.communicate()
if not os.path.isfile("ddi_models/" + model):
print "failed training model " + basedir + model
print res
sys.exit()
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
root_label='S', strict=False):
"""
Convert the CoNLL IOB format to a tree.
"""
tree = Tree(root_label, [])
for (word, postag, chunktag) in sentence:
if chunktag is None:
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as O
tree.append((word,postag))
elif chunktag.startswith('B-'):
tree.append(Tree(chunktag[2:], [(word,postag)]))
elif chunktag.startswith('I-'):
if (len(tree)==0 or not isinstance(tree[-1], Tree) or
tree[-1].label() != chunktag[2:]):
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as B-*
tree.append(Tree(chunktag[2:], [(word,postag)]))
else:
tree[-1].append((word,postag))
elif chunktag == 'O':
tree.append((word,postag))
else:
raise ValueError("Bad conll tag %r" % chunktag)
return tree
def _untag(self, tree):
for i, child in enumerate(tree):
if isinstance(child, Tree):
self._untag(child)
elif isinstance(child, tuple):
tree[i] = child[0]
else:
raise ValueError('expected child to be Tree or tuple')
return tree
def tree2semi_rel(tree):
"""
Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).
In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
identifies pairs whose first member is a list (possibly empty) of terminal
strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).
:param tree: a chunk tree
:return: a list of pairs (list(str), ``Tree``)
:rtype: list of tuple
"""
from nltk.tree import Tree
semi_rels = []
semi_rel = [[], None]
for dtr in tree:
if not isinstance(dtr, Tree):
semi_rel[0].append(dtr)
else:
# dtr is a Tree
semi_rel[1] = dtr
semi_rels.append(semi_rel)
semi_rel = [[], None]
return semi_rels
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
root_label='S', strict=False):
"""
Convert the CoNLL IOB format to a tree.
"""
tree = Tree(root_label, [])
for (word, postag, chunktag) in sentence:
if chunktag is None:
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as O
tree.append((word,postag))
elif chunktag.startswith('B-'):
tree.append(Tree(chunktag[2:], [(word,postag)]))
elif chunktag.startswith('I-'):
if (len(tree)==0 or not isinstance(tree[-1], Tree) or
tree[-1].label() != chunktag[2:]):
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as B-*
tree.append(Tree(chunktag[2:], [(word,postag)]))
else:
tree[-1].append((word,postag))
elif chunktag == 'O':
tree.append((word,postag))
else:
raise ValueError("Bad conll tag %r" % chunktag)
return tree
def _untag(self, tree):
for i, child in enumerate(tree):
if isinstance(child, Tree):
self._untag(child)
elif isinstance(child, tuple):
tree[i] = child[0]
else:
raise ValueError('expected child to be Tree or tuple')
return tree