def custom_builder():
"""
Test parser w. custom builder.
>>> with open(SIMPLE_XMLFILE) as f:
... data = f.read()
>>> class Builder:
... def start(self, tag, attrib):
... print "start", tag
... def end(self, tag):
... print "end", tag
... def data(self, text):
... pass
>>> builder = Builder()
>>> parser = ET.XMLParser(target=builder)
>>> parser.feed(data)
start root
start element
end element
start element
end element
start empty-element
end empty-element
end root
>>> with open(SIMPLE_NS_XMLFILE) as f:
... data = f.read()
>>> class Builder:
... def start(self, tag, attrib):
... print "start", tag
... def end(self, tag):
... print "end", tag
... def data(self, text):
... pass
... def pi(self, target, data):
... print "pi", target, repr(data)
... def comment(self, data):
... print "comment", repr(data)
>>> builder = Builder()
>>> parser = ET.XMLParser(target=builder)
>>> parser.feed(data)
pi pi 'data'
comment ' comment '
start {namespace}root
start {namespace}element
end {namespace}element
start {namespace}element
end {namespace}element
start {namespace}empty-element
end {namespace}empty-element
end {namespace}root
"""
python类XMLParser()的实例源码
def interface():
r"""
Test element tree interface.
>>> element = ET.Element("tag")
>>> check_element(element)
>>> tree = ET.ElementTree(element)
>>> check_element(tree.getroot())
>>> element = ET.Element("t\xe4g", key="value")
>>> tree = ET.ElementTree(element)
>>> repr(element) # doctest: +ELLIPSIS
"<Element 't\\xe4g' at 0x...>"
>>> element = ET.Element("tag", key="value")
Make sure all standard element methods exist.
>>> check_method(element.append)
>>> check_method(element.extend)
>>> check_method(element.insert)
>>> check_method(element.remove)
>>> check_method(element.getchildren)
>>> check_method(element.find)
>>> check_method(element.iterfind)
>>> check_method(element.findall)
>>> check_method(element.findtext)
>>> check_method(element.clear)
>>> check_method(element.get)
>>> check_method(element.set)
>>> check_method(element.keys)
>>> check_method(element.items)
>>> check_method(element.iter)
>>> check_method(element.itertext)
>>> check_method(element.getiterator)
These methods return an iterable. See bug 6472.
>>> check_method(element.iter("tag").next)
>>> check_method(element.iterfind("tag").next)
>>> check_method(element.iterfind("*").next)
>>> check_method(tree.iter("tag").next)
>>> check_method(tree.iterfind("tag").next)
>>> check_method(tree.iterfind("*").next)
These aliases are provided:
>>> assert ET.XML == ET.fromstring
>>> assert ET.PI == ET.ProcessingInstruction
>>> assert ET.XMLParser == ET.XMLTreeBuilder
"""
def parsefile():
"""
Test parsing from file.
>>> tree = ET.parse(SIMPLE_XMLFILE)
>>> normalize_crlf(tree)
>>> tree.write(sys.stdout)
<root>
<element key="value">text</element>
<element>text</element>tail
<empty-element />
</root>
>>> tree = ET.parse(SIMPLE_NS_XMLFILE)
>>> normalize_crlf(tree)
>>> tree.write(sys.stdout)
<ns0:root xmlns:ns0="namespace">
<ns0:element key="value">text</ns0:element>
<ns0:element>text</ns0:element>tail
<ns0:empty-element />
</ns0:root>
>>> with open(SIMPLE_XMLFILE) as f:
... data = f.read()
>>> parser = ET.XMLParser()
>>> parser.version # doctest: +ELLIPSIS
'Expat ...'
>>> parser.feed(data)
>>> print serialize(parser.close())
<root>
<element key="value">text</element>
<element>text</element>tail
<empty-element />
</root>
>>> parser = ET.XMLTreeBuilder() # 1.2 compatibility
>>> parser.feed(data)
>>> print serialize(parser.close())
<root>
<element key="value">text</element>
<element>text</element>tail
<empty-element />
</root>
>>> target = ET.TreeBuilder()
>>> parser = ET.XMLParser(target=target)
>>> parser.feed(data)
>>> print serialize(parser.close())
<root>
<element key="value">text</element>
<element>text</element>tail
<empty-element />
</root>
"""
def custom_builder():
"""
Test parser w. custom builder.
>>> with open(SIMPLE_XMLFILE) as f:
... data = f.read()
>>> class Builder:
... def start(self, tag, attrib):
... print "start", tag
... def end(self, tag):
... print "end", tag
... def data(self, text):
... pass
>>> builder = Builder()
>>> parser = ET.XMLParser(target=builder)
>>> parser.feed(data)
start root
start element
end element
start element
end element
start empty-element
end empty-element
end root
>>> with open(SIMPLE_NS_XMLFILE) as f:
... data = f.read()
>>> class Builder:
... def start(self, tag, attrib):
... print "start", tag
... def end(self, tag):
... print "end", tag
... def data(self, text):
... pass
... def pi(self, target, data):
... print "pi", target, repr(data)
... def comment(self, data):
... print "comment", repr(data)
>>> builder = Builder()
>>> parser = ET.XMLParser(target=builder)
>>> parser.feed(data)
pi pi 'data'
comment ' comment '
start {namespace}root
start {namespace}element
end {namespace}element
start {namespace}element
end {namespace}element
start {namespace}empty-element
end {namespace}empty-element
end {namespace}root
"""
def tei_to_chapters(fname):
""" Convert a TEI 2 xml into an array of chapters with text,
and return the title. """
data = codecs.open(fname, 'r', 'utf-8').read().replace(' ', '')
utf8_parser = etree.XMLParser(encoding='utf-8')
book = etree.fromstring(data.encode('utf-8'), parser=utf8_parser)
all_text = u""
chapters = []
chap_title = ''
text = ''
title = ''
for item in book.iter():
if item.tag == 'author':
author = item.text
if item.tag == 'title' and not title and \
item.attrib.get('type') and item.attrib.get('type') == 'main':
title = item.text
if item.tag == 'head':
if item.attrib and item.attrib.get('rend') and \
item.attrib.get('rend') == 'h2' and not item.text is None:
chap_title = item.text
if item.tag == 'head':
if item.attrib and item.attrib.get('rend') and \
item.attrib.get('rend') == 'h3' and not item.text is None:
chap_title += '\n' + item.text
if item.tag == 'div':
if item.attrib and item.attrib.get('type') and \
item.attrib.get('type') == 'chapter':
all_text += text
chapters.append([chap_title, text])
text = ''
chap_title = ''
if 'rend' in item.attrib and not item.text is None:
text += item.text + "\n"
if item.tag == "p" and not item.text is None:
text += item.text + "\n"
chapters.append([chap_title, text])
return author, title, chapters, all_text
def interface():
r"""
Test element tree interface.
>>> element = ET.Element("tag")
>>> check_element(element)
>>> tree = ET.ElementTree(element)
>>> check_element(tree.getroot())
>>> element = ET.Element("t\xe4g", key="value")
>>> tree = ET.ElementTree(element)
>>> repr(element) # doctest: +ELLIPSIS
"<Element 't\\xe4g' at 0x...>"
>>> element = ET.Element("tag", key="value")
Make sure all standard element methods exist.
>>> check_method(element.append)
>>> check_method(element.extend)
>>> check_method(element.insert)
>>> check_method(element.remove)
>>> check_method(element.getchildren)
>>> check_method(element.find)
>>> check_method(element.iterfind)
>>> check_method(element.findall)
>>> check_method(element.findtext)
>>> check_method(element.clear)
>>> check_method(element.get)
>>> check_method(element.set)
>>> check_method(element.keys)
>>> check_method(element.items)
>>> check_method(element.iter)
>>> check_method(element.itertext)
>>> check_method(element.getiterator)
These methods return an iterable. See bug 6472.
>>> check_method(element.iter("tag").next)
>>> check_method(element.iterfind("tag").next)
>>> check_method(element.iterfind("*").next)
>>> check_method(tree.iter("tag").next)
>>> check_method(tree.iterfind("tag").next)
>>> check_method(tree.iterfind("*").next)
These aliases are provided:
>>> assert ET.XML == ET.fromstring
>>> assert ET.PI == ET.ProcessingInstruction
>>> assert ET.XMLParser == ET.XMLTreeBuilder
"""
def parsefile():
"""
Test parsing from file.
>>> tree = ET.parse(SIMPLE_XMLFILE)
>>> normalize_crlf(tree)
>>> tree.write(sys.stdout)
<root>
<element key="value">text</element>
<element>text</element>tail
<empty-element />
</root>
>>> tree = ET.parse(SIMPLE_NS_XMLFILE)
>>> normalize_crlf(tree)
>>> tree.write(sys.stdout)
<ns0:root xmlns:ns0="namespace">
<ns0:element key="value">text</ns0:element>
<ns0:element>text</ns0:element>tail
<ns0:empty-element />
</ns0:root>
>>> with open(SIMPLE_XMLFILE) as f:
... data = f.read()
>>> parser = ET.XMLParser()
>>> parser.version # doctest: +ELLIPSIS
'Expat ...'
>>> parser.feed(data)
>>> print serialize(parser.close())
<root>
<element key="value">text</element>
<element>text</element>tail
<empty-element />
</root>
>>> parser = ET.XMLTreeBuilder() # 1.2 compatibility
>>> parser.feed(data)
>>> print serialize(parser.close())
<root>
<element key="value">text</element>
<element>text</element>tail
<empty-element />
</root>
>>> target = ET.TreeBuilder()
>>> parser = ET.XMLParser(target=target)
>>> parser.feed(data)
>>> print serialize(parser.close())
<root>
<element key="value">text</element>
<element>text</element>tail
<empty-element />
</root>
"""
def custom_builder():
"""
Test parser w. custom builder.
>>> with open(SIMPLE_XMLFILE) as f:
... data = f.read()
>>> class Builder:
... def start(self, tag, attrib):
... print "start", tag
... def end(self, tag):
... print "end", tag
... def data(self, text):
... pass
>>> builder = Builder()
>>> parser = ET.XMLParser(target=builder)
>>> parser.feed(data)
start root
start element
end element
start element
end element
start empty-element
end empty-element
end root
>>> with open(SIMPLE_NS_XMLFILE) as f:
... data = f.read()
>>> class Builder:
... def start(self, tag, attrib):
... print "start", tag
... def end(self, tag):
... print "end", tag
... def data(self, text):
... pass
... def pi(self, target, data):
... print "pi", target, repr(data)
... def comment(self, data):
... print "comment", repr(data)
>>> builder = Builder()
>>> parser = ET.XMLParser(target=builder)
>>> parser.feed(data)
pi pi 'data'
comment ' comment '
start {namespace}root
start {namespace}element
end {namespace}element
start {namespace}element
end {namespace}element
start {namespace}empty-element
end {namespace}empty-element
end {namespace}root
"""
def interface():
r"""
Test element tree interface.
>>> element = ET.Element("tag")
>>> check_element(element)
>>> tree = ET.ElementTree(element)
>>> check_element(tree.getroot())
>>> element = ET.Element("t\xe4g", key="value")
>>> tree = ET.ElementTree(element)
>>> repr(element) # doctest: +ELLIPSIS
"<Element 't\\xe4g' at 0x...>"
>>> element = ET.Element("tag", key="value")
Make sure all standard element methods exist.
>>> check_method(element.append)
>>> check_method(element.extend)
>>> check_method(element.insert)
>>> check_method(element.remove)
>>> check_method(element.getchildren)
>>> check_method(element.find)
>>> check_method(element.iterfind)
>>> check_method(element.findall)
>>> check_method(element.findtext)
>>> check_method(element.clear)
>>> check_method(element.get)
>>> check_method(element.set)
>>> check_method(element.keys)
>>> check_method(element.items)
>>> check_method(element.iter)
>>> check_method(element.itertext)
>>> check_method(element.getiterator)
These methods return an iterable. See bug 6472.
>>> check_method(element.iter("tag").next)
>>> check_method(element.iterfind("tag").next)
>>> check_method(element.iterfind("*").next)
>>> check_method(tree.iter("tag").next)
>>> check_method(tree.iterfind("tag").next)
>>> check_method(tree.iterfind("*").next)
These aliases are provided:
>>> assert ET.XML == ET.fromstring
>>> assert ET.PI == ET.ProcessingInstruction
>>> assert ET.XMLParser == ET.XMLTreeBuilder
"""
def parsefile():
"""
Test parsing from file.
>>> tree = ET.parse(SIMPLE_XMLFILE)
>>> normalize_crlf(tree)
>>> tree.write(sys.stdout)
<root>
<element key="value">text</element>
<element>text</element>tail
<empty-element />
</root>
>>> tree = ET.parse(SIMPLE_NS_XMLFILE)
>>> normalize_crlf(tree)
>>> tree.write(sys.stdout)
<ns0:root xmlns:ns0="namespace">
<ns0:element key="value">text</ns0:element>
<ns0:element>text</ns0:element>tail
<ns0:empty-element />
</ns0:root>
>>> with open(SIMPLE_XMLFILE) as f:
... data = f.read()
>>> parser = ET.XMLParser()
>>> parser.version # doctest: +ELLIPSIS
'Expat ...'
>>> parser.feed(data)
>>> print serialize(parser.close())
<root>
<element key="value">text</element>
<element>text</element>tail
<empty-element />
</root>
>>> parser = ET.XMLTreeBuilder() # 1.2 compatibility
>>> parser.feed(data)
>>> print serialize(parser.close())
<root>
<element key="value">text</element>
<element>text</element>tail
<empty-element />
</root>
>>> target = ET.TreeBuilder()
>>> parser = ET.XMLParser(target=target)
>>> parser.feed(data)
>>> print serialize(parser.close())
<root>
<element key="value">text</element>
<element>text</element>tail
<empty-element />
</root>
"""
def custom_builder():
"""
Test parser w. custom builder.
>>> with open(SIMPLE_XMLFILE) as f:
... data = f.read()
>>> class Builder:
... def start(self, tag, attrib):
... print "start", tag
... def end(self, tag):
... print "end", tag
... def data(self, text):
... pass
>>> builder = Builder()
>>> parser = ET.XMLParser(target=builder)
>>> parser.feed(data)
start root
start element
end element
start element
end element
start empty-element
end empty-element
end root
>>> with open(SIMPLE_NS_XMLFILE) as f:
... data = f.read()
>>> class Builder:
... def start(self, tag, attrib):
... print "start", tag
... def end(self, tag):
... print "end", tag
... def data(self, text):
... pass
... def pi(self, target, data):
... print "pi", target, repr(data)
... def comment(self, data):
... print "comment", repr(data)
>>> builder = Builder()
>>> parser = ET.XMLParser(target=builder)
>>> parser.feed(data)
pi pi 'data'
comment ' comment '
start {namespace}root
start {namespace}element
end {namespace}element
start {namespace}element
end {namespace}element
start {namespace}empty-element
end {namespace}empty-element
end {namespace}root
"""