def tei_to_chapters(fname):
    """Convert a TEI 2 XML file into chapters and return its metadata.

    The document is walked in document order.  Every ``<div type="chapter">``
    closes the chapter collected so far and starts a new one, so the first
    entry of the chapter list holds whatever was collected before the first
    chapter division (``['', '']`` when there was nothing).

    Chapter titles come from ``<head rend="h2">`` (starts the title) and
    ``<head rend="h3">`` (appended on a new line).  Chapter text is the
    leading text of every element that carries a ``rend`` attribute and of
    every ``<p>`` element, one line per element.

    Args:
        fname: Path of the UTF-8 encoded TEI file to read.

    Returns:
        A tuple ``(author, title, chapters, all_text)``:

        * ``author`` -- text of the last ``<author>`` element, or ``''``
          when the document has none.
        * ``title`` -- text of the first ``<title type="main">`` element
          with non-empty text, or ``''`` when there is none.
        * ``chapters`` -- list of ``[chapter_title, chapter_text]`` pairs.
        * ``all_text`` -- concatenation of every chapter text, including
          the last chapter.
    """
    # The context manager closes the file handle even if reading fails.
    with codecs.open(fname, 'r', 'utf-8') as source:
        data = source.read()

    # XML does not predefine the ``&nbsp;`` entity, so it has to be removed
    # before the document is handed to the parser.
    # NOTE(review): the pattern previously read as a plain space, which
    # strips every space and fuses tag names with their attributes (making
    # the document unparseable).  Assumed to be a mangled ``&nbsp;`` --
    # confirm against the upstream version of this function.
    data = data.replace('&nbsp;', '')

    utf8_parser = etree.XMLParser(encoding='utf-8')
    book = etree.fromstring(data.encode('utf-8'), parser=utf8_parser)

    # Defaults so that a document without <author>/<title> still returns.
    author = ''
    title = ''
    chapters = []
    chap_title = ''
    lines = []  # text lines of the chapter currently being collected

    for item in book.iter():
        tag = item.tag
        attrib = item.attrib
        content = item.text

        if tag == 'author':
            author = content
        elif tag == 'title':
            # Keep the first main title that has text; later ones are ignored.
            if not title and attrib.get('type') == 'main':
                title = content
        elif tag == 'head':
            if content is not None:
                rend = attrib.get('rend')
                if rend == 'h2':
                    chap_title = content
                elif rend == 'h3':
                    chap_title += '\n' + content
        elif tag == 'div':
            if attrib.get('type') == 'chapter':
                # A new chapter division closes the one collected so far.
                chapters.append([chap_title, ''.join(lines)])
                lines = []
                chap_title = ''

        if content is not None:
            # NOTE(review): a <p> that also has a ``rend`` attribute matches
            # both tests and is therefore added twice.  Kept as-is because
            # callers may depend on it -- confirm whether this is intended.
            if 'rend' in attrib:
                lines.append(content + '\n')
            if tag == 'p':
                lines.append(content + '\n')

    # The last chapter has no following <div> to close it.
    chapters.append([chap_title, ''.join(lines)])

    # Built from the finished chapter list so the last chapter is included.
    all_text = u''.join(body for _, body in chapters)
    return author, title, chapters, all_text
评论列表
文章目录