index.py 文件源码-python代码片段

def tei_to_chapters(fname):
    """ Convert a TEI 2 xml into an array of chapters with text,
    and return the title. """

    data = codecs.open(fname, 'r', 'utf-8').read().replace('&nbsp', '')

    utf8_parser = etree.XMLParser(encoding='utf-8')
    book = etree.fromstring(data.encode('utf-8'), parser=utf8_parser)

    all_text = u""
    chapters = []
    chap_title = ''
    text = ''
    title = ''

    for item in book.iter():
        if item.tag == 'author':
            author = item.text
        if item.tag == 'title' and not title and \
                item.attrib.get('type') and item.attrib.get('type') == 'main':
            title = item.text

        if item.tag == 'head':
            if item.attrib and item.attrib.get('rend') and \
            item.attrib.get('rend') == 'h2' and not item.text is None:
                chap_title = item.text

        if item.tag == 'head':
            if item.attrib and item.attrib.get('rend') and \
            item.attrib.get('rend') == 'h3' and not item.text is None:
                chap_title += '\n' + item.text

        if item.tag == 'div':
            if item.attrib and item.attrib.get('type') and \
            item.attrib.get('type') == 'chapter':
                all_text += text
                chapters.append([chap_title, text])
                text = ''
                chap_title = ''

        if 'rend' in item.attrib and not item.text is None:
            text += item.text + "\n"
        if item.tag == "p" and not item.text is None:
            text += item.text + "\n"

    chapters.append([chap_title, text])
    return author, title, chapters, all_text