# google_doc.py source file - Python code snippet

def _extract_sections(self):
        """
        Split the html in self._content into sections and store the
        resulting Section objects in self._sections.

        Here is an example of what a section header looks like in the
        html of a Google Document:

        <h3 class="c1"><a name="h.699ffpepx6zs"></a><span>Hello World
        </span></h3>

        We split the content of the Google Document up using a regular
        expression that matches the above header. re.split puts the
        matching groups into the list as well as the content between
        the matches:

        http://docs.python.org/library/re.html#re.split

        so the list it returns is laid out as

        [preamble, level, id, title, content, level, id, title, ...]

        i.e. one leading chunk followed by groups of four items, one
        group per header found.

        One big thing we do in this method is replace the ugly section
        id that Google creates with a nicely slugified version of the
        section title. This makes for pretty urls.

        Sections whose adjusted level ends up below 1 are not stored.
        """
        self._sections = []
        header = r'<h(?P<level>\d) class="[^"]+">' \
            r'<a name="(?P<id>[^"]+)"></a>'      \
            r'<span>(?P<title>[^<]+)</span>'     \
            r'</h\d>'
        parts = re.split(header, self._content)
        # parts[0] is whatever comes before the first header, so start
        # at index 1 and walk the list four items at a time instead of
        # repeatedly popping from the front (each pop(0) shifts the
        # whole list).
        for start in range(1, len(parts), 4):
            level, google_id, title, content = parts[start:start + 4]
            # Only byte strings need decoding. Calling .decode() on text
            # that is already unicode fails on non-ascii titles (and
            # does not exist at all on Python 3 str).
            if isinstance(title, bytes):
                title = title.decode('utf8')
            section = Section(
                # hack: cause we started with h3 in google docs
                level=int(level) - 2,
                id=google_id,
                title=title,
                content=content,
                )
            # swap Google's generated id for a slug of the title
            section['id'] = slugify(section['title'])
            if section['level'] >= 1:
                self._sections.append(section)