def _extract_sections(self):
    """
    Split ``self._content`` into sections and store them in
    ``self._sections``.

    Here is an example of what a section header looks like in the
    html of a Google Document:

        <h3 class="c1"><a name="h.699ffpepx6zs"></a><span>Hello World
        </span></h3>

    The content is split with a regular expression that matches the
    header above. Because the pattern contains three capturing groups,
    re.split returns, for every header, the level, the Google id and the
    title, followed by the html up to the next header:
    http://docs.python.org/library/re.html#re.split

    The ugly section id that Google creates is replaced with a slugified
    version of the section title, which makes for pretty urls.

    Only sections whose adjusted level (header level minus 2) is at
    least 1 are kept. Anything before the first header is discarded.
    """
    self._sections = []
    header = r'<h(?P<level>\d) class="[^"]+">' \
             r'<a name="(?P<id>[^"]+)"></a>' \
             r'<span>(?P<title>[^<]+)</span>' \
             r'</h\d>'
    # The first element is the text preceding the first header; it does
    # not belong to any section, so it is dropped.
    pieces = re.split(header, self._content)[1:]
    # Each header contributes exactly four items: three captured groups
    # plus the html that follows it.
    group_size = 4
    for start in range(0, len(pieces), group_size):
        level, google_id, title, content = pieces[start:start + group_size]
        # Decode only byte strings: text strings have no usable
        # .decode() (AttributeError on Python 3, implicit ASCII encode
        # on Python 2).
        if isinstance(title, bytes):
            title = title.decode('utf8')
        section = Section(
            # hack: cause we started with h3 in google docs
            level=int(level) - 2,
            id=google_id,
            title=title,
            content=content,
        )
        # Swap Google's generated anchor for a readable slug.
        section['id'] = slugify(section['title'])
        if section['level'] >= 1:
            self._sections.append(section)
# (Web page residue, not part of the code: "Comment list" /
# "Table of contents" navigation labels.)