def get_section(self, html_file, possible_section_names, possible_next_sections):
    """Extract the text of one section from an HTML document.

    The document is parsed and reduced to the text of its ``<p>`` elements,
    each lowercased and encoded as UTF-8. The section starts at the *last*
    paragraph accepted by ``match_any`` for ``possible_section_names`` and
    stops just before the first later paragraph accepted by ``match_any``
    for ``possible_next_sections``.

    Returns the paragraphs of that range, each passed through
    ``fix_hyphens`` and concatenated without a separator, as a ``unicode``
    string. Returns ``""`` when either the start or the end of the section
    cannot be located.
    """
    tree = html.parse(html_file)
    paragraphs = [
        node.text_content().lower().encode("UTF-8")
        for node in tree.xpath("//p")
    ]
    total = len(paragraphs)

    # Scan from the bottom up so the latest occurrence of a section title
    # wins over any earlier one.
    start = next(
        (idx for idx in reversed(range(total))
         if match_any(paragraphs[idx], possible_section_names)),
        None
    )
    if start is None:
        # No section heading at all: nothing to extract.
        return ""

    # Scan forward from the paragraph after the heading for the first
    # heading that opens a following section.
    stop = next(
        (idx for idx in range(start + 1, total)
         if match_any(paragraphs[idx], possible_next_sections)),
        None
    )
    if stop is None:
        # Without a closing boundary the extent of the section is unknown,
        # so the content is discarded rather than over-collected.
        return ""

    # The heading paragraph itself is included; the next heading is not.
    cleaned = [fix_hyphens(text) for text in paragraphs[start:stop]]
    return unicode("".join(cleaned), "UTF-8")
评论列表
文章目录