def get_section(html_file, section_name, possible_next_sections):
    """Return the text of one section of an HTML document.

    The document's ``<p>`` elements are scanned in order. The section starts
    at the first paragraph whose text contains ``section_name`` within its
    first 20 characters, and ends just before the first *later* paragraph
    whose text contains any of ``possible_next_sections`` within its first
    20 characters. Matching is case-insensitive.

    Args:
        html_file: Passed unchanged to ``html.parse`` (``html`` is presumably
            ``lxml.html`` -- confirm against the file's imports).
        section_name: Heading text that marks the start of the section.
        possible_next_sections: Iterable of heading texts, any of which
            marks the end of the section.

    Returns:
        The concatenated ``text_content()`` of every paragraph from the
        start heading (inclusive) up to the next heading (exclusive).

    Raises:
        SectionNotFound: If no paragraph matches ``section_name``, or if no
            later paragraph matches one of ``possible_next_sections`` (a
            section that runs to the end of the document is therefore
            reported as not found).
    """
    paragraphs = html.parse(html_file).xpath("//p")

    # Paragraph text is lower-cased before matching, so the search terms must
    # be lower-cased too. Building the list once also lets callers pass a
    # one-shot iterator.
    wanted = section_name.lower()
    next_headings = [name.lower() for name in possible_next_sections]

    begin = end = -1
    for i, par in enumerate(paragraphs):
        # Headings are only recognised near the start of a paragraph.
        head = par.text_content().lower()[:20]
        if begin < 0:
            if wanted in head:
                begin = i
            # The start paragraph is never tested as an end marker; otherwise
            # a heading matching both patterns would yield an empty section.
            continue
        if any(name in head for name in next_headings):
            end = i
            # Stop at the FIRST following heading. This also covers a section
            # starting at paragraph 0, which a "begin > 0" test would miss.
            break

    if begin < 0 or end < 0:
        raise SectionNotFound("Section %s not found." % section_name)
    return "".join([par.text_content() for par in paragraphs[begin:end]])
评论列表
文章目录