tokenizer.py source code

python

Project: KDDCUP2016    Author: hugochan
# Python 2 source. Requires lxml at module level; match_any() and
# fix_hyphens() are helpers defined elsewhere in the project.
from lxml import html

def get_section(self, html_file, possible_section_names, possible_next_sections):

        # Parse the HTML file and extract the lower-cased textual content
        # of every <p> element as a UTF-8 byte string.
        h = html.parse(html_file)
        pars = [paragraph.text_content().lower().encode("UTF-8") for paragraph in h.xpath("//p")]

        # First we go backwards trying to find the latest occurrence of 
        # one of the possible names of the section of interest 
        begin = None
        for i in reversed(xrange(len(pars))) :
            if match_any(pars[i], possible_section_names) :
                begin = i
                break

        # If the start wasn't found, just halt right away   
        if (begin is None) :
            return ""

        # Otherwise we can look for the end of the section starting from the start
        # of the found section.
        end = None
        for j in xrange(begin+1, len(pars)) :
            if match_any(pars[j], possible_next_sections) :
                end = j
                break

        # If the end of the section was not found, it is not safe to keep
        # this content, so return an empty string.
        if (end is None) :
            return ""

        # Otherwise join all paragraphs inside the section found
        return unicode("".join([fix_hyphens(p) for p in pars[begin:end]]), "UTF-8")
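
For context, here is a minimal usage sketch. It assumes, purely for illustration, that this method lives on a class called Tokenizer and that the parsed document uses headings like "abstract" and "introduction"; the class name, file name, and section-name lists are hypothetical, not taken from the KDDCUP2016 project.

# Usage sketch (Python 2, matching the snippet above). Tokenizer, the file
# name and the section-name lists are illustrative assumptions only.
tok = Tokenizer()

# Headings that may open the section we want, and headings that may open
# the following section (which is where ours ends).
abstract = tok.get_section(
    "paper.html",
    possible_section_names=["abstract"],
    possible_next_sections=["introduction", "1. introduction"],
)

# get_section() returns "" when either the start or the end heading is
# missing, so an empty result means the section could not be extracted.
if abstract:
    print abstract.encode("utf-8")
else:
    print "section not found"

Because the method refuses to guess when either boundary is missing, callers only ever receive a complete section or an empty string.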