# rm_duplicates.py -- source file, Python code snippet

def __init__(self, file_name):
        """Load a TEI XML file and parse its ``<TEI>`` root element.

        The file is read as UTF-8 text and split into three parts:

        * ``self.before_root`` -- everything preceding the first ``<TEI``
          (kept verbatim as a string),
        * the ``<TEI ...> ... </TEI>`` span, which is parsed into
          ``self.root``,
        * ``self.after_root`` -- everything following the first ``</TEI>``
          (kept verbatim as a string).

        Args:
            file_name: Path of the XML file to read.

        Raises:
            ValueError: If none of the UTF-8 spellings occurs in the file,
                or if ``<TEI`` / ``</TEI>`` cannot be found.
            SystemExit: With exit status 15 if the ``<TEI>`` span is not
                well-formed XML; the parser message goes to stderr first.
        """
        with open(file_name, encoding='utf-8') as file:
            content = file.read()
        # Coarse sanity check: the encoding name only has to occur somewhere
        # in the file text, not necessarily in the XML declaration.
        if not any(u in content for u in ('utf-8', 'utf8', 'UTF8', 'UTF-8')):
            raise ValueError("XML file is not encoded in UTF-8. Please recode "
                    "the file or extend this parser and XML writer.")
        tei_start = content.find('<TEI')
        if tei_start < 0:
            raise ValueError("Couldn't find string `<TEI` in the XML file.  Please extend this parser.")
        self.before_root = content[:tei_start]
        content = content[tei_start:]
        tei_end = content.find('</TEI>')
        if tei_end < 0:
            raise ValueError("Couldn't find `</TEI>` in the input file, please extend the parser.")
        # Move past the closing tag so that it stays part of the parsed span.
        tei_end += len('</TEI>')
        self.after_root = content[tei_end:]
        content = content[:tei_end]
        # NOTE(review): CommentedTreeBuilder is defined outside this block;
        # presumably it keeps XML comments in the resulting tree -- confirm.
        parser = ET.XMLParser(target=CommentedTreeBuilder())
        try:
            parser.feed(content)
        except ET.ParseError as e:
            sys.stderr.write("Error while parsing input file\n")
            # sys.stderr is a text stream: write the message as str.
            # Encoding it to bytes and appending '\n' raises TypeError on
            # Python 3, which would hide the actual parse error.
            sys.stderr.write(str(e) + '\n')
            sys.exit(15)

        self.root = parser.close()