python类BOM_UTF32的实例源码-面圈网

def remove_bom(filename):
    """Open *filename* in binary mode, positioned just past any Unicode BOM.

    The first four bytes are inspected for a UTF-8, UTF-16 or UTF-32 byte
    order mark in either byte order; if one is found, the returned file
    object's position is set to the first byte after it, otherwise to the
    start of the file.

    Args:
        filename: Path of the file to open.

    Returns:
        An open binary file object (the caller is responsible for closing
        it), or ``None`` when *filename* is not an existing regular file.
    """
    if not os.path.isfile(filename):
        return None

    # Both byte orders are listed explicitly: codecs.BOM_UTF16 / BOM_UTF32 are
    # only aliases for the *native* order and would miss the other one.
    # UTF-32 is tested before UTF-16 because the UTF-32-LE mark
    # (FF FE 00 00) begins with the UTF-16-LE mark (FF FE).
    boms = (
        codecs.BOM_UTF32_LE,
        codecs.BOM_UTF32_BE,
        codecs.BOM_UTF8,
        codecs.BOM_UTF16_LE,
        codecs.BOM_UTF16_BE,
    )

    f = open(filename, 'rb')
    try:
        # read first 4 bytes (the longest possible BOM)
        header = f.read(4)
        bom_len = next((len(bom) for bom in boms if header.startswith(bom)), 0)
        # skip the BOM, or rewind to the start when there is none
        f.seek(bom_len)
    except Exception:
        # Do not leak the handle if inspecting the file fails.
        f.close()
        raise
    return f

def get_decoded_header(header, value):
    """Decode the first chunk of an encoded mail header value to text.

    ``decode_header(value)`` is called and only its first
    ``(data, charset)`` pair is used. When that data is ``bytes`` it is
    decoded and any byte order mark (U+FEFF) is removed from the result.

    Args:
        header: Not used by the code visible here.
        value: Raw header value handed to ``decode_header``.

    Returns:
        The decoded text. For charset ``None``, or for ``unknown-8bit`` data
        that is not valid UTF-8/UTF-32/UTF-16, ``str()`` of the raw bytes is
        returned instead (on Python 3 that is the ``b'...'`` repr), so the
        analyst can decode it themselves rather than get a mangled string.
        NOTE(review): no return for non-bytes data is visible in this part
        of the function -- confirm against the full file.

    Raises:
        UnicodeDecodeError: If the bytes are invalid for a declared charset.
        LookupError: If the declared charset is unknown to Python.
    """
    subject, encoding = decode_header(value)[0]
    subject = subject.strip()  # extra whitespace will mess up encoding
    if isinstance(subject, bytes):
        # The BOM is removed from the *decoded* text. Deleting its byte
        # pattern from the raw data is unsafe for UTF-16/UTF-32: the pattern
        # can straddle two neighbouring code units and corrupt valid text.
        # The utf-16/utf-32 codecs consume a leading BOM on their own.
        bom_char = '\ufeff'
        if encoding in ('utf-8', 'utf-16', 'utf-32'):
            return subject.decode(encoding).replace(bom_char, '')
        # Try various UTF decodings for any unknown 8bit encodings
        if encoding == 'unknown-8bit':
            # utf-32 is tried before utf-16 because it is stricter and
            # raises on data that utf-16 would silently accept.
            for candidate in ('utf-8', 'utf-32', 'utf-16'):
                try:
                    return subject.decode(candidate).replace(bom_char, '')
                except UnicodeDecodeError:
                    continue
            # None of the candidates fit: hand back the raw bytes as text
            return str(subject)
        # No charset information at all: hand back the raw bytes as text.
        # Better to have the analyst decode themselves than to provide a
        # mangled string.
        if encoding is None:
            return str(subject)
        return subject.decode(encoding)