import codecs
import os


def remove_bom(filename):
    if os.path.isfile(filename):
        f = open(filename, 'rb')
        # read the first 4 bytes, enough to hold the longest BOM
        header = f.read(4)
        # check for a BOM; UTF-32 is tested before UTF-16 because, on
        # little-endian platforms, BOM_UTF32 begins with the bytes of BOM_UTF16
        bom_len = 0
        encodings = [(codecs.BOM_UTF32, 4),
                     (codecs.BOM_UTF16, 2),
                     (codecs.BOM_UTF8, 3)]
        for h, l in encodings:
            if header.startswith(h):
                bom_len = l
                break
        # rewind, then skip the appropriate number of bytes
        f.seek(0)
        f.read(bom_len)
        return f
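A minimal usage sketch for remove_bom, assuming a throwaway file written with an explicit UTF-8 BOM (the file name and payload are illustrative):

import codecs

with open('sample.txt', 'wb') as out:
    out.write(codecs.BOM_UTF8 + u'hello'.encode('utf-8'))

f = remove_bom('sample.txt')
print(f.read())  # the 3-byte BOM has been skipped; only b'hello' remains
f.close()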
Python BOM_UTF16 example source code
def export(self):
    # StringIO and to_unicode are helpers provided by the enclosing module
    out = StringIO()
    final = StringIO()
    import csv
    writer = csv.writer(out, delimiter='\t')
    if self.rows:
        import codecs
        # write a single UTF-16 BOM at the very start of the output
        final.write(codecs.BOM_UTF16)
        writer.writerow(
            [to_unicode(col, "utf8") for col in self.rows.colnames])
        # re-encode the buffered row as UTF-16 and strip the 2-byte BOM
        # that encode() prepends, so only the leading BOM remains
        data = out.getvalue().decode("utf8")
        data = data.encode("utf-16")
        data = data[2:]
        final.write(data)
        out.truncate(0)
        records = self.represented()
        for row in records:
            writer.writerow(
                [str(col).decode('utf8').encode("utf-8") for col in row])
            data = out.getvalue().decode("utf8")
            data = data.encode("utf-16")
            data = data[2:]
            final.write(data)
            out.truncate(0)
    return str(final.getvalue())
def export(self):
    import cStringIO
    out = cStringIO.StringIO()
    final = cStringIO.StringIO()
    import csv
    writer = csv.writer(out, delimiter='\t')
    if self.rows:
        import codecs
        # write a single UTF-16 BOM at the very start of the output
        final.write(codecs.BOM_UTF16)
        writer.writerow(
            [unicode(col).encode("utf8") for col in self.rows.colnames])
        # each encode("utf-16") call prepends its own 2-byte BOM; drop it
        data = out.getvalue().decode("utf8")
        data = data.encode("utf-16")
        data = data[2:]
        final.write(data)
        out.truncate(0)
        records = self.represented()
        for row in records:
            writer.writerow(
                [str(col).decode('utf8').encode("utf-8") for col in row])
            data = out.getvalue().decode("utf8")
            data = data.encode("utf-16")
            data = data[2:]
            final.write(data)
            out.truncate(0)
    return str(final.getvalue())
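Both export variants rely on the same pattern: write codecs.BOM_UTF16 once at the start, then encode each buffered chunk to UTF-16 and slice off the fresh 2-byte BOM that every encode('utf-16') call prepends. A self-contained sketch of that pattern, with made-up row data:

import codecs
import io

chunks = [u'id\tname\n', u'1\tAlice\n', u'2\tBob\n']  # illustrative rows
final = io.BytesIO()
final.write(codecs.BOM_UTF16)          # a single BOM at the very start
for chunk in chunks:
    data = chunk.encode('utf-16')      # prepends its own 2-byte BOM...
    final.write(data[2:])              # ...which must be dropped
print(final.getvalue().decode('utf-16'))  # round-trips to the original text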
import codecs
import os
import tempfile
import uuid


def _wmic_output():
    """
    Returns the output from running the built-in `wmic` command.

    Redirects the output of `wmic` to a temporary file and then reads it
    back in. This would be cleaner if done using subprocess, but attempting
    to capture `stdout` internally led to freezing under Windows XP. (This
    may have been happening because the script is not being run as a main
    process.)
    """
    # choose a unique file name (re-entrant/thread-safe/crash-safe)
    OUTPUT_PATH = os.path.join(
        tempfile.gettempdir(),
        "kolibri_disks-{}.txt".format(uuid.uuid4())
    )
    # pipe output from the WMIC command to the temp file
    cmd = "wmic logicaldisk list full /format:csv > {}".format(OUTPUT_PATH)
    returnCode = os.system(cmd)
    if returnCode:
        raise Exception("Could not run command '{}'".format(cmd))
    # output from WMIC is ostensibly UTF-16
    with open(OUTPUT_PATH, 'rb') as f:
        bin_output = f.read()
    # The very first time WMIC is run on a Windows machine, the output gets
    # mangled: the BOM is replaced by WMIC's initialization message, so we
    # need to put it back. (On all subsequent runs, these lines do nothing.)
    INIT_MSG = "Please wait while WMIC is being installed.".encode('ascii')  # Yes, ascii.
    bin_output = bin_output.replace(INIT_MSG, codecs.BOM_UTF16)
    # finally, decode the well-formed UTF-16 byte string
    output = bin_output.decode('utf-16')
    # clean up temp file
    os.remove(OUTPUT_PATH)
    return output
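The decoded text is WMIC's CSV output, which starts with blank lines before the header row; a hypothetical way to consume it with csv.DictReader (the cleanup step and the printed fields are assumptions, though DeviceID and FreeSpace are standard logicaldisk properties):

import csv

output = _wmic_output()
# drop the blank line(s) WMIC emits before the CSV header
lines = [line for line in output.splitlines() if line.strip()]
for disk in csv.DictReader(lines):
    print(disk.get('DeviceID'), disk.get('FreeSpace'))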
import codecs


def decode(data, errors='strict'):
    """
    Decode a byte string, honouring a UTF BOM if one is present.
    :param bytes data: input byte string
    :param str errors: error handling level
    :return: str
    """
    output = ''
    try:
        if len(data) < 3:
            if codecs.BOM_UTF8.startswith(data):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                output = ""
        elif data[:3] == codecs.BOM_UTF8:
            (output, size) = codecs.utf_8_decode(data[3:], errors)
        elif data[:2] == codecs.BOM_UTF16:
            # BOM_UTF16 is only two bytes long, so compare and slice by 2
            output = data[2:].decode('utf-16')
        else:
            # no BOM present
            (output, size) = codecs.utf_8_decode(data, errors)
        return output
    except Exception:
        # probably not text content (images, binary files, etc.)
        try:
            return data.decode('cp1251')
        except Exception:
            return ""
import codecs
import re
from email.header import decode_header


def get_decoded_header(header, value):
    subject, encoding = decode_header(value)[0]
    subject = subject.strip()  # extra whitespace will mess up decoding
    if isinstance(subject, bytes):
        # Remove the Byte Order Mark (BOM) from UTF strings
        if encoding == 'utf-8':
            return re.sub(codecs.BOM_UTF8, b"", subject).decode(encoding)
        elif encoding == 'utf-16':
            return re.sub(codecs.BOM_UTF16, b"", subject).decode(encoding)
        elif encoding == 'utf-32':
            return re.sub(codecs.BOM_UTF32, b"", subject).decode(encoding)
        # Try various UTF decodings for any unknown 8-bit encoding
        elif encoding == 'unknown-8bit':
            for enc, bom in [('utf-8', codecs.BOM_UTF8),
                             ('utf-32', codecs.BOM_UTF32),  # 32 before 16 so it raises errors
                             ('utf-16', codecs.BOM_UTF16)]:
                try:
                    return re.sub(bom, b"", subject).decode(enc)
                except UnicodeDecodeError:
                    continue
            # If none of those encodings work, return it in RFC 2047 format
            return str(subject)
        # Provide the RFC 2047 format string if the encoding is unknown.
        # Better to have the analyst decode it themselves than to provide
        # a mangled string.
        elif encoding is None:
            return str(subject)
        else:
            return subject.decode(encoding)
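A quick check with an RFC 2047 encoded word whose payload starts with a UTF-8 BOM (the header value is made up; 77u/ is the BOM in base64):

raw = '=?utf-8?B?77u/SGVsbG8sIHdvcmxk?='  # BOM + 'Hello, world', base64-encoded
print(get_decoded_header('Subject', raw))  # -> 'Hello, world', BOM removed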