def read_and_clean_csv_to_dataframe(filename_or_stream, encoding='utf-8'):
    """
    Read a CSV into a pandas DataFrame of strings and scrub missing values to Python None.

    Every column is requested as ``str`` and whitespace following a delimiter is skipped.
    After loading, the frame is cast to ``object`` dtype and NaN cells, the literal
    string ``'None'`` and the empty string are all replaced with ``None``.

    :param filename_or_stream: path to the CSV, or an open file-like object; passed
        to ``pandas.read_csv`` as ``filepath_or_buffer``
    :param str encoding: text encoding passed to ``pandas.read_csv`` (default ``'utf-8'``)
    :return: the cleaned dataframe
    :rtype: pandas.DataFrame
    """
    # Remember where a stream started: if the first read fails part-way the stream
    # is already consumed, and the fallback read must start from the same place.
    start_position = None
    if hasattr(filename_or_stream, 'seek') and hasattr(filename_or_stream, 'tell'):
        try:
            start_position = filename_or_stream.tell()
        except (OSError, ValueError):
            # non-seekable stream; nothing to rewind to
            start_position = None

    # pulls data in with the requested encoding, all as strings, and without
    # whitespace padding after the delimiter
    try:
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            dtype=str,
            skipinitialspace=True,
        )
    except AttributeError:
        # Older pandas releases crash on an empty dataframe because the columns
        # cannot be coerced to strings
        # (https://github.com/pydata/pandas/issues/12048),
        # so retry without specifying dtype.
        if start_position is not None:
            filename_or_stream.seek(start_position)
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            skipinitialspace=True,
        )
    logging.info('File read via the pandas read_csv methodology.')

    # Only object columns can hold a real None; any other dtype silently turns it
    # back into NaN, so cast first.
    data = data.astype(object)
    # coerces pandas nulls (of np.NaN type) into python None
    data = data.where(pd.notnull(data), None)
    # coerces string representations of Python None to a real Python None
    data[data == 'None'] = None
    data[data == ''] = None
    logging.info("Dataframe of shape %s has been retrieved.", data.shape)
    return data
评论列表
文章目录