dataframe_utils.py 文件源码-python代码片段

def read_and_clean_csv_to_dataframe(filename_or_stream, encoding='utf-8'):
    """
    Reads a utf-8 encoded CSV directly into a pandas dataframe as string values and scrubs np.NaN values to Python None

    :param str filename_or_stream: path to CSV
    :return:
    """
    # pulls data in as utf8, all as strings, and without pre whitespace padding
    try:
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            dtype=str,
            skipinitialspace=True
        )
    except AttributeError:
        # this is an empty dataframe and pandas crashed because it can't coerce the columns to strings
        # issue and PR to fix is open on pandas core at https://github.com/pydata/pandas/issues/12048
        # slated for 1.8 release
        # so for now just try loading the dataframe without specifying dtype
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            skipinitialspace=True
        )
    logging.info('File read via the pandas read_csv methodology.')

    # coerces pandas nulls (of np.NaN type) into python None
    data = data.where((pd.notnull(data)), None)

    # coerces string representations of Python None to a real Python None
    data[data == 'None'] = None
    data[data == ''] = None
    logging.info("Dataframe of shape %s has been retrieved." % str(data.shape))

    return data