def bin_data(path, write_path, num_chunks, binning):
    """Bin the continuous features through bucket or quantile binning.

    Reads every CSV found under ``path`` into one DataFrame, drops the IP
    address columns, bins each column listed in the module-level
    ``cols_to_std``, then splits the result into ``num_chunks`` CSV files
    under ``write_path``.

    Parameters
    ----------
    path : str
        The path where the dataset to be binned is located.
    write_path : str
        The path where to save the binned dataset.
    num_chunks : int
        The number of file splits to perform on the binned dataset.
    binning : int
        The type of binning to perform on the dataset:
        0 for bucket (equal-width) binning, 1 for quantile (decile) binning.
    """
    # get the list of files found in PATH
    files = nd.list_files(path=path)
    # Collect per-file frames and concatenate once: DataFrame.append was
    # deprecated in pandas 1.4 and removed in 2.0, and appending in a loop
    # copies the whole frame each iteration (quadratic).
    frames = []
    for file in files:
        frames.append(pd.read_csv(filepath_or_buffer=file, names=column_names))
        print('appending : {}'.format(file))
    df = pd.concat(frames) if frames else pd.DataFrame()
    # remove dst_ip_add and src_ip_add features
    df = df.drop(labels=['dst_ip_add', 'src_ip_add'], axis=1)
    binning = int(binning)  # convert once instead of on every column
    for col in cols_to_std:
        if binning == 0:
            # bucket binning: 10 equal-width edges over the column's range,
            # then map each value to its bucket index
            bins = np.linspace(df[col].min(), df[col].max(), 10)
            df[col] = np.digitize(df[col], bins, right=True)
            print('min : {}, max : {}'.format(df[col].min(), df[col].max()))
        elif binning == 1:
            # decile (quantile) binning; duplicate bin edges are dropped
            df[col] = pd.qcut(df[col], 10, labels=False, duplicates='drop')
            print('min : {}, max : {}'.format(df[col].min(), df[col].max()))
    for chunk_id, chunk in enumerate(np.array_split(df, num_chunks)):
        # split and save the dataframe to CSV files, one per chunk
        chunk.to_csv(path_or_buf=os.path.join(write_path, '{}.csv'.format(chunk_id)),
                     columns=columns_to_save, header=None, index=False)
        print('Saving CSV file : {path}'.format(path=os.path.join(write_path, str(chunk_id))))
# NOTE(review): removed stray web-page artifacts ("评论列表" / "文章目录",
# i.e. "comment list" / "article table of contents") that were pasted in
# from a blog scrape and made the module un-importable.