def bin_data(path, write_path, num_chunks, binning):
    """Bin the continuous features through bucket or quantile binning.

    Reads every CSV found under ``path`` into one DataFrame, drops the IP
    address columns, bins each column listed in the module-level
    ``cols_to_std``, then splits the result into ``num_chunks`` CSV files
    under ``write_path``.

    Parameters
    ----------
    path : str
        The path where the dataset to be binned is located.
    write_path : str
        The path where to save the binned dataset.
    num_chunks : int
        The number of file splits to perform on the binned dataset.
    binning : int
        The type of binning to perform on the dataset:
        0 for bucket (equal-width) binning, 1 for quantile (decile) binning.
    """
    # get the list of files found in PATH
    files = nd.list_files(path=path)
    # Collect per-file frames and concatenate once: DataFrame.append was
    # deprecated in pandas 1.4 and removed in 2.0, and appending in a loop
    # copies the whole frame each iteration (quadratic).
    frames = []
    for file in files:
        frames.append(pd.read_csv(filepath_or_buffer=file, names=column_names))
        print('appending : {}'.format(file))
    df = pd.concat(frames) if frames else pd.DataFrame()
    # remove dst_ip_add and src_ip_add features
    df = df.drop(labels=['dst_ip_add', 'src_ip_add'], axis=1)
    binning = int(binning)  # convert once instead of on every column
    for col in cols_to_std:
        if binning == 0:
            # bucket binning: 10 equal-width edges over the column's range,
            # then map each value to its bucket index
            bins = np.linspace(df[col].min(), df[col].max(), 10)
            df[col] = np.digitize(df[col], bins, right=True)
            print('min : {}, max : {}'.format(df[col].min(), df[col].max()))
        elif binning == 1:
            # decile (quantile) binning; duplicate bin edges are dropped
            df[col] = pd.qcut(df[col], 10, labels=False, duplicates='drop')
            print('min : {}, max : {}'.format(df[col].min(), df[col].max()))
    for chunk_id, chunk in enumerate(np.array_split(df, num_chunks)):
        # split and save the dataframe to CSV files, one per chunk
        chunk.to_csv(path_or_buf=os.path.join(write_path, '{}.csv'.format(chunk_id)),
                     columns=columns_to_save, header=None, index=False)
        print('Saving CSV file : {path}'.format(path=os.path.join(write_path, str(chunk_id))))
# NOTE(review): removed stray web-page artifacts ("评论列表" / "文章目录",
# i.e. "comment list" / "article table of contents") that were pasted in
# from a blog scrape and made the module un-importable.