def make_drop_duplicate(self, _df_csv_read_ori, _drop_duplicate, _label):
    """Drop rows whose feature values duplicate an earlier row.

    Every column except ``_label`` is treated as a feature column. Rows that
    share the same values in all feature columns are collapsed to the first
    occurrence. The de-duplicated frame is also written as a CSV backup to
    ``<self.data_src_path>/backup/<UTC timestamp>_dup.csvbk``.

    Args:
        _df_csv_read_ori: pandas DataFrame to de-duplicate.
        _drop_duplicate: Switch for this step. ``None``, the string
            ``'null'`` and ``False`` (or a value equal to one of them)
            disable it; any other value enables it.
        _label: Name of the label column, which is excluded from the
            duplicate comparison.

    Returns:
        The input DataFrame itself when the step is disabled, otherwise a
        new DataFrame with the duplicate rows removed.

    Raises:
        ValueError: If ``_label`` is not a column of ``_df_csv_read_ori``.
    """
    # Local import so this method does not depend on a file-level `import os`.
    import os

    # Membership test compares with ==, so the accepted "off" values are the
    # same ones the step has always recognised.
    if _drop_duplicate in (None, 'null', False):
        logging.info("No Duplicate")
        return _df_csv_read_ori

    cell_features = _df_csv_read_ori.columns.tolist()
    if _label not in cell_features:
        # Same exception type list.remove() would raise, with a usable message.
        raise ValueError(
            "label column {0!r} not found in dataframe columns".format(_label))
    cell_features.remove(_label)

    result_df = _df_csv_read_ori.drop_duplicates(subset=cell_features, keep="first")
    logging.info("duplicated row delete {0}".format(len(_df_csv_read_ori.index)-len(result_df.index)))

    # The backup directory is not guaranteed to exist; create it first so that
    # to_csv does not fail on a fresh data source path.
    backup_dir = self.data_src_path + "/backup/"
    os.makedirs(backup_dir, exist_ok=True)
    temp_duplicate_filename = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + "_dup.csvbk"
    result_df.to_csv(backup_dir + temp_duplicate_filename)
    return result_df
评论列表
文章目录