def model_data(data, LECAT=False, NAMEAN=False, NA999=False, OH=False, ONLYCONT=False, ONLYCAT=False, ONLYCATOH=False, COLSREMOVAL=False, cols=None, maxCategories=300):
    """Return a preprocessed copy of ``data``; the input frame is not modified.

    The steps below run in this fixed order, each only when its flag is set.
    "Categorical" columns are those with ``object`` dtype and "continuous"
    columns are those selected by ``select_dtypes(["float", "int"])``; both
    lists are computed once, after the optional column removal.

    Args:
        data: pandas DataFrame to transform.
        COLSREMOVAL: drop the column label(s) given in ``cols``.
        cols: label(s) to drop when ``COLSREMOVAL`` is set. ``None`` (the
            default) means "nothing to drop".
        NAMEAN: fill nulls of every continuous column with that column's mean.
        NA999: fill nulls of every continuous column with ``-999``.
        LECAT: replace every categorical column by its ``pd.factorize`` codes
            (nulls become ``-1``).
        OH: one-hot encode (``pd.get_dummies`` with ``dummy_na=True``) the
            categorical columns that have at most ``maxCategories`` distinct
            values; the remaining ones are factorized and passed through
            ``DummycolumnsBins``.
        maxCategories: cardinality threshold used by ``OH``.
        ONLYCONT: keep only the continuous columns.
        ONLYCAT: keep only the categorical columns plus ``ID`` and ``target``.
        ONLYCATOH: keep every column that is not continuous, plus ``ID`` and
            ``target``.

    Returns:
        The transformed DataFrame.

    Raises:
        KeyError: if ``ONLYCAT`` or ``ONLYCATOH`` is set and the frame has no
            ``ID`` or ``target`` column (or, for ``ONLYCAT``, the categorical
            columns were already replaced by ``OH``).
    """
    data = data.copy()

    if COLSREMOVAL:
        # ``axis`` must be passed by keyword: the positional form was removed
        # in pandas 2.0.
        data = data.drop([] if cols is None else cols, axis=1)

    cat_var = list(data.select_dtypes(["object"]).columns)
    cont_var = list(data.select_dtypes(["float", "int"]).columns)

    if NAMEAN:
        for col in cont_var:
            data[col] = data[col].fillna(data[col].mean())

    if NA999:
        for col in cont_var:
            data[col] = data[col].fillna(-999)

    if LECAT:
        for col in cat_var:
            data[col] = pd.factorize(data[col])[0]

    if OH:
        # Count distinct values (a null counts as one value) once per column.
        n_levels = {col: len(data[col].unique()) for col in cat_var}
        cols2dummy = [col for col in cat_var if n_levels[col] <= maxCategories]
        colsNot2dummy = [col for col in cat_var if n_levels[col] > maxCategories]
        data = pd.get_dummies(data, dummy_na=True, columns=cols2dummy)
        # Binning of the high-cardinality columns.
        # NOTE(review): DummycolumnsBins is defined outside this block; it is
        # used here exactly as before (fit on the frame, then transform).
        for col in colsNot2dummy:
            data[col] = pd.factorize(data[col])[0]
            dcb = DummycolumnsBins(cols=col, prefix=col, nb_bins=2000)
            dcb.fit(data)
            pd_binned = dcb.transform(data)
            data = pd.concat([data, pd_binned], axis=1)

    if ONLYCONT:
        data = data[cont_var]

    if ONLYCAT:
        test_idx = data['ID']
        Y = data['target']
        # Explicit copy so the assignments below do not write into a slice.
        data = data[cat_var].copy()
        data['ID'] = test_idx
        data['target'] = Y

    if ONLYCATOH:
        test_idx = data['ID']
        Y = data['target']
        continuous = set(cont_var)
        # Keep the frame's own column order (a set difference would make the
        # order depend on string hashing); dict.fromkeys drops duplicate labels.
        kept = [col for col in dict.fromkeys(data.columns) if col not in continuous]
        print(kept)
        data = data[kept].copy()
        data['ID'] = test_idx
        data['target'] = Y

    return data
# Comment list
# Table of contents