def rank_cat(df_tr, ycol, df_te=None, cols=None, rank=True, tag=''):
    """Target-encode categorical columns in place, optionally as ranks.

    For each column in ``cols`` the mean of ``ycol`` is computed per category
    on ``df_tr``. Each category is then replaced either by that mean
    (``rank=False``) or by the 0-based position of its mean in ascending
    order (``rank=True``). The encoded values are written to the column
    ``tag + col`` of ``df_tr`` and, when given, of ``df_te``.

    Args:
        df_tr: Training frame; must contain ``ycol`` and every column in
            ``cols``. Modified in place.
        ycol: Name of the target column whose per-category mean is used.
        df_te: Optional second frame encoded with the mapping learned from
            ``df_tr``. Modified in place.
        cols: Columns to encode. Defaults to every column of ``df_tr`` whose
            dtype compares equal to ``'object'``.
        rank: If true, encode with the ordinal rank of the category mean
            instead of the mean itself.
        tag: Prefix for the output column name. With the default ``''`` the
            original column is overwritten.

    Returns:
        None. Results are stored on the frames passed in. When no columns
        are selected, a message is printed and nothing is changed.

    Note:
        The encoding of each training row is derived from a mean that
        includes that row's own target, so it overfits; a leave-one-out
        (LOO) scheme is the suggested alternative.
    """
    if cols is None:
        cols = [c for c in df_tr.columns.values if df_tr[c].dtype == 'object']
    # ``len`` rather than truthiness: ``cols`` may be a numpy array.
    if len(cols) == 0:
        print("no cat cols found")
        return
    for col in cols:
        means = df_tr.groupby(col)[ycol].mean()
        if rank:
            # argsort of argsort turns values into their ordinal positions.
            ranks = np.argsort(np.argsort(means.to_numpy()))
            encoding = dict(zip(means.index, ranks))
        else:
            encoding = means.to_dict()
        # groupby drops missing keys, so a NaN/None category in the training
        # column has no entry in ``encoding``. ``map`` yields NaN for such
        # values instead of raising KeyError, matching the test-frame path.
        df_tr[tag + col] = df_tr[col].map(encoding)
        if df_te is not None:
            # Categories unseen in training are encoded as NaN.
            df_te[tag + col] = df_te[col].map(encoding)
# Web-page residue from the source article: "Comment list" / "Table of contents".