def factorize(train, test, features, na_value=-9999, full=False, sort=True):
    """Factorize categorical features into integer codes.

    For every column in ``features`` a vocabulary of unique values is
    learned with ``pd.factorize`` and the column is overwritten, in both
    ``train`` and ``test``, with each value's position in that vocabulary.

    Both DataFrames are modified in place (the encoded columns are
    assigned back onto ``train`` and ``test``) and are also returned.

    Parameters
    ----------
    train : pd.DataFrame
    test : pd.DataFrame
    features : list or str
        Column names in the DataFrame to be encoded.  A single column
        name given as a plain string is treated as a one-element list.
    na_value : int, default -9999
        Code written for values that are not found in the vocabulary
        (e.g. values of ``test`` unseen in ``train`` when ``full`` is
        False, or missing values, which ``pd.factorize`` leaves out of
        its uniques by default).  With ``na_value=-1`` the ``-1`` produced
        by ``Index.get_indexer`` is kept as is.
    full : bool, default False
        If True, learn the vocabulary from the values of both train and
        test; otherwise learn it from train only.
    sort : bool, default True
        Sort the vocabulary by value before assigning codes.

    Returns
    -------
    train : pd.DataFrame
    test : pd.DataFrame
    """
    # Iterating a bare string would yield single characters, not a column.
    if isinstance(features, str):
        features = [features]

    for column in features:
        if full:
            values = pd.concat([train[column], test[column]])
        else:
            values = train[column]

        # Only the uniques are needed; the codes are recomputed per frame
        # below so that train and test share one vocabulary.
        _, uniques = pd.factorize(values, sort=sort)

        # get_indexer returns -1 for every value absent from ``uniques``.
        train[column] = uniques.get_indexer(train[column])
        test[column] = uniques.get_indexer(test[column])

        if na_value != -1:
            train[column] = train[column].replace(-1, na_value)
            test[column] = test[column].replace(-1, na_value)

    return train, test
# (web page residue) Comment list
# (web page residue) Table of contents