def discretize(data, vars_to_discretize, n_bins):
    """
    Discretize selected columns of ``data`` and report the binning used.

    NOTE(review): ``ps`` is assumed to be the pandas module and ``np`` numpy;
    both are expected to be imported at file level.

    Parameters
    ----------
    data
        Anything accepted by ``ps.DataFrame``. It is copied before any
        modification, so the caller's object is left untouched.
    vars_to_discretize : dict
        Maps a column label to its discretization type:
        ``'Equal'`` (equal-width intervals, ``ps.cut``),
        ``'Freq'`` (quantile-based intervals, ``ps.qcut``) or
        ``'Bins'`` (explicit bin margins, ``np.digitize``).
    n_bins : dict
        Maps a column label to the number of bins (``'Equal'``/``'Freq'``)
        or to the sequence of bin margins (``'Bins'``).

    Returns
    -------
    tuple
        ``(data_subset, bins)``: the copied frame with the selected columns
        replaced by bin indices, and a dict mapping each discretized column
        to its bin definition (the edges returned by ``ps.cut``/``ps.qcut``,
        or the provided margins for ``'Bins'``).

    Raises
    ------
    ValueError
        If a discretization type is not one of the three supported values,
        or if ``ps.qcut`` still fails after the number of quantiles has been
        reduced down to 1.
    """
    data_subset = ps.DataFrame(data).copy()
    bins = {}
    for column, method in vars_to_discretize.items():
        values = data_subset[column]

        if method == 'Equal':
            # discretize by splitting into equal intervals
            out, binning = ps.cut(values, bins=n_bins[column],
                                  labels=False, retbins=True)
        elif method == 'Freq':
            # discretize by frequency; when the requested number of quantiles
            # cannot be built, retry with one quantile less
            nb = n_bins[column]
            while True:
                try:
                    out, binning = ps.qcut(values, q=nb,
                                           labels=False, retbins=True)
                    break
                except ValueError:
                    nb -= 1
                    if nb < 1:
                        # nothing left to try: surface the qcut error
                        # instead of looping forever
                        raise
        elif method == 'Bins':
            # discretize based on provided bin margins
            binning = n_bins[column]
            out = np.digitize(values, binning, right=True) - 1
        else:
            raise ValueError(
                "Unknown discretization type {!r} for variable {!r}; "
                "expected 'Equal', 'Freq' or 'Bins'".format(method, column)
            )

        data_subset[column] = out

        # replace NA values with a special index (1 + max) -
        # if it has not been done so automatically as in np.digitize
        coded = data_subset[column]
        missing = coded.isnull()
        if missing.any():
            # single .loc assignment so the write reaches data_subset itself
            data_subset.loc[missing, column] = coded.max() + 1

        bins[column] = binning
    return data_subset, bins
# Web-page residue, not code: "Comment list"
# Web-page residue, not code: "Table of contents"