def _discretize_by_frequency(col, num_bins, labels):
    """Bin ``col`` into intervals delimited by evenly spaced quantiles.

    Bin edges are the quantiles of ``col`` at ``0, 1/num_bins, ..., 1``.
    Duplicate edges (which occur when many observations share a value) are
    merged, so fewer than ``num_bins`` bins may be produced; in that case a
    notice is printed and the reduced bin count is used from then on.

    Args:
        col: Object exposing ``quantile(list_of_fractions)`` and accepted by
            ``pd.cut`` (the code calls both on it).
        num_bins: Requested number of bins.
        labels: Labels for the bins, one per bin. Any falsy value (``None``,
            empty list, ...) selects the default labels ``0 .. num_bins-1``.

    Returns:
        The result of ``pd.cut(col, bins, labels=..., include_lowest=True)``.

    Raises:
        ValueError: If ``labels`` is given but its length differs from the
            number of bins actually generated (after merging duplicates).
    """
    # Compute each fraction as x / num_bins instead of x * (1.0 / num_bins):
    # the product is not exact in floating point (e.g. 49 * (1.0 / 49) is
    # 0.9999999999999999), which put the last edge just below the maximum of
    # ``col`` and made pd.cut assign NaN to the largest value(s).
    fractions = [x / float(num_bins) for x in range(num_bins + 1)]

    # set() merges identical quantile values; sorted() restores edge order.
    bins = sorted(set(col.quantile(fractions)))

    if len(bins) - 1 < num_bins:
        num_bins = len(bins) - 1
        print('...Only %d bins (unbalanced) generated due to overlapping percentile boundaries.'%num_bins)

    if labels:
        # Checked against the possibly reduced bin count, not the requested one.
        if len(labels) != num_bins:
            raise ValueError('Length of assigned labels not consistent with num_bins!')
        group_names = labels
    else:
        group_names = range(num_bins)

    # include_lowest=True keeps the minimum value inside the first interval.
    return pd.cut(col, bins, labels=group_names, include_lowest=True)
评论列表
文章目录