kdd98_initial_snapshot.py 文件源码

python
阅读 31 收藏 0 点赞 0 评论 0

项目:CustomerSim 作者: sisl 项目源码 文件源码
def discretize(data, vars_to_discretize, n_bins):

    '''
    Accepts data, a dictionary containing dicretization type for selected variables, and 
    a dictionary containing the number of bins for selected variables.

    Returns data after selected variables have been discretized, 
    together with binning definition for each variable.
    '''

    data_subset = ps.DataFrame(data).copy()
    bins = {}
    for i in vars_to_discretize:

        out = None
        binning = None

        # discretize by splitting into equal intervals
        if vars_to_discretize[i] == 'Equal': 
            out, binning = ps.cut(data_subset.ix[:,i],bins=n_bins[i],labels=False,retbins=True)

        # discretize by frequency
        elif vars_to_discretize[i] == 'Freq':
            nb = n_bins[i]
            while True:
                try:
                    out, binning = ps.qcut(data_subset.ix[:,i],q=nb,labels=False,retbins=True)
                    break
                except:
                    nb -= 1

        # discretize based on provided bin margins
        elif vars_to_discretize[i] == 'Bins':
            out = np.digitize(data_subset.ix[:,i], n_bins[i], right=True) - 1
            binning = n_bins[i]

        data_subset.ix[:,i] = out

        # replace NA variables with and special index (1+max) - 
        # if it has not been done so automatically an in np.digitize
        data_subset.ix[:,i][data_subset.ix[:,i].isnull()] = data_subset.ix[:,i].max() + 1
        bins[i] = binning

    return data_subset, bins
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号