def _compute_stats(self, pred, expo, loss, prem):
    n_samples, n_groups = pred.shape[0], self.n_groups
    pred_ser = pd.Series(pred)
    # overall loss ratio, used to normalize the per-group ratios below
    loss_to_returns = np.sum(loss) / np.sum(prem)
    rank = pd.qcut(pred_ser, n_groups, labels=False)
    # recompute the group count actually produced by qcut
    n_groups = np.amax(rank) + 1
    groups = np.arange(n_groups)  # if we ever go back to using n_groups...
    tab = pd.DataFrame({
        'rank': rank,
        'pred': pred,
        'prem': prem,
        'loss': loss,
        'expo': expo
    })
    grouped = tab[['rank', 'pred', 'prem', 'loss', 'expo']].groupby('rank')
    # relative loss ratio per quantile group
    agg_rlr = (grouped['loss'].agg(np.sum) / grouped['prem'].agg(np.sum)) / loss_to_returns
    return tab, agg_rlr, n_groups
Example source snippets for Python's pd.qcut()
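Before the collected snippets, a minimal sketch of what pd.qcut does (the data below is made up for illustration): it splits a series at its quantiles, so each bin holds roughly the same number of observations, unlike pd.cut's equal-width bins.

import numpy as np
import pandas as pd

values = pd.Series(np.random.default_rng(0).normal(size=100))

# labels=False returns integer bin indices; each quartile bin holds ~25 values
quartiles = pd.qcut(values, 4, labels=False)
print(quartiles.value_counts().sort_index())

# without labels, qcut returns the interval for each value instead
intervals = pd.qcut(values, 4)
print(intervals.cat.categories)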
def discretize(data, bins=5, quantile=False):
    '''
    Discretizes the data into `bins` bins.
    Uses pd.cut (equal-width bins) by default, pd.qcut (equal-frequency bins) otherwise.
    '''
    if quantile:
        new_data = pd.qcut(data, bins, labels=list(range(bins)))
    else:
        new_data = pd.cut(data, bins, labels=list(range(bins)))
    return new_data
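A quick illustration of the cut/qcut distinction on made-up data: equal-width bins concentrate skewed data in one bin, while quantile bins equalize the counts.

skewed = pd.Series(np.concatenate([np.zeros(90), np.arange(1, 11)]))
# equal-width: nearly everything falls into the lowest bin
print(discretize(skewed, bins=5).value_counts().sort_index())

spread = pd.Series(np.arange(100.0))
# equal-frequency: 20 values per bin (qcut would raise on heavily tied data)
print(discretize(spread, bins=5, quantile=True).value_counts().sort_index())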
def _recursive_category_gen(col, num_bins):
    """
    Generate quantile categories, recursively lowering the bin count
    until qcut succeeds.

    Parameters
    ----------
    col : pandas.Series
        the column of continuous values to bin
    num_bins : int
        how many quantiles to attempt first

    Returns
    -------
    num_bins : int
    categories : pandas.Series
    """
    bin_labels = range(num_bins)
    # base case catch
    if num_bins == 0:
        raise ValueError('Unable to perform qcut to 0 bins.')
    # we assume the num_bins count will work
    try:
        categories = pd.qcut(x=col, q=num_bins, labels=bin_labels)
        return num_bins, categories
    # if it does not, then we go down one bin and try again
    except ValueError:
        new_bin_count = num_bins - 1
        return _recursive_category_gen(col, new_bin_count)
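A usage sketch on made-up data with heavy ties: the requested bin count degrades until the quantile edges are unique.

tied = pd.Series([0] * 50 + list(range(1, 51)))
bins_used, cats = _recursive_category_gen(tied, 10)
print(bins_used)  # fewer than 10: tied values collapse the quantile edges
print(cats.value_counts().sort_index())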
def categorizeCI2(inputDF, subsampleFactor=10, title=None):
    #inputDF = normalize(inputDF)
    binLabels = ['Low', 'Medium', 'High']
    indices = range(0, inputDF.shape[0], subsampleFactor)
    plotDF = inputDF.iloc[indices].copy()
    # qcut runs on the full 'ci' column; pandas aligns on the index,
    # so each subsampled row receives its full-data tertile label
    plotDF['bin'] = pd.qcut(inputDF['ci'], len(binLabels), labels=binLabels)
    plotDF.drop(['ci'], axis=1, inplace=True)
    alpha = 0.3
    g = parallel_coordinates(plotDF, 'bin',
                             color=[[0.8, 0.0, 0.1, alpha],
                                    [0.0, 0.8, 0.1, alpha],
                                    [0.1, 0.1, 0.8, alpha],
                                    ])
    plt.xticks(rotation=270)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    if title:
        title += ' (factor=%d)' % subsampleFactor
        g.set_title(title)
    return g
From Chapter 03_Logistic Regression vs Random Forest.py (project: Statistics-for-Machine-Learning, author: PacktPublishing)
def IV_calc(data, var):
    # Information Value per bin: (good% - bad%) * log(good% / bad%)
    if data[var].dtypes == "object":
        dataf = data.groupby([var])['class'].agg(['count', 'sum'])
        dataf.columns = ["Total", "bad"]
        dataf["good"] = dataf["Total"] - dataf["bad"]
        dataf["bad_per"] = dataf["bad"] / dataf["bad"].sum()
        dataf["good_per"] = dataf["good"] / dataf["good"].sum()
        dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"] / dataf["bad_per"])
        return dataf
    else:
        # rank(method='first') breaks ties so qcut can always find 10 unique decile edges
        data['bin_var'] = pd.qcut(data[var].rank(method='first'), 10)
        dataf = data.groupby(['bin_var'])['class'].agg(['count', 'sum'])
        dataf.columns = ["Total", "bad"]
        dataf["good"] = dataf["Total"] - dataf["bad"]
        dataf["bad_per"] = dataf["bad"] / dataf["bad"].sum()
        dataf["good_per"] = dataf["good"] / dataf["good"].sum()
        dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"] / dataf["bad_per"])
        return dataf
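The per-bin I_V contributions sum to the variable's total Information Value. A usage sketch with a made-up frame (the binary 'class' column is what IV_calc expects; 'balance' is an illustrative name):

rng = np.random.default_rng(1)
demo = pd.DataFrame({'balance': rng.normal(size=1000),
                     'class': rng.integers(0, 2, size=1000)})
iv_table = IV_calc(demo, 'balance')
print(iv_table)
print('Total IV:', iv_table['I_V'].sum())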
def y_transform(Y, data, flatten):
    df_y = data[Y]
    # an int input thresholds at that value; a float input thresholds
    # at that quantile
    # below is for the case where the prediction is true or false
    # but the y-feature is in a different format (e.g. continuous)
    if flatten == 'mean':
        df_y = pd.DataFrame(df_y >= df_y.mean())
    elif flatten == 'median':
        df_y = pd.DataFrame(df_y >= df_y.median())
    elif flatten == 'mode':
        df_y = pd.DataFrame(df_y >= df_y.mode()[0])
    elif type(flatten) == int:
        df_y = pd.DataFrame(df_y >= flatten)
    elif type(flatten) == float:
        df_y = pd.DataFrame(df_y >= df_y.quantile(flatten))
    # below is for the case where the y-feature is converted
    # to a categorical, whether it is a number or a string
    elif flatten == 'cat_string':
        df_y = pd.Categorical(df_y)
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)
    elif flatten == 'cat_numeric':
        # qcut into quintiles; duplicates='drop' tolerates repeated bin edges
        df_y = pd.qcut(df_y, 5, duplicates='drop')
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)
    # for cases when the y-feature is already in the format
    # that the prediction output will be
    elif flatten == 'none':
        df_y = pd.DataFrame(df_y)
    return df_y
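A hedged usage sketch (the frame and column name are made up): flatten controls how a continuous target is squashed into classes.

frame = pd.DataFrame({'price': np.random.default_rng(2).exponential(size=50)})
print(y_transform('price', frame, 'median').head())      # booleans: above/below median
print(y_transform('price', frame, 'cat_numeric').head()) # quintile codes 0-4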
def compute(self, today, assets, out, factor, bins):
    # a pipeline-style CustomFactor compute: write quantile labels into out
    out[:] = pd.qcut(factor, bins, labels=False)
From returns_quantization.py (project: deep-learning-bitcoin, author: philipperemy)
def add_returns_in_place(df):  # modifies df
    close_prices_returns = compute_returns(df)
    num_bins = 10
    returns_bins = pd.qcut(close_prices_returns, num_bins)  # decile intervals
    bins_categories = returns_bins.values.categories
    returns_labels = pd.qcut(close_prices_returns, num_bins, labels=False)  # integer labels 0-9
    df['close_price_returns'] = close_prices_returns
    df['close_price_returns_bins'] = returns_bins
    df['close_price_returns_labels'] = returns_labels
    return df, bins_categories
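compute_returns is defined elsewhere in the project; a minimal stand-in, assuming simple percent-change returns on a close-price column (the actual column name in the project may differ):

def compute_returns(df):
    # hypothetical stand-in: percent change of the close price, NaN-filled at the start
    return df['close'].pct_change().fillna(0)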
From test_groupby.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_apply_use_categorical_name(self):
    from pandas import qcut
    cats = qcut(self.df.C, 4)

    def get_stats(group):
        return {'min': group.min(),
                'max': group.max(),
                'count': group.count(),
                'mean': group.mean()}

    result = self.df.groupby(cats).D.apply(get_stats)
    self.assertEqual(result.index.names[0], 'C')
def get_node_colors_by_attr(G, attr, num_bins=None, cmap='viridis', start=0, stop=1):
    """
    Get a list of node colors by binning some continuous-variable attribute into
    quantiles.

    Parameters
    ----------
    G : networkx multidigraph
    attr : string
        the name of the attribute
    num_bins : int
        how many quantiles (default None assigns each node to its own bin)
    cmap : string
        name of a colormap
    start : float
        where to start in the colorspace
    stop : float
        where to end in the colorspace

    Returns
    -------
    list
    """
    if num_bins is None:
        num_bins = len(G.nodes())
    bin_labels = range(num_bins)
    attr_values = pd.Series([data[attr] for node, data in G.nodes(data=True)])
    cats = pd.qcut(x=attr_values, q=num_bins, labels=bin_labels)
    colors = get_colors(num_bins, cmap, start, stop)
    node_colors = [colors[cat] for cat in cats]
    return node_colors
def get_edge_colors_by_attr(G, attr, num_bins=5, cmap='viridis', start=0, stop=1):
    """
    Get a list of edge colors by binning some continuous-variable attribute into
    quantiles.

    Parameters
    ----------
    G : networkx multidigraph
    attr : string
        the name of the continuous-variable attribute
    num_bins : int
        how many quantiles
    cmap : string
        name of a colormap
    start : float
        where to start in the colorspace
    stop : float
        where to end in the colorspace

    Returns
    -------
    list
    """
    if num_bins is None:
        num_bins = len(G.edges())
    bin_labels = range(num_bins)
    attr_values = pd.Series([data[attr] for u, v, key, data in G.edges(keys=True, data=True)])
    cats = pd.qcut(x=attr_values, q=num_bins, labels=bin_labels)
    colors = get_colors(num_bins, cmap, start, stop)
    edge_colors = [colors[cat] for cat in cats]
    return edge_colors
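Both helpers call get_colors, which these snippets do not define. A plausible stand-in, assuming it samples evenly spaced colors from a matplotlib colormap (the project's actual implementation may differ):

import matplotlib.pyplot as plt
import numpy as np

def get_colors(n, cmap='viridis', start=0.0, stop=1.0):
    # hypothetical stand-in: n RGBA tuples sampled evenly across the colormap
    return [plt.get_cmap(cmap)(x) for x in np.linspace(start, stop, n)]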
From data_manag&visualization.py (project: -Python-Analysis_of_wine_quality, author: ekolik)
def quartileSplit(wine_set):
    print("This is the quantile split of the wines' quality. The first column contains the intervals of wine quality;")
    print("the second contains the number of wine samples with quality in the corresponding interval.")
    # note: despite the name, qcut with q=3 produces three quantile groups, not quartiles
    wine_set["quality_quart"] = pd.qcut(wine_set["quality"], 3)
    print(wine_set.groupby("quality_quart").size())
def bin_data(path, write_path, num_chunks, binning):
    """Bins the continuous features through bucket or quantile binning

    Parameters
    ----------
    path : str
        The path where the dataset to be binned is located.
    write_path : str
        The path where to save the binned dataset.
    num_chunks : int
        The number of file splits to perform on the binned dataset.
    binning : int
        The type of binning to perform on the dataset: 0 for bucket binning, 1 for quantile binning.
    """
    # get the list of files found in PATH
    files = nd.list_files(path=path)
    # read and concatenate the CSV files (DataFrame.append was removed in pandas 2.0)
    frames = []
    for file in files:
        frames.append(pd.read_csv(filepath_or_buffer=file, names=column_names))
        print('appending : {}'.format(file))
    df = pd.concat(frames, ignore_index=True)
    # remove dst_ip_add and src_ip_add features
    df = df.drop(labels=['dst_ip_add', 'src_ip_add'], axis=1)
    for index in range(len(cols_to_std)):
        if int(binning) == 0:
            # bucket binning: 10 equal-width edges over the column's range
            bins = np.linspace(df[cols_to_std[index]].min(), df[cols_to_std[index]].max(), 10)
            df[cols_to_std[index]] = np.digitize(df[cols_to_std[index]], bins, right=True)
            print('min : {}, max : {}'.format(df[cols_to_std[index]].min(), df[cols_to_std[index]].max()))
        if int(binning) == 1:
            # decile (quantile) binning; duplicates='drop' merges tied edges
            df[cols_to_std[index]] = pd.qcut(df[cols_to_std[index]], 10, labels=False, duplicates='drop')
            print('min : {}, max : {}'.format(df[cols_to_std[index]].min(), df[cols_to_std[index]].max()))
    for id, df_i in enumerate(np.array_split(df, num_chunks)):
        # split and save the dataframe to CSV files
        df_i.to_csv(path_or_buf=os.path.join(write_path, '{id}.csv'.format(id=id)), columns=columns_to_save,
                    header=None, index=False)
        print('Saving CSV file : {path}'.format(path=os.path.join(write_path, '{id}'.format(id=id))))
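The two modes differ in where the edges fall; a small illustration on one made-up column:

col = pd.Series(np.concatenate([np.zeros(900), np.arange(1, 101)]))

# bucket binning: equal-width edges, so most rows collapse into the lowest bucket
edges = np.linspace(col.min(), col.max(), 10)
print(pd.Series(np.digitize(col, edges, right=True)).value_counts().sort_index())

# quantile binning: equal-frequency deciles; duplicates='drop' merges the tied edges
print(pd.qcut(col, 10, labels=False, duplicates='drop').value_counts().sort_index())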
def discretize(data, vars_to_discretize, n_bins):
    '''
    Accepts data, a dictionary containing the discretization type for selected variables,
    and a dictionary containing the number of bins for selected variables.
    Returns data after the selected variables have been discretized,
    together with the binning definition for each variable.
    '''
    data_subset = ps.DataFrame(data).copy()
    bins = {}
    for i in vars_to_discretize:
        out = None
        binning = None
        # discretize by splitting into equal intervals
        if vars_to_discretize[i] == 'Equal':
            out, binning = ps.cut(data_subset.loc[:, i], bins=n_bins[i], labels=False, retbins=True)
        # discretize by frequency, lowering the bin count until qcut succeeds
        elif vars_to_discretize[i] == 'Freq':
            nb = n_bins[i]
            while True:
                try:
                    out, binning = ps.qcut(data_subset.loc[:, i], q=nb, labels=False, retbins=True)
                    break
                except ValueError:
                    nb -= 1
        # discretize based on provided bin margins
        elif vars_to_discretize[i] == 'Bins':
            out = np.digitize(data_subset.loc[:, i], n_bins[i], right=True) - 1
            binning = n_bins[i]
        data_subset.loc[:, i] = out
        # replace NA values with a special index (1 + max) -
        # if it has not already been done automatically, as in np.digitize
        data_subset.loc[data_subset[i].isnull(), i] = data_subset[i].max() + 1
        bins[i] = binning
    return data_subset, bins
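A usage sketch on made-up data (assuming ps is the snippet's alias for pandas):

raw = ps.DataFrame({'a': np.random.default_rng(3).normal(size=200),
                    'b': np.random.default_rng(4).exponential(size=200)})
binned, bin_defs = discretize(raw, {'a': 'Equal', 'b': 'Freq'}, {'a': 4, 'b': 4})
print(binned.head())
print(bin_defs)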
def create_figure(df, x, y, discrete, quantileable, continuous, size, color, controls):
    xs = df[x.value].values
    ys = df[y.value].values
    # x_title = x.value.title()
    # y_title = y.value.title()
    x_title = "Marginal Effective Tax Rate"
    y_title = "Asset Category"
    source = ColumnDataSource(ColumnDataSource.from_df(df))
    kw = dict()
    if x.value in discrete:
        kw['x_range'] = sorted(set(xs))
    if y.value in discrete:
        kw['y_range'] = sorted(set(ys))
    # kw['title'] = "%s vs %s" % (x_title, y_title)
    # kw['title'] = "Marginal Effective Tax Rates on Typically Financed Corporate Investments, 2016 Law"
    # kw['title'] = "Marginal Effective Tax Rates on Corporate Investments, 2016 Law"
    kw['title'] = "METRs on Corporate Investments, 2016 Law"
    p = figure(plot_height=400, plot_width=600, tools='pan,box_zoom,reset,hover', **kw)
    p.xaxis.axis_label = x_title
    p.yaxis.axis_label = y_title
    hover = p.select(dict(type=HoverTool))
    hover.tooltips = [('Asset', '@Asset')]
    if x.value in discrete:
        p.xaxis.major_label_orientation = np.pi / 4  # pd.np was removed in pandas 1.0
    sz = 9
    if size.value != 'None':
        # bin marker sizes by quantile of the chosen column
        groups = pd.qcut(df[size.value].values, len(SIZES))
        sz = [SIZES[xx] for xx in groups.codes]
    c = "#73000A"
    if color.value != 'None':
        # bin marker colors by quantile of the chosen column
        groups = pd.qcut(df[color.value].values, len(COLORS))
        c = [COLORS[xx] for xx in groups.codes]
    p.circle(x=xs, y=ys, source=source, color=c, size=sz, line_color="white",
             alpha=0.6, hover_color='white', hover_alpha=0.5)
    # p.title.text_color = "black"
    # p.title.text_font = "Georgia"
    return p
From regression_modeling.py (project: -Python-Analysis_of_wine_quality, author: ekolik)
def log_regression(wine_set):
    # # examining the data before recoding
    # print(wine_set["sulphates"].describe())
    # wine_set["sulphates_c"] = pd.qcut(wine_set["sulphates"], 4)
    # print(wine_set.groupby("sulphates_c").size())
    # print()
    #
    # print(wine_set["alcohol"].describe())
    # wine_set["alcohol_c"] = pd.qcut(wine_set["alcohol"], 4)
    # print(wine_set.groupby("alcohol_c").size())
    # print()
    #
    # print(wine_set["quality"].describe())
    # wine_set["quality_c"] = pd.qcut(wine_set["quality"], 3)
    # print(wine_set.groupby("quality_c").size())
    # print()

    # recode quality into 2 groups: 0:{3,4,5,6}, 1:{7,8,9}
    recode = {3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1}
    wine_set['quality_c'] = wine_set['quality'].map(recode)

    # recode sulphates into 2 groups: 0: <= mean, 1: > mean
    def sulphates_to_cat(x):
        if x['sulphates'] <= wine_set['sulphates'].mean():
            return 0
        else:
            return 1
    wine_set['sulphates_c'] = wine_set.apply(lambda x: sulphates_to_cat(x), axis=1)

    # recode alcohol into 2 groups: 0: <= mean, 1: > mean
    def alcohol_to_cat(x):
        if x['alcohol'] <= wine_set['alcohol'].mean():
            return 0
        else:
            return 1
    wine_set['alcohol_c'] = wine_set.apply(lambda x: alcohol_to_cat(x), axis=1)
    # print(wine_set.head(10))

    # logistic regression for sulphates+alcohol -> quality
    print("Logistic regression model for the association between wine's quality and sulphates&alcohol")
    model1 = smf.logit(formula="quality_c ~ sulphates_c + alcohol_c", data=wine_set)
    results1 = model1.fit()
    print(results1.summary())

    # odds ratios with 95% confidence intervals
    print("\nConfidence intervals")
    conf = results1.conf_int()
    conf['Odds ratio'] = results1.params
    conf.columns = ['Lower conf.int.', 'Upper conf.int.', 'Odds ratio']
    print(numpy.exp(conf))