Python cut() usage examples from open-source projects
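
Each snippet below comes from the named open-source project and shows pandas.cut in context. As a primer, here is a minimal sketch of what pd.cut does (toy data, not taken from any project below):

import pandas as pd

ages = [3, 17, 25, 64, 80]
# Three right-closed bins; each value is mapped to the interval it falls in.
print(pd.cut(ages, bins=[0, 18, 65, 100]))
# -> [(0, 18], (0, 18], (18, 65], (18, 65], (65, 100]]
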

discretization.py (project: ScoreCardModel, author: data-science-tools)
def transform(self, x):
        """
        Parameters:

            x (Sequence): - ???????

        Returns:

            np.array: - ????????????numpy??

        """
        s = pd.cut(x, bins=self.bins)  # assign each value to an interval
        d = pd.get_dummies(s)          # one-hot encode the intervals
        z = d.T.to_dict()              # {row index: {interval: 0 or 1}}
        re = []
        for i, v in z.items():
            for j, u in v.items():
                if u == 1:             # keep the interval this row fell into
                    re.append(str(j))
        return np.array(re)
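
For comparison, a minimal standalone sketch of the same round trip (toy values, not part of ScoreCardModel): pd.cut already exposes the interval label of each element, so the detour through get_dummies can be collapsed.

import numpy as np
import pandas as pd

values = [1, 7, 5, 4, 6, 3]
bins = [0, 2, 4, 6, 8]

# str() of each interval reproduces the labels the loop above collects.
labels = pd.cut(values, bins=bins).astype(str)
print(np.array(labels))
# -> ['(0, 2]' '(6, 8]' '(4, 6]' '(2, 4]' '(4, 6]' '(2, 4]']
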
test_facets.py (project: plotnine, author: has2k1)
def test_facet_wrap_expression():
    p = g + facet_wrap('pd.cut(var1, (0, 2, 4), include_lowest=True)')
    assert p == 'facet_wrap_expression'
cooccurrences.py (project: visualizations, author: ContentMine)
def update(attrname, old, new):
    new_selected, new_x_factors, new_y_factors = get_subset(dictionary_selector.value, dictionary_selector.value)
    bins = np.linspace(new_selected.counts.min(), new_selected.counts.max(), 10)  # 10 edges give 9 bins, one per color in Blues9
    new_selected["color"] = pd.cut(new_selected.counts, bins, labels = list(reversed(palettes.Blues9)), include_lowest=True)
    new_selected["wikidataID"] = new_selected["x"].map(lambda x: wikidataIDs.get(x))

    fig.xaxis.axis_label = dictionary_selector.value
    fig.yaxis.axis_label = dictionary_selector.value
    fig.title.text = "Top %d fact co-occurrences selected" % top_n.value

    src = ColumnDataSource(dict(
        x=new_selected["x"].astype(object),
        y=new_selected["y"].astype(object),
        color=new_selected["color"].astype(object),
        wikidataID=new_selected["wikidataID"],
        counts=new_selected["counts"].astype(int),
        raw=new_selected["raw"].astype(int)))
    source.data.update(src.data)

    fig.x_range.update(factors=new_x_factors[:top_n.value])
    fig.y_range.update(factors=new_y_factors[:top_n.value])
lexical_analysis.py (project: SFBIStats, author: royludo)
def plot_tendencies(word_list, pos_dic, bin_size, output_dir, file_name):
    plt.figure()
    dataframe_list = list()
    for word in word_list:
        if word not in pos_dic:
            raise Exception('Word ' + word + ' not found')
        df = pd.DataFrame(pos_dic[word], columns=['pos'])
        df['bins'] = pd.cut(df['pos'], bins=range(0, 100 + bin_size, bin_size), labels=range(0, 100, bin_size))
        df = df.groupby(['bins'])['bins'].count()
        dataframe_list.append(df)

    df_final = pd.DataFrame(pd.concat(dataframe_list, axis=1)).fillna(0)
    df_final.columns = word_list
    ax = df_final.plot()
    ax.set_xlabel("Position (en % de la longueur de la description)")
    ax.set_ylabel("Nombre d'occurrences")
    plt.title('Position des mots dans les descriptions des offres', y=1.08)
    plt.savefig(os.path.join(output_dir, file_name), bbox_inches='tight')
choropleth.py (project: clchoropleth, author: slarrain)
def discretize(data, bins=5, quantile=False):
    '''
    Discretizes the data into `bins` bins.
    Uses pd.cut (equal-width bins) by default and
    pd.qcut (equal-frequency bins) when quantile is True.
    '''
    if quantile:
        new_data = pd.qcut(data, bins, labels=list(range(bins)))
    else:
        new_data = pd.cut(data, bins, labels=list(range(bins)))
    return new_data
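
A hedged usage sketch (sample data assumed) showing how the two modes differ on skewed data:

import pandas as pd

data = pd.Series([1, 1, 2, 2, 3, 10, 50, 100])

# Equal-width bins: the outliers stretch the range, so most values
# land in the lowest bin.
print(discretize(data).value_counts().sort_index())

# Equal-frequency bins: roughly the same number of values per bin.
print(discretize(data, quantile=True).value_counts().sort_index())
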
bubble_plot.py (project: bubble_plot, author: shirmeir)
def plot_with_z(df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values, maximal_bubble_size=4000, normalization_by_all=False):
    count_table = pd.concat([pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x],
                         pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y], df[z_boolean]], axis=1)
    count_table = count_table.groupby([x,z_boolean])[y].value_counts().unstack().fillna(0)
    count_table = count_table.unstack()
    count_table_long = pd.melt(count_table.reset_index(), id_vars=x)
    z_boolean_values = count_table_long[z_boolean].unique()
    values = count_table_long.set_index([x, y, z_boolean]).unstack()['value']
    ratio = pd.DataFrame({'ratio': values[z_boolean_values[1]] / values.sum(axis=1)})
    count_table_long = count_table_long.set_index([x, y ])[['value']].merge(ratio, left_index=True, right_index=True).reset_index()
    size_factor = maximal_bubble_size/count_table_long['value'].max()
    x_values_dict = {x:i for i, x in enumerate(ordered_x_values)} \
        if not x_is_numeric else {xx:get_point(xx) for xx in ordered_x_values}
    y_values_dict = {x:i for i, x in enumerate(ordered_y_values)} \
        if not y_is_numeric else {xx: get_point(xx) for xx in ordered_y_values}
    xticks = np.arange(len(ordered_x_values)) if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticks = np.arange(len(ordered_y_values)) if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    xticklabels = ordered_x_values if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticklabels = ordered_y_values if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    count_table_long[x] = count_table_long[x].map(x_values_dict)
    count_table_long[y] = count_table_long[y].map(y_values_dict)
    plt.scatter(count_table_long[x], count_table_long[y], s=size_factor*count_table_long['value'],
                c=count_table_long['ratio'],  alpha=0.5,
                cmap='cool')
    return count_table_long, xticks, yticks, xticklabels, yticklabels
article_learner.py (project: fake_news, author: bmassman)
def test_probabilities(model: ClassifierMixin, X: np.array, y: pd.Series,
                       bins: int = 10, threshold: float = 0.5):
    """Print confusion matrix based on class probability."""
    probs = [p[1] for p in model.predict_proba(X)]
    print('\tProbabilities')
    df = pd.DataFrame({'prob': probs, 'label': y})
    step = 1 / bins
    cut_labels = [round(step * f, 1) for f in range(bins)]  # one label per bin
    by_prob = (df.groupby(pd.cut(df['prob'], bins, labels=cut_labels))
                 .agg(['sum', 'count'])['label'])
    print('\t\tprobs\t1\t0\tacc')
    for index, row in by_prob.iloc[::-1].iterrows():
        ones = row['sum']
        if math.isnan(ones):
            ones = 0
        else:
            ones = int(ones)
        count = row['count']
        zeros = int(count) - ones
        if count > 0:
            acc = zeros / count if index < threshold else ones / count
        else:
            acc = 0.0
        print(f'\t\t{index}\t{ones}\t{zeros}\t{acc:.3f}')
discretizer.py (project: dsbox-cleaning, author: usc-isi-i2)
def _discretize_by_width(col, num_bins, labels):
    maxvalue = col.max()
    minvalue = col.min()
    width = float((maxvalue-minvalue))/num_bins
    bins = [minvalue + x*width for x in range(num_bins)]+[maxvalue]
    if labels:
        if len(labels)!=num_bins:
            raise ValueError('Length of assigned labels not consistent with num_bins!')
        else:
            group_names = labels
    else:
        group_names = range(num_bins)
    return pd.cut(col, bins,labels=group_names, include_lowest=True)
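
A quick hedged check of the width-based helper (sample column and labels assumed):

import pandas as pd

col = pd.Series([0, 1, 2, 5, 9, 10])
# Two equal-width bins over [0, 10]: [0, 5] and (5, 10];
# include_lowest=True keeps the 0 in the first bin.
print(_discretize_by_width(col, num_bins=2, labels=['low', 'high']))
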
discretizer.py (project: dsbox-cleaning, author: usc-isi-i2)
def _discretize_by_frequency(col, num_bins, labels):
    percent = 1.0/num_bins
    bins = sorted(list(set(col.quantile([x*percent for x in range(num_bins+1)]))))
    if len(bins)-1 < num_bins:
        num_bins = len(bins)-1
        print('...Only %d bins (unbalanced) generated due to overlapping percentile boundaries.'%num_bins)
    if labels:
        if len(labels)!=num_bins:
            raise ValueError('Length of assigned labels not consistent with num_bins!')
        else:
            group_names = labels
    else:
        group_names = range(num_bins)
    return pd.cut(col, bins,labels=group_names, include_lowest=True)
stat_summary_bin.py (project: plotnine, author: has2k1)
def compute_group(cls, data, scales, **params):
        bins = params['bins']
        breaks = params['breaks']
        binwidth = params['binwidth']
        boundary = params['boundary']

        func = make_summary_fun(params['fun_data'], params['fun_y'],
                                params['fun_ymin'], params['fun_ymax'],
                                params['fun_args'])

        breaks = fuzzybreaks(scales.x, breaks, boundary, binwidth, bins)
        data['bin'] = pd.cut(data['x'], bins=breaks, labels=False,
                             include_lowest=True)

        def func_wrapper(data):
            """
            Add `bin` column to each summary result.
            """
            result = func(data)
            result['bin'] = data['bin'].iloc[0]
            return result

        # This is a plyr::ddply
        out = groupby_apply(data, 'bin', func_wrapper)
        centers = (breaks[:-1] + breaks[1:]) * 0.5
        bin_centers = centers[out['bin'].values]
        out['x'] = bin_centers
        out['bin'] += 1
        if isinstance(scales.x, scale_discrete):
            out['width'] = 0.9
        else:
            out['width'] = np.diff(breaks)[bins-1]

        return out
test_facets.py (project: plotnine, author: has2k1)
def test_facet_grid_expression():
    p = g + facet_grid(
        ['var2', 'pd.cut(var1, (0, 2, 4), include_lowest=True)'])
    assert p == 'facet_grid_expression'
lexical_analysis.py (project: SFBIStats, author: royludo)
def plot_tendency(word, pos_dic, bin_size, output_dir, file_name):
    plt.figure()
    if word not in pos_dic:
        raise Exception('Word ' + word + ' not found')

    df = pd.DataFrame(pos_dic[word], columns=['pos'])  # .groupby(['pos'])['pos'].count()
    df['bins'] = pd.cut(df['pos'], bins=range(0, 100 + bin_size, bin_size), labels=range(0, 100, bin_size))
    df = df.groupby(['bins'])['bins'].count()
    ax = df.plot(title="Position du mot '" + word + "' dans les descriptions des offres")
    ax.set_xlabel("Position (en % de la longueur de la description)")
    ax.set_ylabel("Nombre d'occurrences")
    plt.savefig(os.path.join(output_dir, file_name), bbox_inches='tight')
pysax.py (project: motif-classify, author: macks22)
def symbolize(self, xs):
        """
        Symbolize a PPA
        """
        alphabet_sz = len(self.alphabet)
        cutpoints = self.cutpoints[alphabet_sz]
        return pd.cut(xs, bins = cutpoints, labels = self.alphabet)
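
The cutpoints table is indexed by alphabet size. A hedged sketch of the idea, using the standard SAX breakpoints for a 4-letter alphabet (breakpoint values assumed from the usual lookup table):

import pandas as pd

# Breakpoints that split the standard normal into 4 equiprobable regions.
cutpoints = [-float('inf'), -0.67, 0.0, 0.67, float('inf')]
alphabet = ['a', 'b', 'c', 'd']

paa = [-1.2, -0.3, 0.1, 1.5]  # a z-normalized PAA vector
print(list(pd.cut(paa, bins=cutpoints, labels=alphabet)))
# -> ['a', 'b', 'c', 'd']
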
data.py (project: polara, author: Evfro)
def is_not_uniform(idx, nbins=10, allowed_gap=0.75):
        idx_bins = pd.cut(idx, bins=nbins, labels=False)
        idx_bin_size = np.bincount(idx_bins)

        diff = idx_bin_size[:-1] - idx_bin_size[1:]
        monotonic = (diff < 0).all() or (diff > 0).all()
        huge_gap = (idx_bin_size.min()*1.0 / idx_bin_size.max()) < allowed_gap
        return monotonic or huge_gap
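
A hedged sketch of what the check flags, assuming the helper is callable as a plain function (toy indices assumed):

import numpy as np
import pandas as pd

uniform = np.arange(100)       # 10 equal-width bins of 10 values each
skewed = np.arange(100) ** 2   # mass concentrated in the low bins

print(is_not_uniform(uniform))  # False: all bins are the same size
print(is_not_uniform(skewed))   # True: the smallest bin is far smaller than the largest
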
quickmaps.py (project: berrl, author: murphy214)
def make_object_map(data,field,**kwargs):
    linear = False
    for key, value in kwargs.items():
        if key == 'linear':
            linear = value
    print(linear)
    if linear == False:
        colors,rangelist = make_distributed_range(data,field)
    else:
        colors = get_heatmap51()
        colors2 = colors 
        maxvalue = data[field].max()
        if maxvalue < 51:
            totallist = range(maxvalue)
            colors = reduce_color_list_size(totallist,colors)
            colors,rangelist = make_gradient_range(data[field].min(),maxvalue,colors)
        else:
            colors = reduce_color_list_size(range(len(data)),colors)
            colors,rangelist = make_gradient_range(data[field].min(),maxvalue,colors)
            if not rangelist[0] == 0:
                rangelist = [0] + rangelist[1:]
            data['COLORKEY'] = pd.cut(data[field],bins=rangelist+[1000000000],labels=colors)

            return data
    colors2 = get_heatmap51()
    if not rangelist[0] == 0:
        rangelist = [0] + rangelist[1:]
    data['COLORKEY'] = pd.cut(data[field],bins=rangelist,labels=colors[1:])

    return data

# for a given dataframe and field returns a non used grouped object to multiple operations on
frisk.py (project: ReducedVarianceReparamGradients, author: andymiller)
def process_dataset():
    data_dir = os.path.dirname(__file__)
    df = pd.read_csv(os.path.join(data_dir, 'data/frisk/frisk_with_noise.dat'), skiprows=6, delim_whitespace=True)

    # compute proportion black in precinct, black = 1
    # first aggregate by precinct/ethnicity, and sum over populations
    popdf = df[['pop', 'precinct', 'eth']]. \
                groupby(['precinct', 'eth'])['pop'].apply(sum)
    percent_black = np.array([popdf[i][1] / float(popdf[i].sum())
                              for i in range(1, 76)])
    precinct_type = pd.cut(percent_black, [0, .1, .4, 1.])
    df['precinct_type'] = precinct_type.codes[df.precinct.values-1]
    return df
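
For reference, a minimal sketch of the .codes mapping used in the last two lines (toy proportions assumed):

import numpy as np
import pandas as pd

percent_black = np.array([0.05, 0.2, 0.7])
precinct_type = pd.cut(percent_black, [0, .1, .4, 1.])
print(precinct_type.codes)  # -> [0 1 2], the integer bin index of each precinct
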
networkclustering.py (project: PyPSA, author: PyPSA)
def busmap_by_rectangular_grid(buses, divisions=10):
    busmap = pd.Series(0, index=buses.index)
    if isinstance(divisions, tuple):
        divisions_x, divisions_y = divisions
    else:
        divisions_x = divisions_y = divisions
    gb = buses.groupby([pd.cut(buses.x, divisions_x), pd.cut(buses.y, divisions_y)])
    for nk, oks in enumerate(itervalues(gb.groups)):
        busmap.loc[oks] = nk
    return busmap
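
A hedged sketch of the underlying grid grouping (toy coordinates assumed):

import pandas as pd

buses = pd.DataFrame({'x': [0.1, 0.9, 0.2, 0.8],
                      'y': [0.1, 0.9, 0.9, 0.1]})

# Two pd.cut calls define a 2x2 grid; each (x-bin, y-bin) cell
# becomes one cluster in the busmap.
print(buses.groupby([pd.cut(buses.x, 2), pd.cut(buses.y, 2)]).size())
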
test_groupby.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_groupby_categorical_unequal_len(self):
        # GH3011
        series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
        # The raises only happens with categorical, not with series of types
        # category
        bins = pd.cut(series.dropna().values, 4)

        # len(bins) != len(series) here
        self.assertRaises(ValueError, lambda: series.groupby(bins).mean())
test_categorical.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def setUp(self):
        self.factor = Categorical.from_array(['a', 'b', 'b', 'a', 'a', 'c',
                                              'c', 'c'])

        df = DataFrame({'value': np.random.randint(0, 10000, 100)})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]

        df = df.sort_values(by=['value'], ascending=True)
        df['value_group'] = pd.cut(df.value, range(0, 10500, 500), right=False,
                                   labels=labels)
        self.cat = df
test_categorical.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_series_functions_no_warnings(self):
        df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})
        labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
        with tm.assert_produces_warning(False):
            df['group'] = pd.cut(df.value, range(0, 105, 10), right=False,
                                 labels=labels)
test_categorical.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_assignment_to_dataframe(self):
        # assignment
        df = DataFrame({'value': np.array(
            np.random.randint(0, 10000, 100), dtype='int32')})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]

        df = df.sort_values(by=['value'], ascending=True)
        s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels)
        d = s.values
        df['D'] = d
        str(df)

        result = df.dtypes
        expected = Series(
            [np.dtype('int32'), com.CategoricalDtype()], index=['value', 'D'])
        tm.assert_series_equal(result, expected)

        df['E'] = s
        str(df)

        result = df.dtypes
        expected = Series([np.dtype('int32'), com.CategoricalDtype(),
                           com.CategoricalDtype()],
                          index=['value', 'D', 'E'])
        tm.assert_series_equal(result, expected)

        result1 = df['D']
        result2 = df['E']
        self.assertTrue(result1._data._block.values.equals(d))

        # sorting
        s.name = 'E'
        self.assertTrue(result2.sort_index().equals(s.sort_index()))

        cat = pd.Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])
        df = pd.DataFrame(pd.Series(cat))
bubble_plot.py (project: bubble_plot, author: shirmeir)
def bubble_plot(df, x, y, z_boolean=None, ordered_x_values=None, ordered_y_values=None, bins_x=10,
                bins_y=10, fontsize=16, figsize=(10,5), maximal_bubble_size=4000,
                normalization_by_all = False, log=False):
    """
    :param df: dataframe
    :param x:  name of first numerical/categorical field (string) (for x-axis)
    :param y: name of second numerical/categorical field (string) (for y-axis)
    :param z_boolean: name of categorical field with two categories / boolean field (for coloring)
    :param ordered_x_values: for a categorical x, the x values to plot, in display order
    :param ordered_y_values: for a categorical y, the y values to plot, in display order
    :param bins_x: the bins for the x values if x is numeric
    :param bins_y: the bins for the y values if y is numeric
    :param normalization_by_all: True shows the joint distribution p(x,y); False shows the conditional distribution p(y|x)
    :param maximal_bubble_size: tune this parameter if the bubbles are too big or too small
    :param log: whether to apply log to the counts (influences the bubble sizes)
    :return: a bubble plot whose bubble sizes are proportional to bucket frequencies
    """
    plt.figure(figsize=figsize)
    x_is_numeric = df[x].dtype in (float, int) and ordered_x_values is None
    y_is_numeric = df[y].dtype in (float, int) and ordered_y_values is None 
    count_table = pd.concat([pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x],
                             pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y]], axis=1)
    count_table = count_table.groupby(x)[y].value_counts().unstack().fillna(0)
    ordered_x_values = count_table.index.values if ordered_x_values is None else ordered_x_values
    ordered_y_values = count_table.columns if ordered_y_values is None else ordered_y_values
    if z_boolean is not None:
        count_table_long, xticks, yticks, xticklabels, yticklabels = plot_with_z(df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values, maximal_bubble_size, 
                                                                                 normalization_by_all=normalization_by_all)
    else:
        count_table_long, xticks, yticks, xticklabels, yticklabels = plot_without_z(df, x, y, z_boolean, count_table, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values, 
                                                                                    normalization_by_all=normalization_by_all, log=log, maximal_bubble_size=maximal_bubble_size )
    plt.xticks(xticks, xticklabels,fontsize=fontsize)
    plt.yticks(yticks, yticklabels,fontsize=fontsize)
    plt.xlabel(x, fontsize=fontsize)
    plt.ylabel(y, fontsize=fontsize)
    if z_boolean is None:
        plt.title("{} vs {} ".format(y,x),fontsize=fontsize+4);
    else:
        plt.title("{} vs {} and {} (in colors)".format(y,x, z_boolean),fontsize=fontsize+4);
transform_data.py (project: Mmodel, author: gxrtbf)
def transform_with_woe(model_data):

    cut_point = model_config.logistic_cut
    for key in cut_point.keys():
        cutss = cut_point[key]['cut_point']
        wwoe = cut_point[key]['woe']
        model_data[key] = pd.cut(model_data[key],bins=cutss,labels=range(len(cutss) - 1)).map(lambda x:wwoe[x])

    return model_data
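
A minimal sketch of the cut-then-map pattern above (bin edges and WOE values assumed, not taken from model_config):

import pandas as pd

age = pd.Series([22, 37, 55, 41, 29])
cut_point = [18, 30, 45, 65]   # bin edges (assumed)
woe = [-0.35, 0.10, 0.42]      # one weight-of-evidence value per bin (assumed)

binned = pd.cut(age, bins=cut_point, labels=range(len(cut_point) - 1))
print(binned.map(lambda x: woe[x]))  # each age is replaced by its bin's WOE
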
make_dataset.py (project: snape, author: mbernico)
def create_categorical_features(df, label_list, random_state=None):
    """
    Creates random categorical variables

    :param df: data frame we're operation on
    :param label_list: A list of lists, each list is the labels for one categorical variable
    :param random_state: the numpy RandomState
    :return: A modified dataframe

    Example:

    create_categorical_features(df, [['a','b'], ['red','blue']])

    """
    random_state = get_random_state(random_state)

    df = df.copy()
    n_categorical = len(label_list)

    # get numeric columns ONCE so we don't have to do it every time we loop:
    numer_cols = [col for col in df.select_dtypes(include=['number']).columns if col != 'y']

    for i in range(0, n_categorical):
        # we might be out of numerical columns!
        if not numer_cols:
            break

        # chose a random numeric column that isn't y
        chosen_col = random_state.choice(numer_cols)
        # pop the chosen_col out of the numer_cols
        numer_cols.pop(numer_cols.index(chosen_col))

        # use cut to convert that column to categorical
        df[chosen_col] = pd.cut(df[chosen_col], bins=len(label_list[i]), labels=label_list[i])

    return df
analysis.py (project: pygcam, author: JGCRI)
def binColumns(inputDF, bins=DEFAULT_BIN_COUNT):
    columns = inputDF.columns
    binned = pd.DataFrame(columns=columns)
    for col in columns:
        s = inputDF[col]
        binned[col] = pd.cut(s, bins, labels=False)

    return binned
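
A hedged usage sketch (toy frame assumed):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 9], 'b': [10.0, 20.0, 30.0, 90.0]})
# labels=False replaces each value with its integer bin index.
print(binColumns(df, bins=3))
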

# TBD: Finish refactoring this
data_process.py (project: fx, author: TaRyu)
def one2two(file_in=PATH_FILE_OUT, file_out=PATH_FILE_FINAL):
    data = pd.read_pickle(file_in)['close'].values  # ndarray of closes
    data = data.reshape(-1, 24)                     # rows of 24 consecutive closes
    data = np.array([data[i:i + 24] for i in range(data.shape[0] - 24 + 1)])
    data_s = {
        'open_price': np.array([data[i][0][0]
                                for i in range(data.shape[0] - 1)]),
        'close_price': np.array([data[i][int(NUM_PIX / 24) - 1][23]
                                 for i in range(data.shape[0] - 1)]),
        'max_price': np.array([data[i].max()
                               for i in range(data.shape[0] - 1)]),
        'min_price': np.array([data[i].min()
                               for i in range(data.shape[0] - 1)]),
        'mean_price': np.array([data[i].mean()
                                for i in range(data.shape[0] - 1)]),
        'median_price': np.array([np.median(data[i])
                                  for i in range(data.shape[0] - 1)]),
        'buy_or_sell': np.array(
            [int(data[i + 1][int(NUM_PIX / 24) - 1][23] > data[i + 1][0][0])
             for i in range(data.shape[0] - 1)]),
        'change': np.array(
            [(data[i + 1][int(NUM_PIX / 24) - 1][23] - data[i + 1][0][0]) /
             data[i + 1][int(NUM_PIX / 24) - 1][23] * 100
             for i in range(data.shape[0] - 1)])}
    data_s = pd.DataFrame(data_s)
    bins = [-100, -5, -4, -3, -2, -1.5, -1, -0.5,
            0, 0.5, 1, 1.5, 2, 3, 4, 5, 100]
    labels = [-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8]
    data_s['change_D_16'] = pd.cut(data_s['change'], bins, labels=labels)
    bins = [-100, -5, -2, 0, 2, 5, 100]
    labels = [-3, -2, -1, 1, 2, 3]
    data_s['change_D'] = pd.cut(data_s['change'], bins, labels=labels)
    data = data.reshape(len(data), NUM_PIX)
    np.save(file_out[0], data[:len(data) - 1])
    data_s.to_pickle(file_out[1])
app_data.py (project: fx, author: TaRyu)
def one2two(file_in=PATH_FILE_OUT, file_out=PATH_FILE_FINAL):
    data = pd.read_pickle(file_in)['close'].values  # ndarray of closes
    data = np.array([data[i:i + 576] for i in range(data.shape[0] - 576 + 1)])
    data = data.reshape(-1, 576)
    data_s = {
        'open_price': np.array([data[i][0]
                                for i in range(data.shape[0] - 576)]),
        'close_price': np.array([data[i][575]
                                 for i in range(data.shape[0] - 576)]),
        'max_price': np.array([data[i].max()
                               for i in range(data.shape[0] - 576)]),
        'min_price': np.array([data[i].min()
                               for i in range(data.shape[0] - 576)]),
        'mean_price': np.array([data[i].mean()
                                for i in range(data.shape[0] - 576)]),
        'median_price': np.array([np.median(data[i])
                                  for i in range(data.shape[0] - 576)]),
        'buy_or_sell': np.array(
            [int(data[i + 576][575] > data[i + 576][0])
             for i in range(data.shape[0] - 576)]),
        'change': np.array(
            [(data[i + 576][575] - data[i + 576][0]) /
             data[i + 576][575] * 100
             for i in range(data.shape[0] - 576)])}
    data_s = pd.DataFrame(data_s)
    bins = [-100, -5, -4, -3, -2, -1.5, -1, -0.5,
            0, 0.5, 1, 1.5, 2, 3, 4, 5, 100]
    bins = [0.01 * x for x in bins]
    labels = [-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8]
    data_s['change_D_16'] = pd.cut(data_s['change'], bins, labels=labels)
    bins = [-100, -5, -2, 0, 2, 5, 100]
    bins = [0.01 * x for x in bins]
    labels = [-3, -2, -1, 1, 2, 3]
    data_s['change_D'] = pd.cut(data_s['change'], bins, labels=labels)
    np.save(file_out[0], data[:len(data) - 576])
    data_s.to_pickle(file_out[1])
transformations.py 文件源码 项目:anonymisation 作者: SGMAP-AGD 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def period_by_hours(x, separation):
    ''' Aggregate x into periods of several hours.
        The computation would be simple if periods
        were not allowed to span midnight.
    '''
    print(separation)
    assert isinstance(separation, list)
    assert all([sep < 24 for sep in separation])
    separation.sort()

    if 0 in separation:
        separation.append(24)
        hour_categ = pd.cut(x.dt.hour, separation, right=False)
        date_categ = x.dt.date
        return date_categ.astype(str) + ' ' + hour_categ.astype(str)
    else:
        hour = x.dt.hour
        hour_categ = pd.cut(hour, separation, right=False).astype(str)
        night_categ = '[' + str(separation[-1]) + ', ' + str(separation[0]) + ')'
        hour_categ[(hour < separation[0]) | (hour >= separation[-1])] = night_categ
        assert hour_categ.nunique(dropna=False) == len(separation)
        date_categ = x.dt.date.astype(str)
        # shift the early-morning hours back one day: they belong to the
        # overnight period that started the previous evening
        decale = x.dt.date[x.dt.hour < separation[0]] + pd.DateOffset(days=-1)
        date_categ[x.dt.hour < separation[0]] = decale.astype(str)
        assert all(date_categ.str.len() == 10)
        return date_categ + ' ' + hour_categ
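
A hedged usage sketch (timestamps assumed); putting 0 in separation keeps every period inside a single calendar day:

import pandas as pd

times = pd.Series(pd.to_datetime(
    ['2017-01-01 23:30', '2017-01-02 02:00', '2017-01-02 10:00']))

# [0, 6, 22] splits each day into [0, 6), [6, 22) and [22, 24).
print(period_by_hours(times, [0, 6, 22]))
# -> 2017-01-01 [22, 24), 2017-01-02 [0, 6), 2017-01-02 [6, 22)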

