Python cut(): example source code
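
The snippets below are collected from open-source projects and show pandas' cut() in several settings: calibration plots, feature binning for wide-and-deep models, a generic discretization helper, groupby tests, and pivot tables. As a minimal sketch of the core API first (hypothetical data; standard pandas calls only):

import pandas as pd

ages = pd.Series([3, 17, 25, 64, 80])  # made-up values for illustration
# each value is assigned to a half-open interval (a, b]
print(pd.cut(ages, bins=[0, 18, 65, 100]))
# labels=False returns integer bin codes instead of Interval objects
codes = pd.cut(ages, bins=[0, 18, 65, 100], labels=False)
# retbins=True additionally returns the bin edges that were used
codes, edges = pd.cut(ages, bins=3, labels=False, retbins=True)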

plots.py (project: guacml, author: guacml)
def predictions_vs_actual_classification(model_results, model_name, n_bins, figsize=(7, 3)):
    holdout = model_results.holdout_data
    target = model_results.target
    bins = np.arange(0, 1.001, 1 / n_bins)
    bin_mids = (bins[:-1] + bins[1:]) / 2
    binned = pd.cut(holdout['prediction'], bins=bins)
    bin_counts = holdout.groupby(binned)[target].count()
    bin_means = holdout.groupby(binned)[target].mean()

    fig = plt.figure(figsize=figsize)
    plt.suptitle('{0}: Predictions vs Actual'.format(model_name), fontsize=14)
    ax1 = plt.gca()
    ax1.grid(False)
    ax1.bar(bin_mids, bin_counts, width=1/n_bins, color=sns.light_palette('green')[1],
            label='row count', edgecolor='black')
    ax1.set_xlabel('predicted probability')
    ax1.set_ylabel('row count')

    ax2 = ax1.twinx()
    ax2.plot(bin_mids, bin_means, linewidth=3,
             marker='.', markersize=16, label='actual rate')
    ax2.plot(bins, bins, color=sns.color_palette()[2], label='main diagonal')

    ax2.set_ylabel('actual rate')

    handles, labels = ax1.get_legend_handles_labels()
    handles2, labels2 = ax2.get_legend_handles_labels()
    legend = plt.legend(handles + handles2, labels + labels2,
                        loc='best',
                        frameon=True,
                        framealpha=0.7)
    frame = legend.get_frame()
    frame.set_facecolor('white')
    return fig
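
The function above is essentially a calibration plot: predictions are cut into equal-width probability bins, then the mean actual outcome per bin is compared with the bin midpoint. A minimal sketch of that binning step on made-up predictions (the data here is illustrative, not from guacml):

import numpy as np
import pandas as pd

n_bins = 10
df = pd.DataFrame({'prediction': np.random.rand(1000),   # synthetic scores
                   'target': np.random.randint(0, 2, 1000)})
bins = np.arange(0, 1.001, 1 / n_bins)
binned = pd.cut(df['prediction'], bins=bins)
# count and actual rate per predicted-probability bin
print(df.groupby(binned)['target'].agg(['count', 'mean']))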
recommender_wide_and_deep.py (project: tflearn, author: tflearn)
def prepare_input_data(self, input_data, name="", category_map=None):
        '''
        Prepare input data dicts
        '''
        print ("-"*40 + " Preparing %s" % name)
        X = input_data[self.continuous_columns].values.astype(np.float32)
        Y = input_data[self.label_column].values.astype(np.float32)
        Y = Y.reshape([-1, 1])
        if self.verbose:
            print ("  Y shape=%s, X shape=%s" % (Y.shape, X.shape))

        X_dict = {"wide_X": X}

        if 'deep' in self.model_type:
            # map categorical value strings to integers
            td = input_data
            if category_map is None:
                category_map = {}
                for cc in self.categorical_columns:
                    if cc not in td.columns:
                        continue
                    cc_values = sorted(td[cc].unique())
                    cc_max = 1+len(cc_values)
                    cc_map = dict(zip(cc_values, range(1, cc_max))) # start from 1 to avoid 0:0 mapping (save 0 for missing)
                    if self.verbose:
                        print ("  category %s max=%s,  map=%s" % (cc, cc_max, cc_map))
                    category_map[cc] = cc_map

            td = td.replace(category_map)

            # bin ages (cuts off extreme values)
            age_bins = [ 0, 12, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 80, 65535 ]
            td['age_binned'] = pd.cut(td['age'], age_bins, labels=False)
            td = td.replace({'age_binned': {np.nan: 0}})
            print ("  %d age bins: age bins = %s" % (len(age_bins), age_bins))

            X_dict.update({ ("%s_in" % cc): td[cc].values.astype(np.int32).reshape([-1, 1]) for cc in self.categorical_columns})

        Y_dict = {"Y": Y}
        if self.verbose:
            print ("-"*40)
        return X_dict, Y_dict, category_map
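
Note how the code above relies on cut() returning NaN for values outside the bin edges: with labels=False, out-of-range (and missing) ages come back as NaN, which are then remapped to the reserved index 0. A minimal sketch of that edge behaviour (made-up ages):

import numpy as np
import pandas as pd

age_bins = [0, 12, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 80, 65535]
ages = pd.Series([10, 30, np.nan, -5])         # illustrative values
binned = pd.cut(ages, age_bins, labels=False)  # float codes; NaN where out of range
print(binned.fillna(0).astype(int).tolist())   # [0, 3, 0, 0]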
kdd98_initial_snapshot.py (project: CustomerSim, author: sisl)
def discretize(data, vars_to_discretize, n_bins):

    '''
    Accepts data, a dictionary containing the discretization type for selected variables, and
    a dictionary containing the number of bins for selected variables.

    Returns data after selected variables have been discretized, 
    together with binning definition for each variable.
    '''

    data_subset = ps.DataFrame(data).copy()
    bins = {}
    for i in vars_to_discretize:

        out = None
        binning = None

        # discretize by splitting into equal intervals
        if vars_to_discretize[i] == 'Equal': 
            out, binning = ps.cut(data_subset.loc[:, i], bins=n_bins[i], labels=False, retbins=True)

        # discretize by frequency
        elif vars_to_discretize[i] == 'Freq':
            nb = n_bins[i]
            while True:
                try:
                    out, binning = ps.qcut(data_subset.loc[:, i], q=nb, labels=False, retbins=True)
                    break
                except ValueError:
                    # qcut fails when quantile edges are not unique; retry with fewer bins
                    nb -= 1

        # discretize based on provided bin margins
        elif vars_to_discretize[i] == 'Bins':
            out = np.digitize(data_subset.loc[:, i], n_bins[i], right=True) - 1
            binning = n_bins[i]

        data_subset.loc[:, i] = out

        # replace NAs with a special index (1 + max),
        # in case this has not already been done automatically, as in np.digitize
        data_subset.loc[data_subset[i].isnull(), i] = data_subset[i].max() + 1
        bins[i] = binning

    return data_subset, bins
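
A usage sketch for the helper above, assuming ps is pandas (import pandas as ps) and using made-up data; the column names and bin counts are illustrative only:

import numpy as np
import pandas as ps  # the module alias used by the project

df = ps.DataFrame({'age': np.random.randint(0, 90, 100),      # synthetic columns
                   'income': np.random.lognormal(10, 1, 100)})
# 'age' into 5 equal-width bins, 'income' into 4 equal-frequency bins
data_out, bins = discretize(df,
                            vars_to_discretize={'age': 'Equal', 'income': 'Freq'},
                            n_bins={'age': 5, 'income': 4})
print(bins['age'])  # the bin edges chosen for 'age'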
test_groupby.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_groupby_categorical_two_columns(self):

        # https://github.com/pydata/pandas/issues/8138
        d = {'cat':
             pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
                            ordered=True),
             'ints': [1, 1, 2, 2],
             'val': [10, 20, 30, 40]}
        test = pd.DataFrame(d)

        # Grouping on a single column
        groups_single_key = test.groupby("cat")
        res = groups_single_key.agg('mean')
        exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]},
                        index=pd.CategoricalIndex(["a", "b", "c"], name="cat"))
        tm.assert_frame_equal(res, exp)

        # Grouping on two columns
        groups_double_key = test.groupby(["cat", "ints"])
        res = groups_double_key.agg('mean')
        exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan],
                         "cat": ["a", "a", "b", "b", "c", "c"],
                         "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints"
                                                                 ])
        tm.assert_frame_equal(res, exp)

        # GH 10132
        for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
            c, i = key
            result = groups_double_key.get_group(key)
            expected = test[(test.cat == c) & (test.ints == i)]
            assert_frame_equal(result, expected)

        d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
        test = pd.DataFrame(d)
        values = pd.cut(test['C1'], [1, 2, 3, 6])
        values.name = "cat"
        groups_double_key = test.groupby([values, 'C2'])

        res = groups_double_key.agg('mean')
        nan = np.nan
        idx = MultiIndex.from_product([["(1, 2]", "(2, 3]", "(3, 6]"],
                                       [1, 2, 3, 4]],
                                      names=["cat", "C2"])
        exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3,
                                nan, nan, nan, nan, 4, 5],
                         "C3": [nan, nan, nan, nan, 10, 100,
                                nan, nan, nan, nan, 200, 34]}, index=idx)
        tm.assert_frame_equal(res, exp)
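
The test above leans on the fact that grouping by the result of cut() treats every interval as a category, so combinations that never occur in the data surface as NaN rows. A minimal sketch of that behaviour (with pandas' default observed=False grouping):

import pandas as pd

s = pd.Series([3, 3, 4, 5])
cats = pd.cut(s, [1, 2, 3, 6])   # the interval (1, 2] is never observed
print(s.groupby(cats).mean())    # (1, 2] still appears, with value NaN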
tf_model.py (project: TensorFlowFlask, author: PythonWorkshop)
def train(self, training_df, learning_rate=0.001, batch_size=126, model_name="softmax_model"):
        column_list = training_df.columns.tolist()
        threshold = 5

        red_wine_cleaned = training_df.copy()
        red_wine_cleaned = _outliers(red_wine_cleaned, threshold, column_list[0:-1])

        # Bin the data
        bins = [3, 5, 6, 8]
        red_wine_cleaned['category'] = pd.cut(red_wine_cleaned.quality, bins, labels=['Bad', 'Average', 'Good'],
                                              include_lowest=True)

        # Only include 'Bad' and 'Good' categories
        red_wine_newcats = red_wine_cleaned[red_wine_cleaned['category'].isin(['Bad', 'Good'])].copy()

        bins = [3, 5, 8]
        red_wine_newcats['category'] = pd.cut(red_wine_newcats.quality,
                                              bins, labels=['Bad', 'Good'], include_lowest=True)

        y_red_wine = red_wine_newcats[['category']].values

        # Removing fixed_acidity and quality
        X_red_wine = red_wine_newcats.iloc[:, 1:-2].values

        y_red_wine_raveled = y_red_wine.ravel()
        y_red_wine_integers = [y.replace('Bad', '1') for y in y_red_wine_raveled]
        y_red_wine_integers = [y.replace('Good', '0') for y in y_red_wine_integers]
        y_red_wine_integers = [int(y) for y in y_red_wine_integers]

        y_one_hot = _dense_to_one_hot(y_red_wine_integers, num_classes=2)

        X_train, X_test, y_train, y_test = train_test_split(X_red_wine, y_one_hot, test_size=0.2, random_state=42)
        # model

        with tf.variable_scope("softmax_regression"):
            X = tf.placeholder("float", [None, 10])
            y, variables = softmax_regression(X)

        # train
        y_ = tf.placeholder("float", [None, 2])
        cost = -tf.reduce_mean(y_ * tf.log(y))
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

        init = tf.global_variables_initializer()  # replaces the deprecated initialize_all_variables()
        self.sess.run(init)
        for i in range(100):
            average_cost = 0
            number_of_batches = int(len(X_train) / batch_size)
            for start, end in zip(range(0, len(X_train), batch_size), range(batch_size, len(X_train), batch_size)):
                self.sess.run(optimizer, feed_dict={X: X_train[start:end], y_: y_train[start:end]})
                # Compute average loss
                average_cost += self.sess.run(cost, feed_dict={X: X_train[start:end],
                                              y_: y_train[start:end]}) / number_of_batches
            print(self.sess.run(accuracy, feed_dict={X: X_test, y_: y_test}))

        filename = "data/softmax_regression.ckpt"
        path = self.save_locally(filename)
        self.save_to_s3(path, model_name)
        print("Saved:", path)
plot_yields.py (project: poreduck, author: alexiswl)
def plot_yield_by_quality():
    # Close any previous plots
    plt.close('all')
    # Read in seqlength and time from ALL_READS dataframe
    new_yield_data = ALL_READS[['time', 'seq_length', 'av_qual']].copy()  # copy to avoid SettingWithCopyWarning
    # Bin qualities
    qual_bins = [0] + QUALITY_BINS + [new_yield_data["av_qual"].max()]
    # Cut yield data into quality bins
    new_yield_data["descriptive_quality"] = pd.cut(new_yield_data["av_qual"], qual_bins,
                                                   labels=[description
                                                           for description in reversed(QUALITY_DESCRIPTIONS)])
    # Time as index and drop av_qual column
    new_yield_data.set_index(pd.DatetimeIndex(new_yield_data['time']), inplace=True)
    new_yield_data.drop('av_qual', axis=1, inplace=True)
    # Obtain cumulative sum by quality bin in each minute.
    yield_data_grouped = new_yield_data.groupby("descriptive_quality").apply(lambda d: d.resample("1T").sum().fillna(0))['seq_length']
    # Create a dict of dataframes based on groups.
    yield_data_by_quality = {description: yield_data_grouped[description].to_frame().reset_index()
                             for description in
                             QUALITY_DESCRIPTIONS}

    for description, yield_df in yield_data_by_quality.items():
        yield_df.reset_index(inplace=True)
        yield_df.set_index("time", inplace=True)
        yield_df = yield_df.reindex(index=YIELD_DATA.time, fill_value=0)
        yield_df.reset_index(inplace=True)
        # Generate a cumulative sum of sequence data
        yield_df['cumsum_bp'] = yield_df['seq_length'].cumsum()
        # Convert time to timedelta format and then to float format, in hours.
        yield_df['duration_tdelta'] = yield_df['time'].apply(lambda t: t - yield_df['time'].min())
        yield_df['duration_float'] = yield_df['duration_tdelta'].apply(lambda t: t.total_seconds() / 3600)
        yield_data_by_quality[description] = yield_df

    # Set subplots.
    fig, ax = plt.subplots(1)
    # Create ticks using numpy linspace. Ideally will create 6 points between 0 and 48 hours.
    num_points = 7  # Need to include zero point
    x_ticks = np.linspace(YIELD_DATA['duration_float'].min(), YIELD_DATA['duration_float'].max(), num_points)
    ax.set_xticks(x_ticks)
    # Define axis formatters
    ax.yaxis.set_major_formatter(FuncFormatter(y_yield_to_human_readable))
    ax.xaxis.set_major_formatter(FuncFormatter(x_yield_to_human_readable))
    # Set x and y labels and title.
    ax.set_xlabel("Duration (HH:MM)")
    ax.set_ylabel("Yield")
    ax.set_title(f"Yield for {SAMPLE_NAME} over time by quality")
    ax.stackplot(YIELD_DATA['duration_float'],
                 [yield_data_by_quality[description]['cumsum_bp']
                  for description in QUALITY_DESCRIPTIONS],
                 colors=QUALITY_COLOURS)
    # Limits must be set after the plot is created
    ax.set_xlim(YIELD_DATA['duration_float'].min(), YIELD_DATA['duration_float'].max())
    ax.set_ylim(ymin=0)

    # Add legend to plot.
    ax.legend([mpatches.Patch(color=colour)
               for colour in QUALITY_COLOURS],
              QUALITY_DESCRIPTIONS, loc=2)
    # Ensure labels are not missed.
    fig.tight_layout()
    savefig(os.path.join(PLOTS_DIR, f"{SAMPLE_NAME.replace(' ', '_')}_yield_plot_by_quality.png"))
pd_pivot.py (project: python_utils, author: Jayhello)
def titanic_1():
    titanic = sns.load_dataset('titanic')
    print(titanic.head())
    #    survived  pclass     sex   age  ......
    # 0         0       3    male  22.0
    # 1         1       1  female  38.0
    # 2         1       3  female  26.0
    # 3         1       1  female  35.0
    # 4         0       3    male  35.0

    print(titanic.groupby('sex')[['survived']].mean())
    #         survived
    # sex
    # female  0.742038
    # male    0.188908

    print(titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack())
    # class      First    Second     Third
    # sex
    # female  0.968085  0.921053  0.500000
    # male    0.368852  0.157407  0.135447

    print(titanic.pivot_table('survived', index='sex', columns='class'))
    # class      First    Second     Third
    # sex
    # female  0.968085  0.921053  0.500000
    # male    0.368852  0.157407  0.135447

    age = pd.cut(titanic['age'], [0, 18, 80])
    print(titanic.pivot_table('survived', ['sex', age], 'class'))
    # class               First    Second     Third
    # sex    age
    # female (0, 18]   0.909091  1.000000  0.511628
    #        (18, 80]  0.972973  0.900000  0.423729
    # male   (0, 18]   0.800000  0.600000  0.215686
    #        (18, 80]  0.375000  0.071429  0.133663

    print(titanic.pivot_table(index='sex', columns='class',
                              aggfunc={'survived': sum, 'fare': 'mean'}))

    print(titanic.pivot_table('survived', index='sex', columns='class', margins=True))
    # class      First    Second     Third       All
    # sex
    # female  0.968085  0.921053  0.500000  0.742038
    # male    0.368852  0.157407  0.135447  0.188908
    # All     0.629630  0.472826  0.242363  0.383838
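
The cut() result above is passed straight to pivot_table as a row grouper; any Series of bins aligned with the frame's index works this way, including quantile bins from pd.qcut (used in the discretize helper earlier). A short companion sketch:

import pandas as pd
import seaborn as sns

titanic = sns.load_dataset('titanic')
age = pd.cut(titanic['age'], [0, 18, 80])
fare = pd.qcut(titanic['fare'], 2)   # two equal-frequency fare bins
print(titanic.pivot_table('survived', ['sex', age], [fare, 'class']))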
tflearn_wide_and_deep.py (project: tflearn_wide_and_deep, author: ichuang)
def prepare_input_data(self, input_data, name="", category_map=None):
        '''
        Prepare input data dicts
        '''
        print ("-"*40 + " Preparing %s" % name)
        X = input_data[self.continuous_columns].values.astype(np.float32)
        Y = input_data[self.label_column].values.astype(np.float32)
        Y = Y.reshape([-1, 1])
        if self.verbose:
            print ("  Y shape=%s, X shape=%s" % (Y.shape, X.shape))

        X_dict = {"wide_X": X}

        if 'deep' in self.model_type:
            # map categorical value strings to integers
            td = input_data
            if category_map is None:
                category_map = {}
                for cc in self.categorical_columns:
                    if cc not in td.columns:
                        continue
                    cc_values = sorted(td[cc].unique())
                    cc_max = 1+len(cc_values)
                    cc_map = dict(zip(cc_values, range(1, cc_max))) # start from 1 to avoid 0:0 mapping (save 0 for missing)
                    if self.verbose:
                        print ("  category %s max=%s,  map=%s" % (cc, cc_max, cc_map))
                    category_map[cc] = cc_map

            td = td.replace(category_map)

            # bin ages (cuts off extreme values)
            age_bins = [ 0, 12, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 80, 65535 ]
            td['age_binned'] = pd.cut(td['age'], age_bins, labels=False)
            td = td.replace({'age_binned': {np.nan: 0}})
            print ("  %d age bins: age bins = %s" % (len(age_bins), age_bins))

            X_dict.update({ ("%s_in" % cc): td[cc].values.astype(np.int32).reshape([-1, 1]) for cc in self.categorical_columns})

        Y_dict = {"Y": Y}
        if self.verbose:
            print ("-"*40)
        return X_dict, Y_dict, category_map

