import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


def predictions_vs_actual_classification(model_results, model_name, n_bins, figsize=(7, 3)):
    """Calibration-style plot: binned predicted probabilities vs the actual rate."""
    holdout = model_results.holdout_data
    target = model_results.target
    bins = np.linspace(0, 1, n_bins + 1)  # n_bins equal-width bins on [0, 1]
    bin_mids = (bins[:-1] + bins[1:]) / 2
    binned = pd.cut(holdout['prediction'], bins=bins)
    grouped = holdout.groupby(binned, observed=False)[target]  # keep empty bins
    bin_counts = grouped.count()
    bin_means = grouped.mean()
fig = plt.figure(figsize=figsize)
plt.suptitle('{0}: Predictions vs Actual'.format(model_name), fontsize=14)
ax1 = plt.gca()
ax1.grid(False)
ax1.bar(bin_mids, bin_counts, width=1/n_bins, color=sns.light_palette('green')[1],
label='row count', edgecolor='black')
ax1.set_xlabel('predicted probability')
ax1.set_ylabel('row count')
ax2 = ax1.twinx()
ax2.plot(bin_mids, bin_means, linewidth=3,
marker='.', markersize=16, label='actual rate')
ax2.plot(bins, bins, color=sns.color_palette()[2], label='main diagonal')
ax2.set_ylabel('actual rate')
handles, labels = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
legend = plt.legend(handles + handles2, labels + labels2,
loc='best',
frameon=True,
framealpha=0.7)
frame = legend.get_frame()
frame.set_facecolor('white')
return fig
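
A minimal usage sketch for the function above. model_results only needs a
holdout_data frame containing a 'prediction' column plus the target, and a
target attribute naming that column; the SimpleNamespace stand-in and the
'churned' column below are made up for illustration:

from types import SimpleNamespace

rng = np.random.default_rng(0)
probs = rng.uniform(0, 1, 500)
# simulate a roughly calibrated classifier: each outcome occurs with its
# predicted probability, so the actual-rate line should hug the diagonal
holdout = pd.DataFrame({'prediction': probs,
                        'churned': rng.uniform(0, 1, 500) < probs})
results = SimpleNamespace(holdout_data=holdout, target='churned')
fig = predictions_vs_actual_classification(results, 'demo model', n_bins=10)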
def prepare_input_data(self, input_data, name="", category_map=None):
'''
Prepare input data dicts
'''
print ("-"*40 + " Preparing %s" % name)
X = input_data[self.continuous_columns].values.astype(np.float32)
Y = input_data[self.label_column].values.astype(np.float32)
Y = Y.reshape([-1, 1])
if self.verbose:
print (" Y shape=%s, X shape=%s" % (Y.shape, X.shape))
X_dict = {"wide_X": X}
if 'deep' in self.model_type:
# map categorical value strings to integers
td = input_data
        if category_map is None:
            # build a fresh value -> integer map from this data; when a map is
            # passed in (e.g. for test data), reuse it instead
            category_map = {}
            for cc in self.categorical_columns:
                if cc not in td.columns:
                    continue
                cc_values = sorted(td[cc].unique())
                cc_max = 1 + len(cc_values)
                # start from 1 to avoid a 0:0 mapping (0 is reserved for missing)
                cc_map = dict(zip(cc_values, range(1, cc_max)))
                if self.verbose:
                    print("  category %s max=%s, map=%s" % (cc, cc_max, cc_map))
                category_map[cc] = cc_map
        td = td.replace(category_map)
        # bin ages (cuts off extreme values)
        age_bins = [0, 12, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 80, 65535]
        td['age_binned'] = pd.cut(td['age'], age_bins, labels=False)
        td = td.replace({'age_binned': {np.nan: 0}})
        print("  %d age bins: age bins = %s" % (len(age_bins), age_bins))
        # only emit inputs for categorical columns actually present in the data
        X_dict.update({("%s_in" % cc): td[cc].values.astype(np.int32).reshape([-1, 1])
                       for cc in self.categorical_columns if cc in td.columns})
Y_dict = {"Y": Y}
if self.verbose:
print ("-"*40)
return X_dict, Y_dict, category_map
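
As a standalone illustration of the age-binning step above: pd.cut with
labels=False returns integer bin codes, ages falling outside the edges come
back as NaN, and the follow-up replace maps those to bin 0. The ages below
are made up:

ages = pd.Series([5, 17, 33, 70, 200000])
age_bins = [0, 12, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 80, 65535]
codes = pd.cut(ages, age_bins, labels=False)
print(codes.tolist())   # [0.0, 1.0, 4.0, 11.0, nan] -- 200000 is out of range
codes = codes.replace({np.nan: 0}).astype(int)
print(codes.tolist())   # [0, 1, 4, 11, 0]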
def discretize(data, vars_to_discretize, n_bins):
    '''
    Accepts data, a dictionary specifying the discretization type for selected
    variables, and a dictionary giving the number of bins (or explicit bin
    margins) for those variables.
    Returns the data after the selected variables have been discretized,
    together with the binning definition for each variable.
    '''
    data_subset = pd.DataFrame(data).copy()
    bins = {}
    for i in vars_to_discretize:
        out = None
        binning = None
        # discretize by splitting into equal-width intervals
        if vars_to_discretize[i] == 'Equal':
            out, binning = pd.cut(data_subset.loc[:, i], bins=n_bins[i],
                                  labels=False, retbins=True)
        # discretize by frequency; if qcut fails because the quantile edges
        # are not unique, retry with one bin fewer
        elif vars_to_discretize[i] == 'Freq':
            nb = n_bins[i]
            while True:
                try:
                    out, binning = pd.qcut(data_subset.loc[:, i], q=nb,
                                           labels=False, retbins=True)
                    break
                except ValueError:
                    nb -= 1
        # discretize based on provided bin margins
        elif vars_to_discretize[i] == 'Bins':
            out = np.digitize(data_subset.loc[:, i], n_bins[i], right=True) - 1
            binning = n_bins[i]
        data_subset.loc[:, i] = out
        # replace NA values with a special index (1 + max), in case
        # np.digitize has not already done so automatically
        data_subset.loc[data_subset[i].isnull(), i] = data_subset[i].max() + 1
        bins[i] = binning
    return data_subset, bins
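
A usage sketch for discretize above; the frame, column names and bin settings
are invented. 'Equal' and 'Freq' take a bin count, while 'Bins' takes explicit
bin margins:

df = pd.DataFrame({'a': np.random.default_rng(1).normal(size=100),
                   'b': range(100),
                   'c': [0.5, 1.5, 2.5, 3.5] * 25})
coded, bins = discretize(df,
                         vars_to_discretize={'a': 'Equal', 'b': 'Freq', 'c': 'Bins'},
                         n_bins={'a': 4, 'b': 5, 'c': [0, 1, 2, 3, 4]})
print(coded.nunique())  # 'a': at most 4 codes, 'b': 5 codes, 'c': 4 codes
print(bins['a'])        # 5 bin edges defining the 4 equal-width bins of 'a'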
test_groupby.py, from the PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda project by SignalMedia:
def test_groupby_categorical_two_columns(self):
    # DataFrame, MultiIndex, tm and assert_frame_equal come from pandas and
    # pandas.util.testing in the surrounding test module
    # https://github.com/pydata/pandas/issues/8138
d = {'cat':
pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
ordered=True),
'ints': [1, 1, 2, 2],
'val': [10, 20, 30, 40]}
test = pd.DataFrame(d)
# Grouping on a single column
groups_single_key = test.groupby("cat")
res = groups_single_key.agg('mean')
exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]},
index=pd.CategoricalIndex(["a", "b", "c"], name="cat"))
tm.assert_frame_equal(res, exp)
# Grouping on two columns
groups_double_key = test.groupby(["cat", "ints"])
res = groups_double_key.agg('mean')
    exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan],
                     "cat": ["a", "a", "b", "b", "c", "c"],
                     "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints"])
tm.assert_frame_equal(res, exp)
# GH 10132
for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
c, i = key
result = groups_double_key.get_group(key)
expected = test[(test.cat == c) & (test.ints == i)]
assert_frame_equal(result, expected)
d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
test = pd.DataFrame(d)
values = pd.cut(test['C1'], [1, 2, 3, 6])
values.name = "cat"
groups_double_key = test.groupby([values, 'C2'])
res = groups_double_key.agg('mean')
nan = np.nan
idx = MultiIndex.from_product([["(1, 2]", "(2, 3]", "(3, 6]"],
[1, 2, 3, 4]],
names=["cat", "C2"])
exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3,
nan, nan, nan, nan, 4, 5],
"C3": [nan, nan, nan, nan, 10, 100,
nan, nan, nan, nan, 200, 34]}, index=idx)
tm.assert_frame_equal(res, exp)
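
The second half of the test exercises grouping on the categorical returned by
pd.cut, where empty intervals survive as all-NaN rows. A minimal standalone
sketch of that behaviour (observed=False keeps the empty bin; old pandas did
this by default):

df = pd.DataFrame({'C1': [3, 3, 4, 5], 'C3': [10, 100, 200, 34]})
cats = pd.cut(df['C1'], [1, 2, 3, 6])  # the (1, 2] interval ends up empty
print(df.groupby(cats, observed=False)['C3'].mean())
# (1, 2]      NaN
# (2, 3]     55.0
# (3, 6]    117.0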
def train(self, training_df, learning_rate=0.001, batch_size=126, model_name="softmax_model"):
    column_list = training_df.columns.tolist()
    threshold = 5
    red_wine_cleaned = training_df.copy()
    # _outliers, softmax_regression and _dense_to_one_hot are helpers assumed
    # to be defined elsewhere in this module
    red_wine_cleaned = _outliers(red_wine_cleaned, threshold, column_list[0:-1])
# Bin the data
bins = [3, 5, 6, 8]
red_wine_cleaned['category'] = pd.cut(red_wine_cleaned.quality, bins, labels=['Bad', 'Average', 'Good'],
include_lowest=True)
# Only include 'Bad' and 'Good' categories
red_wine_newcats = red_wine_cleaned[red_wine_cleaned['category'].isin(['Bad', 'Good'])].copy()
bins = [3, 5, 8]
red_wine_newcats['category'] = pd.cut(red_wine_newcats.quality,
bins, labels=['Bad', 'Good'], include_lowest=True)
    y_red_wine = red_wine_newcats[['category']].values  # .get_values() was removed from pandas
    # Removing fixed_acidity and quality
    X_red_wine = red_wine_newcats.iloc[:, 1:-2].values
    y_red_wine_raveled = y_red_wine.ravel()
    # encode labels as integers: 'Bad' -> 1, 'Good' -> 0
    y_red_wine_integers = [y.replace('Bad', '1') for y in y_red_wine_raveled]
    y_red_wine_integers = [y.replace('Good', '0') for y in y_red_wine_integers]
    y_red_wine_integers = [int(y) for y in y_red_wine_integers]  # np.int is deprecated
y_one_hot = _dense_to_one_hot(y_red_wine_integers, num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(X_red_wine, y_one_hot, test_size=0.2, random_state=42)
# model
with tf.variable_scope("softmax_regression"):
X = tf.placeholder("float", [None, 10])
y, variables = softmax_regression(X)
# train
y_ = tf.placeholder("float", [None, 2])
cost = -tf.reduce_mean(y_ * tf.log(y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    init = tf.global_variables_initializer()  # initialize_all_variables() is deprecated in TF1
self.sess.run(init)
for i in range(100):
average_cost = 0
number_of_batches = int(len(X_train) / batch_size)
for start, end in zip(range(0, len(X_train), batch_size), range(batch_size, len(X_train), batch_size)):
self.sess.run(optimizer, feed_dict={X: X_train[start:end], y_: y_train[start:end]})
# Compute average loss
average_cost += self.sess.run(cost, feed_dict={X: X_train[start:end],
y_: y_train[start:end]}) / number_of_batches
print(self.sess.run(accuracy, feed_dict={X: X_test, y_: y_test}))
filename = "data/softmax_regression.ckpt"
path = self.save_locally(filename)
self.save_to_s3(path, model_name)
print("Saved:", path)
def plot_yield_by_quality():
# Close any previous plots
plt.close('all')
    # Read time, sequence length and average quality from the ALL_READS dataframe;
    # take a copy so the column assignments below do not hit SettingWithCopyWarning
    new_yield_data = ALL_READS[['time', 'seq_length', 'av_qual']].copy()
# Bin qualities
qual_bins = [0] + QUALITY_BINS + [new_yield_data["av_qual"].max()]
# Cut yield data into quality bins
new_yield_data["descriptive_quality"] = pd.cut(new_yield_data["av_qual"], qual_bins,
labels=[description
for description in reversed(QUALITY_DESCRIPTIONS)])
# Time as index and drop av_qual column
new_yield_data.set_index(pd.DatetimeIndex(new_yield_data['time']), inplace=True)
new_yield_data.drop('av_qual', axis=1, inplace=True)
    # Obtain the cumulative sum by quality bin in each minute.
    yield_data_grouped = new_yield_data.groupby("descriptive_quality").apply(
        lambda d: d.resample("1T").sum().fillna(0))['seq_length']
# Create a dict of dataframes based on groups.
yield_data_by_quality = {description: yield_data_grouped[description].to_frame().reset_index()
for description in
QUALITY_DESCRIPTIONS}
for description, yield_df in yield_data_by_quality.items():
yield_df.reset_index(inplace=True)
yield_df.set_index("time", inplace=True)
yield_df = yield_df.reindex(index=YIELD_DATA.time, fill_value=0)
yield_df.reset_index(inplace=True)
# Generate a cumulative sum of sequence data
yield_df['cumsum_bp'] = yield_df['seq_length'].cumsum()
# Convert time to timedelta format and then to float format, in hours.
yield_df['duration_tdelta'] = yield_df['time'].apply(lambda t: t - yield_df['time'].min())
yield_df['duration_float'] = yield_df['duration_tdelta'].apply(lambda t: t.total_seconds() / 3600)
yield_data_by_quality[description] = yield_df
# Set subplots.
fig, ax = plt.subplots(1)
# Create ticks using numpy linspace. Ideally will create 6 points between 0 and 48 hours.
num_points = 7 # Need to include zero point
x_ticks = np.linspace(YIELD_DATA['duration_float'].min(), YIELD_DATA['duration_float'].max(), num_points)
ax.set_xticks(x_ticks)
# Define axis formatters
ax.yaxis.set_major_formatter(FuncFormatter(y_yield_to_human_readable))
ax.xaxis.set_major_formatter(FuncFormatter(x_yield_to_human_readable))
# Set x and y labels and title.
ax.set_xlabel("Duration (HH:MM)")
ax.set_ylabel("Yield")
ax.set_title(f"Yield for {SAMPLE_NAME} over time by quality")
ax.stackplot(YIELD_DATA['duration_float'],
[yield_data_by_quality[description]['cumsum_bp']
for description in QUALITY_DESCRIPTIONS],
colors=QUALITY_COLOURS)
# Limits must be set after the plot is created
ax.set_xlim(YIELD_DATA['duration_float'].min(), YIELD_DATA['duration_float'].max())
ax.set_ylim(ymin=0)
# Add legend to plot.
ax.legend([mpatches.Patch(color=colour)
for colour in QUALITY_COLOURS],
QUALITY_DESCRIPTIONS, loc=2)
# Ensure labels are not missed.
fig.tight_layout()
savefig(os.path.join(PLOTS_DIR, f"{SAMPLE_NAME.replace(' ', '_')}_yield_plot_by_quality.png"))
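
The descriptive-quality cut above pairs ascending bin edges with labels stored
best-first, hence the reversed(...). A minimal sketch with invented thresholds
and names:

av_qual = pd.Series([4.2, 8.9, 11.5, 13.0])
quality_bins = [7, 10, 12]                                    # hypothetical thresholds
quality_descriptions = ['High', 'Medium', 'Low', 'Very low']  # best first
edges = [0] + quality_bins + [av_qual.max()]
labels = list(reversed(quality_descriptions))                 # worst first, matching the edges
print(pd.cut(av_qual, edges, labels=labels).tolist())
# ['Very low', 'Low', 'Medium', 'High']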
def titanic_1():
titanic = sns.load_dataset('titanic')
    print(titanic.head())
    #    survived  pclass     sex   age ...
    # 0         0       3    male  22.0
    # 1         1       1  female  38.0
    # 2         1       3  female  26.0
    # 3         1       1  female  35.0
    # 4         0       3    male  35.0
    print(titanic.groupby('sex')[['survived']].mean())
# survived
# sex
# female 0.742038
# male 0.188908
    print(titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack())
# class First Second Third
# sex
# female 0.968085 0.921053 0.500000
# male 0.368852 0.157407 0.135447
    print(titanic.pivot_table('survived', index='sex', columns='class'))
# class First Second Third
# sex
# female 0.968085 0.921053 0.500000
# male 0.368852 0.157407 0.135447
age = pd.cut(titanic['age'], [0, 18, 80])
    print(titanic.pivot_table('survived', ['sex', age], 'class'))
# class First Second Third
# sex age
# female (0, 18] 0.909091 1.000000 0.511628
# (18, 80] 0.972973 0.900000 0.423729
# male (0, 18] 0.800000 0.600000 0.215686
# (18, 80] 0.375000 0.071429 0.133663
    print(titanic.pivot_table(index='sex', columns='class',
                              aggfunc={'survived': sum, 'fare': 'mean'}))
    print(titanic.pivot_table('survived', index='sex', columns='class', margins=True))
# class First Second Third All
# sex
# female 0.968085 0.921053 0.500000 0.742038
# male 0.368852 0.157407 0.135447 0.188908
# All 0.629630 0.472826 0.242363 0.383838