def _axis_positions(ordered_values, is_numeric):
    """Return ``(mapping, ticks, ticklabels)`` describing one plot axis.

    For a non-numeric axis each value is placed at its index in
    ``ordered_values``; the ticks are ``np.arange(len(ordered_values))`` and
    the labels are ``ordered_values`` itself.  For a numeric axis each value is
    placed at ``get_point(value)`` and both ticks and labels are the list of
    those points.
    """
    if is_numeric:
        # NOTE(review): get_point is defined elsewhere in the project;
        # presumably it turns a bin into a single coordinate -- confirm.
        # It is evaluated once per value here, which assumes it is
        # deterministic.
        points = [get_point(value) for value in ordered_values]
        # Ticks and labels are returned as two distinct lists so a caller
        # mutating one does not affect the other.
        return dict(zip(ordered_values, points)), points, list(points)
    mapping = {value: position for position, value in enumerate(ordered_values)}
    return mapping, np.arange(len(ordered_values)), ordered_values


def plot_with_z(df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values, maximal_bubble_size=4000, normalization_by_all=False):
    """Draw a bubble scatter of ``x`` against ``y`` coloured by a third column.

    Rows of ``df`` are counted per ``(x, y, z_boolean)`` combination (numeric
    axes are first binned with ``pd.cut``).  Each combination is drawn with
    ``plt.scatter``: bubble area is proportional to its count and colour is the
    share of the second distinct ``z_boolean`` value within the ``(x, y)``
    cell.

    Args:
        df: DataFrame containing the columns named by ``x``, ``y`` and
            ``z_boolean``.
        x: Name of the column placed on the horizontal axis.
        y: Name of the column placed on the vertical axis.
        z_boolean: Name of the column used for colouring.  It must take at
            least two distinct values; otherwise indexing the second value
            raises ``IndexError``.
        bins_x: ``bins`` argument passed to ``pd.cut`` when ``x_is_numeric``.
        bins_y: ``bins`` argument passed to ``pd.cut`` when ``y_is_numeric``.
        x_is_numeric: Whether ``x`` is binned and positioned via ``get_point``.
        y_is_numeric: Whether ``y`` is binned and positioned via ``get_point``.
        ordered_x_values: Axis values (or bins) of ``x`` in display order.
        ordered_y_values: Axis values (or bins) of ``y`` in display order.
        maximal_bubble_size: Scatter size given to the largest count.
        normalization_by_all: Accepted for interface compatibility; this
            function does not read it.

    Returns:
        A tuple ``(count_table_long, xticks, yticks, xticklabels,
        yticklabels)`` where ``count_table_long`` holds the plotted
        coordinates together with the ``value`` and ``ratio`` columns.
    """
    x_column = pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x]
    y_column = pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y]
    count_table = pd.concat([x_column, y_column, df[z_boolean]], axis=1)

    # Count rows per (x, z, y), spread y over the columns, then move z to the
    # columns as well so the index is x alone.
    count_table = count_table.groupby([x, z_boolean])[y].value_counts().unstack().fillna(0)
    count_table = count_table.unstack()
    # Back to long form: one row per (x, y, z) with the count in 'value'.
    count_table_long = pd.melt(count_table.reset_index(), id_vars=x)

    z_boolean_values = count_table_long[z_boolean].unique()
    # Pivot once (index: (x, y); columns: z values) and reuse it for both the
    # numerator and the denominator of the ratio.
    counts_by_z = count_table_long.set_index([x, y, z_boolean]).unstack()['value']
    ratio = pd.DataFrame(
        {'ratio': counts_by_z[z_boolean_values[1]] / counts_by_z.sum(axis=1)}
    )

    # NOTE(review): the left side still has one row per z value, so every
    # (x, y) cell appears once per z value in the merged frame and is drawn
    # that many times -- confirm this is intended.
    count_table_long = (
        count_table_long.set_index([x, y])[['value']]
        .merge(ratio, left_index=True, right_index=True)
        .reset_index()
    )
    size_factor = maximal_bubble_size / count_table_long['value'].max()

    x_values_dict, xticks, xticklabels = _axis_positions(ordered_x_values, x_is_numeric)
    y_values_dict, yticks, yticklabels = _axis_positions(ordered_y_values, y_is_numeric)

    # Replace axis values with their plotting coordinates.
    count_table_long[x] = count_table_long[x].map(x_values_dict)
    count_table_long[y] = count_table_long[y].map(y_values_dict)

    plt.scatter(
        count_table_long[x],
        count_table_long[y],
        s=size_factor * count_table_long['value'],
        c=count_table_long['ratio'],
        alpha=0.5,
        cmap='cool',
    )
    return count_table_long, xticks, yticks, xticklabels, yticklabels
评论列表
文章目录