def plot_yield_by_quality():
# Close any previous plots
plt.close('all')
# Read in seqlength and time from ALL_READS dataframe
new_yield_data = ALL_READS[['time', "seq_length", "av_qual"]]
# Bin qualities
qual_bins = [0] + QUALITY_BINS + [new_yield_data["av_qual"].max()]
# Cut yield data into quality bins
new_yield_data["descriptive_quality"] = pd.cut(new_yield_data["av_qual"], qual_bins,
labels=[description
for description in reversed(QUALITY_DESCRIPTIONS)])
# Time as index and drop av_qual column
new_yield_data.set_index(pd.DatetimeIndex(new_yield_data['time']), inplace=True)
new_yield_data.drop('av_qual', axis=1, inplace=True)
# Obtain cumulative sum by quality bin in each minute.
yield_data_grouped = new_yield_data.groupby("descriptive_quality").apply(lambda d: d.resample("1T").sum().fillna(0))['seq_length']
# Create a dict of dataframes based on groups.
yield_data_by_quality = {description: yield_data_grouped[description].to_frame().reset_index()
for description in
QUALITY_DESCRIPTIONS}
for description, yield_df in yield_data_by_quality.items():
yield_df.reset_index(inplace=True)
yield_df.set_index("time", inplace=True)
yield_df = yield_df.reindex(index=YIELD_DATA.time, fill_value=0)
yield_df.reset_index(inplace=True)
# Generate a cumulative sum of sequence data
yield_df['cumsum_bp'] = yield_df['seq_length'].cumsum()
# Convert time to timedelta format and then to float format, in hours.
yield_df['duration_tdelta'] = yield_df['time'].apply(lambda t: t - yield_df['time'].min())
yield_df['duration_float'] = yield_df['duration_tdelta'].apply(lambda t: t.total_seconds() / 3600)
yield_data_by_quality[description] = yield_df
# Set subplots.
fig, ax = plt.subplots(1)
# Create ticks using numpy linspace. Ideally will create 6 points between 0 and 48 hours.
num_points = 7 # Need to include zero point
x_ticks = np.linspace(YIELD_DATA['duration_float'].min(), YIELD_DATA['duration_float'].max(), num_points)
ax.set_xticks(x_ticks)
# Define axis formatters
ax.yaxis.set_major_formatter(FuncFormatter(y_yield_to_human_readable))
ax.xaxis.set_major_formatter(FuncFormatter(x_yield_to_human_readable))
# Set x and y labels and title.
ax.set_xlabel("Duration (HH:MM)")
ax.set_ylabel("Yield")
ax.set_title(f"Yield for {SAMPLE_NAME} over time by quality")
ax.stackplot(YIELD_DATA['duration_float'],
[yield_data_by_quality[description]['cumsum_bp']
for description in QUALITY_DESCRIPTIONS],
colors=QUALITY_COLOURS)
# Limits must be set after the plot is created
ax.set_xlim(YIELD_DATA['duration_float'].min(), YIELD_DATA['duration_float'].max())
ax.set_ylim(ymin=0)
# Add legend to plot.
ax.legend([mpatches.Patch(color=colour)
for colour in QUALITY_COLOURS],
QUALITY_DESCRIPTIONS, loc=2)
# Ensure labels are not missed.
fig.tight_layout()
savefig(os.path.join(PLOTS_DIR, f"{SAMPLE_NAME.replace(' ', '_')}_yield_plot_by_quality.png"))
评论列表
文章目录