plot_yields.py 文件源码-python代码片段

def plot_yield_by_quality():
    # Close any previous plots
    plt.close('all')
    # Read in seqlength and time from ALL_READS dataframe
    new_yield_data = ALL_READS[['time', "seq_length", "av_qual"]]
    # Bin qualities
    qual_bins = [0] + QUALITY_BINS + [new_yield_data["av_qual"].max()]
    # Cut yield data into quality bins
    new_yield_data["descriptive_quality"] = pd.cut(new_yield_data["av_qual"], qual_bins,
                                                   labels=[description
                                                           for description in reversed(QUALITY_DESCRIPTIONS)])
    # Time as index and drop av_qual column
    new_yield_data.set_index(pd.DatetimeIndex(new_yield_data['time']), inplace=True)
    new_yield_data.drop('av_qual', axis=1, inplace=True)
    # Obtain cumulative sum by quality bin in each minute.
    yield_data_grouped = new_yield_data.groupby("descriptive_quality").apply(lambda d: d.resample("1T").sum().fillna(0))['seq_length']
    # Create a dict of dataframes based on groups.
    yield_data_by_quality = {description: yield_data_grouped[description].to_frame().reset_index()
                             for description in
                             QUALITY_DESCRIPTIONS}

    for description, yield_df in yield_data_by_quality.items():
        yield_df.reset_index(inplace=True)
        yield_df.set_index("time", inplace=True)
        yield_df = yield_df.reindex(index=YIELD_DATA.time, fill_value=0)
        yield_df.reset_index(inplace=True)
        # Generate a cumulative sum of sequence data
        yield_df['cumsum_bp'] = yield_df['seq_length'].cumsum()
        # Convert time to timedelta format and then to float format, in hours.
        yield_df['duration_tdelta'] = yield_df['time'].apply(lambda t: t - yield_df['time'].min())
        yield_df['duration_float'] = yield_df['duration_tdelta'].apply(lambda t: t.total_seconds() / 3600)
        yield_data_by_quality[description] = yield_df

    # Set subplots.
    fig, ax = plt.subplots(1)
    # Create ticks using numpy linspace. Ideally will create 6 points between 0 and 48 hours.
    num_points = 7  # Need to include zero point
    x_ticks = np.linspace(YIELD_DATA['duration_float'].min(), YIELD_DATA['duration_float'].max(), num_points)
    ax.set_xticks(x_ticks)
    # Define axis formatters
    ax.yaxis.set_major_formatter(FuncFormatter(y_yield_to_human_readable))
    ax.xaxis.set_major_formatter(FuncFormatter(x_yield_to_human_readable))
    # Set x and y labels and title.
    ax.set_xlabel("Duration (HH:MM)")
    ax.set_ylabel("Yield")
    ax.set_title(f"Yield for {SAMPLE_NAME} over time by quality")
    ax.stackplot(YIELD_DATA['duration_float'],
                 [yield_data_by_quality[description]['cumsum_bp']
                  for description in QUALITY_DESCRIPTIONS],
                 colors=QUALITY_COLOURS)
    # Limits must be set after the plot is created
    ax.set_xlim(YIELD_DATA['duration_float'].min(), YIELD_DATA['duration_float'].max())
    ax.set_ylim(ymin=0)

    # Add legend to plot.
    ax.legend([mpatches.Patch(color=colour)
               for colour in QUALITY_COLOURS],
              QUALITY_DESCRIPTIONS, loc=2)
    # Ensure labels are not missed.
    fig.tight_layout()
    savefig(os.path.join(PLOTS_DIR, f"{SAMPLE_NAME.replace(' ', '_')}_yield_plot_by_quality.png"))