def makeFig1():
    """Show one time series and two labelings of it in a three-panel figure.

    The top panel (spanning both columns) shows the series returned by
    ``getFig1Ts()``. The bottom-left panel shows the result of
    ``labelTs_sota`` and the bottom-right panel the result of ``labelTs_ff``.
    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    series = getFig1Ts()

    # Layout: one wide panel on top, two half-width panels underneath.
    grid_shape = (2, 2)
    top_ax = plt.subplot2grid(grid_shape, (0, 0), colspan=2)
    lower_left_ax = plt.subplot2grid(grid_shape, (1, 0))
    lower_right_ax = plt.subplot2grid(grid_shape, (1, 1))
    for panel in (top_ax, lower_left_ax, lower_right_ax):
        panel.autoscale(tight=True)
        sb.despine(left=True, ax=panel)

    # Raw series on top.
    series.plot(showLabels=False, showBounds=False, ax=top_ax)

    # Labeling produced with a single fixed length of 150.
    labelTs_sota(series, [150]).plot(showLabels=False, ax=lower_left_ax)

    # Labeling produced with Lmin=100 and Lmax=200.
    labelTs_ff(series, 100, 200).plot(showLabels=False, ax=lower_right_ax)
    plt.setp(lower_right_ax.get_yticklabels(), visible=False)

    top_ax.set_title("Patterns in Dishwasher Dataset")
    top_ax.set_xlabel("Minute")
    lower_left_ax.set_title("State-of-the-art")
    lower_right_ax.set_title("Proposed")

    plt.tight_layout()
    plt.show()
# Example source code for Python's despine()
def enrich_signature(method="pca", percentile=99, results_dir="results", experiment="CROP-seq_Jurkat_TCR", n_genes=500):
    """
    Run Enrichr on the most extreme genes of a differential-expression signature.

    The signature is read from
    ``<results_dir>/<experiment>.differential_expression.<method>.stimutation.csv``.
    Genes whose absolute value exceeds the given percentile of all absolute
    values are submitted to ``enrichr``. The full result table is written to
    CSV and the top 8 terms (by ``combined_score``) of each selected gene-set
    library are drawn as a horizontal bar plot saved as SVG.

    :param method: Name of the method, used only to build file names.
    :param percentile: Percentile of ``abs(signature)`` above which a gene is kept.
    :param results_dir: Directory holding the input and receiving the outputs.
    :param experiment: Experiment name, used only to build the input file name.
    :param n_genes: Unused by this function.
    """
    signature_file = os.path.join(
        results_dir,
        "{}.differential_expression.{}.stimutation.csv".format(experiment, method))
    # ``read_csv(squeeze=True)`` no longer exists in pandas 2.x; squeezing the
    # parsed frame along the columns is the documented replacement.
    diff = pd.read_csv(signature_file, index_col=0, header=None, names=["gene_name"]).squeeze("columns")

    magnitude = abs(diff)
    degs = pd.Series(diff[magnitude > np.percentile(magnitude, percentile)].index)
    degs.name = "gene_name"

    enr = enrichr(degs.reset_index())
    enr.to_csv(os.path.join(results_dir, "differential_expression.{}.enrichr.csv".format(method)), index=False, encoding="utf8")

    # Plot top N terms of each library
    n = 8
    to_plot = [
        'GO_Biological_Process_2015',
        "KEGG_2016",
        "WikiPathways_2016",
        "Reactome_2016",
        "BioCarta_2016",
        "NCI-Nature_2016"]

    # The grouped ``nlargest`` result is indexed by (library, original row label);
    # level 1 therefore gives the row labels of ``enr`` to keep.
    top_rows = (
        enr[enr['gene_set_library'].isin(to_plot)]
        .groupby("gene_set_library")['combined_score']
        .nlargest(n)
        .index.get_level_values(1))
    # ``.loc`` replaces the ``.ix`` indexer, which was removed from pandas.
    p = enr.loc[top_rows].sort_values("combined_score", ascending=False)

    fig, axis = plt.subplots(1)
    # Draw on the axes created above instead of relying on the implicit current axes.
    sns.barplot(data=p, y="description", x="combined_score", orient="horiz", hue="gene_set_library", ax=axis)
    axis.set_xlabel("Combined score")
    sns.despine(fig)
    fig.savefig(os.path.join(results_dir, "differential_expression.{}.enrichr.top{}_terms.svg".format(method, n)), bbox_inches="tight")
def gRNA_scatter(s1, s2, prefix="", text=False, n_labels=30):
    """
    Scatter each screen's gRNA counts against a background sample (log2 scale).

    One panel is drawn per column of ``s2`` (in reverse column order) on a
    3x2 grid. The background is the ``gDNA_Jurkat`` column when ``prefix``
    starts with ``"mid_screen-"`` and ``plasmid_pool_TCR`` otherwise. Guides
    whose name contains "Wnt" are excluded. The figure is saved as SVG and PDF
    under ``results_dir``, which is read from the enclosing module scope.

    :param s1: Frame joined with ``s2``; must provide the background column.
    :param s2: Frame with one column per screen.
    :param prefix: Used to pick the background column and to build file names.
    :param text: If true, label guides matching "ETS1|GATA3|RUNX1".
    :param n_labels: Unused by this function.
    """
    palette = sns.color_palette("colorblind")

    # Scatter of gRNA change
    fig, axis = plt.subplots(3, 2, sharex=False, sharey=False, figsize=(8, 8))
    axis = axis.flatten()

    # The join does not depend on the screen, so do it once.
    joined = s1.join(s2)  # .fillna(0)

    for i, screen in enumerate(s2.columns[::-1]):
        # Shuffle rows so that no guide category is systematically drawn on top.
        x = joined.iloc[np.random.permutation(len(joined))]
        # ``.loc`` replaces the ``.ix`` indexer, which was removed from pandas.
        x = x.loc[x.index[~x.index.str.contains("Wnt")]]

        if prefix.startswith("mid_screen-"):
            b = x["gDNA_Jurkat"]
        else:
            b = x["plasmid_pool_TCR"]
        x = x.fillna(0)
        b = b.fillna(0)

        # One boolean column per category, keyed by its color; every guide takes
        # the color of the first category it matches. A guide matching no
        # category makes the ``[0]`` lookup raise IndexError.
        colors = pd.DataFrame()
        colors[palette[0]] = x.index.str.contains("Wnt")
        colors[palette[1]] = x.index.str.contains("CTRL")
        colors[palette[2]] = x.index.str.contains("Tcr")
        colors[palette[3]] = x.index.str.contains("Ess")
        colors = colors.apply(lambda row: row[row].index.tolist()[0], axis=1).tolist()

        log_screen = np.log2(1 + x[screen])
        log_background = np.log2(1 + b)

        axis[i].scatter(log_screen, log_background, color=colors, alpha=0.5)
        if text:
            for j in x[x.index.str.contains("ETS1|GATA3|RUNX1")].index:
                axis[i].text(log_screen.loc[j], log_background.loc[j], j)

        # x = y line
        lims = [np.nanmin([log_screen, log_background]), np.nanmax([log_screen, log_background])]
        axis[i].plot((lims[0], lims[1]), (lims[0], lims[1]), linestyle='--', color='black', alpha=0.75)
        axis[i].set_title(screen)

    # Label only the left column and the bottom row.
    for i in range(0, len(axis), 2):
        axis[i].set_ylabel("gRNA frequency in plasmid (log2)")
    for ax in axis[-2:]:
        ax.set_xlabel("gRNA frequency in CROP-seq screen (log2)")
    sns.despine(fig)

    text_suffix = "text." if text else ""
    fig.savefig(os.path.join(results_dir, "gRNA_counts.norm.{}.scatter.{}svg".format(prefix, text_suffix)), bbox_inches="tight")
    fig.savefig(os.path.join(results_dir, "gRNA_counts.norm.{}.scatter.{}pdf".format(prefix, text_suffix)), bbox_inches="tight")
def main(args):
    """Cluster the sequences of a FASTA file and save a dendrogram plot.

    Reads records from ``args.fasta``. If ``args.mark`` is set, sequences not
    found in the ``PrefixComparer`` built from that second FASTA file get
    " (new)" appended to their label. With two or more sequences, a
    hierarchical clustering (``args.method``) of their pairwise distances is
    drawn; otherwise a placeholder text is shown. The figure is written to
    ``args.plot``.
    """
    with FastaReader(args.fasta) as fr:
        sequences = list(fr)
    logger.info('Plotting dendrogram of %s sequences', len(sequences))

    if not args.mark:
        labels = [s.name for s in sequences]
    else:
        with FastaReader(args.mark) as fr:
            mark = PrefixComparer(record.sequence for record in fr)
        is_new = [record.sequence not in mark for record in sequences]
        labels = [
            record.name + (' (new)' if new else '')
            for record, new in zip(sequences, is_new)
        ]
        logger.info('%s sequence(s) marked as "new"', sum(is_new))

    sns.set_style("white")

    # Font size is derived from 297 mm (converted to points) spread over the
    # label rows plus 5, clamped to 6..16 pt; the height in inches follows.
    n_rows = len(labels) + 5
    font_size = min(16, max(6, 297 / 25.4 * 72 / n_rows))
    height = font_size * n_rows / 72
    fig = plt.figure(figsize=(210 / 25.4, height))
    matplotlib.rcParams.update({'font.size': 4})
    ax = fig.gca()
    sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)
    sns.set_style('whitegrid')

    if len(sequences) < 2:
        ax.text(0.5, 0.5, 'no sequences', fontsize='xx-large')
    else:
        m = distances([s.sequence for s in sequences])
        y = distance.squareform(m)
        smallest = y.min()
        mindist = int(smallest)
        logger.info('Smallest distance is %s. Found between:', mindist)
        for i, j in np.argwhere(m == smallest):
            # Each pair shows up twice in the matrix; report it once.
            if i < j:
                logger.info('%s and %s', labels[i], labels[j])
        linkage = hierarchy.linkage(y, method=args.method)
        hierarchy.dendrogram(
            linkage,
            labels=labels,
            leaf_font_size=font_size,
            orientation='right',
            color_threshold=0.95*max(linkage[:, 2]),
        )

    ax.grid(False)
    fig.set_tight_layout(True)
    fig.savefig(args.plot)
def plot_box(df, x, y, hue, tag='eda', directory=None):
    r"""Draw a seaborn box plot and pass its figure to ``write_plot``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data source holding the ``x``, ``y`` and ``hue`` variables.
    x : str
        Name of the variable in ``df`` shown on the x-axis.
    y : str
        Name of the variable in ``df`` shown on the y-axis.
    hue : str
        Name of the variable in ``df`` used to split boxes by color.
    tag : str
        Identifier forwarded to ``write_plot``.
    directory : str, optional
        Plot location forwarded to ``write_plot``.

    Returns
    -------
    None : None.

    References
    ----------
    http://seaborn.pydata.org/generated/seaborn.boxplot.html

    """
    logger.info("Generating Box Plot")

    # Draw the boxes, then detach and trim the remaining spines.
    axes = sns.boxplot(x=x, y=y, hue=hue, data=df)
    sns.despine(offset=10, trim=True)

    # Hand the owning figure to the shared writer.
    write_plot('seaborn', axes.get_figure(), 'box_plot', tag, directory)
#
# Function plot_swarm
#
def wasabiplot(bam_filename, chrom, start, stop, strand, log_base=10,
               color='steelblue', bad_cigar=INSERTION_DELETIONS,
               coverage_cigar=COVERAGE_CIGAR, junction_cigar=JUNCTION_CIGAR,
               ax=None, coverage_kws=None, curve_height_multiplier=0.2,
               text_kws=TEXT_KWS, patch_kws=PATCH_KWS, warn_skipped=True,
               annotate=True, **kwargs):
    """Plot read coverage and junctions of one genomic region on an axes

    Parameters
    ----------
    bam_filename : str or pandas.Series
        Name of the bam file. If a Series is given, its first element is used
    chrom : str
        Name of the reference chromosome
    start, stop : int
        Genome-based locations of the start and stop regions
    strand : '+' | '-'
        Strand to query
    log_base : number or None, optional
        The base to use for log-scaling the data. e.g. 10 would have log10 data
        If None, the data is not log-scaled. (default=10)
    color : valid matplotlib color
        Color to use for both the coverage and junction plotting
    bad_cigar : tuple of str, optional
        Which CIGAR string flags are not allowed. (default=('I', 'D') aka
        insertion and deletion)
    coverage_cigar, junction_cigar : optional
        Forwarded unchanged to ``WasabiPlotter``
    ax : matplotlib axes, optional
        Axes to draw on. If None, the current axes (``plt.gca()``) is used
    coverage_kws : dict, optional
        Keyword arguments for ``WasabiPlotter.plot_coverage``. The dict is
        copied, so the caller's object is never modified
    curve_height_multiplier, text_kws, patch_kws, annotate : optional
        Forwarded unchanged to ``WasabiPlotter.plot_junctions``
    warn_skipped : bool, optional
        Forwarded unchanged to ``WasabiPlotter``
    **kwargs
        Extra keyword arguments merged into ``coverage_kws``; on a key clash
        the value from ``kwargs`` wins
    """
    if isinstance(bam_filename, pd.Series):
        bam_filename = bam_filename.iloc[0]

    plotter = WasabiPlotter(bam_filename, chrom, start, stop, strand, log_base,
                            color, bad_cigar, coverage_cigar, junction_cigar,
                            warn_skipped)

    if ax is None:
        ax = plt.gca()

    # Work on a copy: updating the caller's dict in place would leak this
    # call's extra keyword arguments into later calls that reuse the dict.
    coverage_kws = {} if coverage_kws is None else dict(coverage_kws)
    coverage_kws.update(kwargs)

    plotter.plot_coverage(color, ax, **coverage_kws)
    plotter.plot_junctions(ax, curve_height_multiplier=curve_height_multiplier,
                           text_kws=text_kws, patch_kws=patch_kws,
                           annotate=annotate)

    # Remove bottom spine
    sns.despine(ax=ax, bottom=True)

    # Add a zero-axis line
    ax.hlines(0, 0, plotter.length, linewidth=0.5, zorder=-1)

    # Tick positions are offsets within the region; relabel them as genome
    # coordinates on the bottom row only.
    # NOTE(review): newer Matplotlib releases expose this check on the
    # SubplotSpec instead of the axes - confirm against the pinned version.
    if ax.is_last_row():
        xticks = [int(x + start) for x in ax.get_xticks()]
        ax.set(xticklabels=xticks)
def bars(data,color='black',title=''):
    """Draw a bar chart of the most frequent values in ``data``.

    Values are counted with ``value_counts()``. The first row of the counts
    (the most frequent value, since the counts come sorted) is discarded, and
    up to 20 of the remaining values are drawn as bars with the count written
    inside each bar.

    Parameters
    ----------
    data : pandas.Series
        Values to count.
    color : matplotlib color
        Bar color for non-negative counts; negative values are drawn in red.
    title : str
        Figure title.
    """
    data = pd.DataFrame(data.value_counts())
    data = data.reset_index()
    data.columns = ['keyword','value']
    # Assigning the slice realigns on the index, leaving row 0 without a
    # keyword; ``dropna`` then removes that row.
    data['keyword'] = data['keyword'][1:]
    data = data.dropna()
    data = data.reset_index(drop=True)
    data = data.sort_values('value',ascending=False)

    sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 0})

    top = data.head(20)
    x = top['keyword'].astype(str)
    y = top['value'].astype(int)

    f, ax = plt.subplots(figsize=(16, 3))
    sns.set_style('white')

    ## change color of the bar based on value
    colors = [color if _y >=0 else 'red' for _y in y]
    # Keyword arguments: positional data arguments are not accepted by newer seaborn.
    sns.barplot(x=x, y=y, palette=colors, ax=ax)
    plt.title(title, fontsize=18, y=1.12, color="gray")

    ax.set_xticklabels('')
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.tick_params(axis='both', which='major', pad=30)

    for n, _y in enumerate(y):
        # The label is passed positionally because its keyword name differs
        # between Matplotlib versions (``s`` in older, ``text`` in newer ones).
        ax.annotate(
            '{:.1f}'.format(abs(_y)),
            xy=(n, _y),
            ha='center',va='center',
            xytext=(0,-10),
            size=12,
            textcoords='offset points',
            color="white",
            weight="bold"
        )

    ax.set_yticklabels("")
    ax.set_xticklabels(top['keyword'],rotation=25,ha="right")
    ax.tick_params(axis='both', which='major', pad=15)
    sns.despine(left=True)
def plotStackedBarsScalar(df, indexCol, columns, valuesCol, box=False, rotation=90,
                          zeroLine=False, title="", xlabel='', ylabel='', ncol=5, ygrid=False,
                          yticks=False, ymin=None, ymax=None, barWidth=0.5, legendY=None,
                          palette=None, outFile=None, sideLabel=False, labelColor=None,
                          yFormat=None, transparent=False, openFile=False, closeFig=True):
    '''
    Plot a stacked bar plot using data in df, given the index column, the
    column holding the values to pivot to columns, and the column holding
    the values. The argument 'ncol' specifies the number of columns with
    which to render the legend.

    Other arguments, as used by the code below:
    box       -- if false, the top, right and left spines are removed
    rotation  -- rotation of the x tick labels
    zeroLine  -- if true, draw a horizontal line at y=0
    ygrid     -- if true, draw a light grey grid along the y-axis
    yticks    -- if true, draw outward tick marks on the left y-axis only
    ymin/ymax -- if either is given, autoscaling is disabled and the y limits set
    barWidth  -- width of the bars
    legendY   -- vertical anchor of the legend (defaults to -0.6)
    palette   -- forwarded to setupPalette
    outFile, sideLabel, labelColor, yFormat, transparent, openFile, closeFig
              -- forwarded unchanged to _finalizeFigure

    Returns the tuple (fig, ax).
    '''
    #_logger.debug('plotStackedBarsScalar %s', sideLabel)
    setupPlot()

    # TBD: handle year values as columns to plot
    df2 = df[[indexCol, columns, valuesCol]].pivot(index=indexCol, columns=columns, values=valuesCol)

    setupPalette(len(df2.columns), pal=palette)

    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    df2.plot(kind='bar', stacked=True, ax=ax, grid=False, width=barWidth, rot=rotation)

    if not box:
        sns.despine(left=True)

    if yticks:
        # Matplotlib documents ``left``/``right`` as booleans. The legacy
        # 'on'/'off' strings are both truthy, so 'off' would not hide the
        # right-hand ticks.
        plt.tick_params(axis='y', direction='out', length=5, width=.75,
                        colors='k', left=True, right=False)

    if zeroLine:
        ax.axhline(0, color='k', linewidth=0.75, linestyle='-')

    if ygrid:
        ax.yaxis.grid(color='lightgrey', linestyle='solid')

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    legendY = -0.6 if legendY is None else legendY
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, legendY), ncol=ncol)

    if title:
        ax.set_title(title, y=1.05)

    if ymin is not None or ymax is not None:
        ax.set_autoscale_on(False)
        ax.set_ylim(ymin, ymax)

    _finalizeFigure(fig, ax, outFile=outFile, sideLabel=sideLabel, labelColor=labelColor,
                    yFormat=yFormat, transparent=transparent, openFile=openFile, closeFig=closeFig)

    return (fig, ax)
def plotStackedTimeSeries(df, index='region', xlabel='', ylabel='', ncol=5, box=False,
                          zeroLine=False, title="", ygrid=False, yticks=False,
                          ymin=None, ymax=None, barWidth=0.5, legendY=None, yearStep=5,
                          palette=None, outFile=None, sideLabel=False, labelColor=None,
                          yFormat=None, transparent=False, openFile=False, closeFig=True):
    '''
    Plot a stacked bar chart over time: df is grouped by the 'index' column,
    summed, and transposed so that each group becomes one stacked series.

    Arguments, as used by the code below:
    index     -- name of the column to group by
    ncol      -- number of columns with which to render the legend
    box       -- if false, the top, right and left spines are removed
    zeroLine  -- if true, draw a horizontal line at y=0
    ygrid     -- if true, draw a light grey grid along the y-axis
    yticks    -- if true, draw outward tick marks on the left y-axis only
    ymin/ymax -- if either is given, autoscaling is disabled and the y limits set
    barWidth  -- width of the bars
    legendY   -- vertical anchor of the legend (defaults to -0.2)
    yearStep  -- when the first two year columns are consecutive years and
                 yearStep > 1, only every yearStep-th x tick is kept
    palette   -- forwarded to setupPalette
    outFile, sideLabel, labelColor, yFormat, transparent, openFile, closeFig
              -- forwarded unchanged to _finalizeFigure

    Returns the tuple (fig, ax).
    '''
    #_logger.debug('plotStackedTimeSeries %s', sideLabel)
    setupPlot()
    # NOTE(review): presumably strips non-data columns - confirm in dropExtraCols.
    df = dropExtraCols(df, inplace=False)
    grouped = df.groupby(index)
    df2 = grouped.aggregate(np.sum)
    df3 = df2.transpose()

    setupPalette(len(df3.columns), pal=palette)
    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    df3.plot(kind='bar', stacked=True, ax=ax, grid=False, width=barWidth)

    # space out year labels to every 5 years
    locs, labels = plt.xticks()
    # A real list is required: on Python 3 ``filter`` returns an iterator that
    # can be neither indexed nor sliced.
    yearCols = [col for col in df.columns if str.isdigit(col)]
    # Thinning only makes sense with at least two year columns to compare.
    if len(yearCols) > 1 and int(yearCols[1]) - int(yearCols[0]) == 1 and yearStep > 1:
        plt.xticks(locs[::yearStep], yearCols[::yearStep])

    if not box:
        sns.despine(left=True)

    if yticks:
        # Matplotlib documents ``left``/``right`` as booleans. The legacy
        # 'on'/'off' strings are both truthy, so 'off' would not hide the
        # right-hand ticks.
        plt.tick_params(axis='y', direction='out', length=5, width=.75,
                        colors='k', left=True, right=False)

    lines = ax.get_lines()
    if lines:
        lines[0].set_visible(False)    # get rid of ugly dashed line

    if zeroLine:
        ax.axhline(0, color='k', linewidth=0.75, linestyle='-')

    if ygrid:
        ax.yaxis.grid(color='lightgrey', linestyle='solid')

    if ymin is not None or ymax is not None:
        ax.set_autoscale_on(False)
        ax.set_ylim(ymin, ymax)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    legendY = -0.2 if legendY is None else legendY
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, legendY), ncol=ncol)

    if title:
        ax.set_title(title, y=1.05)

    _finalizeFigure(fig, ax, outFile=outFile, sideLabel=sideLabel, labelColor=labelColor,
                    yFormat=yFormat, transparent=transparent, openFile=openFile, closeFig=closeFig)

    return (fig, ax)
def plotTimeSeries(df, xlabel='', ylabel='', box=False, zeroLine=False, title="", ygrid=False,
                   yticks=False, ymin=None, ymax=None, legend=False, legendY=None, yearStep=5,
                   outFile=None, sideLabel=False, labelColor=None, yFormat=None, transparent=False,
                   openFile=False, closeFig=True):
    '''
    Plot the first row of df as a line, using the columns whose names are all
    digits as the x values (converted to int).

    Arguments, as used by the code below:
    box       -- if false, the top, right and left spines are removed
    zeroLine  -- if true, draw a horizontal line at y=0
    ygrid     -- if true, draw a light grey grid along the y-axis
    yticks    -- if true, draw outward tick marks on the left y-axis only
    ymin/ymax -- if either is given, autoscaling is disabled and the y limits set
    legend    -- if true, show a legend anchored at legendY (default -0.2);
                 otherwise an empty, frameless legend is set
    yearStep  -- currently unused (see the TBD note in the body)
    outFile, sideLabel, labelColor, yFormat, transparent, openFile, closeFig
              -- forwarded unchanged to _finalizeFigure

    Returns the tuple (fig, ax).
    '''
    setupPlot()
    fig, ax = plt.subplots(1, 1, figsize=(8, 4))

    # Real lists are required: on Python 3 ``filter`` and ``map`` return
    # one-shot iterators, which cannot serve as a column selector or as plot data.
    yearCols = [col for col in df.columns if str.isdigit(col)]
    x = [int(col) for col in yearCols]
    y = list(df[yearCols].iloc[0])
    plt.plot(x, y)

    # TBD: see if this is worth doing
    # space out year labels to every 5 years
    #locs, labels = plt.xticks()
    #plt.xticks(locs[::yearStep], yearCols[::yearStep])

    if not box:
        sns.despine(left=True)

    if yticks:
        # Matplotlib documents ``left``/``right`` as booleans. The legacy
        # 'on'/'off' strings are both truthy, so 'off' would not hide the
        # right-hand ticks.
        plt.tick_params(axis='y', direction='out', length=5, width=.75,
                        colors='k', left=True, right=False)

    if zeroLine:
        ax.axhline(0, color='k', linewidth=0.75, linestyle='-')

    if ygrid:
        ax.yaxis.grid(color='lightgrey', linestyle='solid')

    if ymin is not None or ymax is not None:
        ax.set_autoscale_on(False)
        ax.set_ylim(ymin, ymax)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    if legend:
        legendY = -0.2 if legendY is None else legendY
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, legendY))
    else:
        ax.legend([], frameon=False)

    if title:
        ax.set_title(title, y=1.05)

    _finalizeFigure(fig, ax, outFile=outFile, sideLabel=sideLabel, labelColor=labelColor,
                    yFormat=yFormat, transparent=transparent, openFile=openFile, closeFig=closeFig)

    return (fig, ax)
def gRNA_maplot(s1, s2, prefix="", text=False, n_labels=30):
    """
    Draw one MA-style panel per screen: log2 fold-change against mean log2 abundance.

    One panel is drawn per column of ``s2`` (in reverse column order) on a
    shared-axes 3x2 grid. The background is the ``gDNA_Jurkat`` column when
    ``prefix`` starts with ``"mid_screen-"`` and ``plasmid_pool_TCR``
    otherwise. Guides whose name contains "Wnt" are excluded. The figure is
    saved as SVG and PDF under ``results_dir``, which is read from the
    enclosing module scope.

    :param s1: Frame joined with ``s2``; must provide the background column.
    :param s2: Frame with one column per screen.
    :param prefix: Used to pick the background column and to build file names.
    :param text: If true, label guides matching "ETS1|GATA3|RUNX1".
    :param n_labels: Unused by this function.
    """
    palette = sns.color_palette("colorblind")

    # MA plot of gRNA change
    fig, axis = plt.subplots(3, 2, sharex=True, sharey=True, figsize=(8, 8))
    axis = axis.flatten()

    # The join does not depend on the screen, so do it once.
    joined = s1.join(s2)  # .fillna(0)

    for i, screen in enumerate(s2.columns[::-1]):
        # Shuffle rows so that no guide category is systematically drawn on top.
        x = joined.iloc[np.random.permutation(len(joined))]
        # ``.loc`` replaces the ``.ix`` indexer, which was removed from pandas.
        x = x.loc[x.index[~x.index.str.contains("Wnt")]]

        if prefix.startswith("mid_screen-"):
            b = x["gDNA_Jurkat"]
        else:
            b = x["plasmid_pool_TCR"]
        x = x.fillna(0)
        b = b.fillna(0)

        # Horizontal coordinate: half the log2 of the product of both counts.
        # Zero counts give -inf, which is mapped to 0 (and +inf to 9).
        M = np.log2(x[screen] * b) / 2.
        M = M.replace({-np.inf: 0, np.inf: 9})

        # Vertical coordinate: log2 fold-change of screen over background.
        fc = np.log2(1 + x[screen]) - np.log2(1 + b)
        fc.name = screen

        # One boolean column per category, keyed by its color; every guide takes
        # the color of the first category it matches. A guide matching no
        # category makes the ``[0]`` lookup raise IndexError.
        colors = pd.DataFrame()
        colors[palette[0]] = x.index.str.contains("Wnt")
        colors[palette[1]] = x.index.str.contains("CTRL")
        colors[palette[2]] = x.index.str.contains("Tcr")
        colors[palette[3]] = x.index.str.contains("Ess")
        colors = colors.apply(lambda row: row[row].index.tolist()[0], axis=1).tolist()

        axis[i].scatter(M, fc, color=colors, alpha=0.5)
        if text:
            for j in x[x.index.str.contains("ETS1|GATA3|RUNX1")].index:
                axis[i].text(
                    M.loc[j],
                    fc.loc[j],
                    j)
        axis[i].axhline(y=0, color='black', linestyle='--', lw=0.5)
        axis[i].set_title(screen)

    # Label only the left column and the bottom row.
    for i in range(0, len(axis), 2):
        axis[i].set_ylabel("M")
    for ax in axis[-2:]:
        ax.set_xlabel("A")
    sns.despine(fig)

    text_suffix = "text." if text else ""
    fig.savefig(os.path.join(results_dir, "gRNA_counts.norm.{}.maplot.{}svg".format(prefix, text_suffix)), bbox_inches="tight")
    fig.savefig(os.path.join(results_dir, "gRNA_counts.norm.{}.maplot.{}pdf".format(prefix, text_suffix)), bbox_inches="tight")
def gRNA_rank(s1, s2, prefix="", text=False, n_labels=30):
    """
    Plot ranked gRNA log2 fold-changes per screen and export them as CSV.

    One panel is drawn per column of ``s2`` (in reverse column order) on a
    shared-axes 3x2 grid. The background is the ``gDNA_Jurkat`` column when
    ``prefix`` starts with ``"mid_screen-"`` and ``plasmid_pool_TCR``
    otherwise. Guides whose name contains "Wnt" are excluded. Besides the SVG
    and PDF figures, three CSV files are written: the per-guide fold-changes
    and their per-gene mean and minimum. ``results_dir`` and
    ``guide_annotation`` are read from the enclosing module scope.

    :param s1: Frame joined with ``s2``; must provide the background column.
    :param s2: Frame with one column per screen.
    :param prefix: Used to pick the background column and to build file names.
    :param text: If true, label guides matching "ETS1|GATA3|RUNX1".
    :param n_labels: Unused by this function.
    """
    palette = sns.color_palette("colorblind")

    # Rank of gRNA change
    fig, axis = plt.subplots(3, 2, sharex=True, sharey=True, figsize=(8, 8))
    axis = axis.flatten()

    # The join does not depend on the screen, so do it once.
    joined = s1.join(s2)  # .fillna(0)

    for i, screen in enumerate(s2.columns[::-1]):
        # Shuffle rows so that no guide category is systematically drawn on top.
        x = joined.iloc[np.random.permutation(len(joined))]
        # ``.loc`` replaces the ``.ix`` indexer, which was removed from pandas.
        x = x.loc[x.index[~x.index.str.contains("Wnt")]]

        if prefix.startswith("mid_screen-"):
            b = x["gDNA_Jurkat"]
        else:
            b = x["plasmid_pool_TCR"]
        x = x.fillna(0)
        b = b.fillna(0)

        fc = np.log2(1 + x[screen]) - np.log2(1 + b)
        fc.name = screen

        # Collect the fold-changes of every screen for the CSV export below.
        if i == 0:
            xx = pd.DataFrame(fc)
        else:
            xx = xx.join(fc, how="outer")

        # One boolean column per category, keyed by its color; every guide takes
        # the color of the first category it matches. A guide matching no
        # category makes the ``[0]`` lookup raise IndexError.
        colors = pd.DataFrame()
        colors[palette[0]] = x.index.str.contains("Wnt")
        colors[palette[1]] = x.index.str.contains("CTRL")
        colors[palette[2]] = x.index.str.contains("Tcr")
        colors[palette[3]] = x.index.str.contains("Ess")
        colors = colors.apply(lambda row: row[row].index.tolist()[0], axis=1).tolist()

        # Rank once per screen instead of once per labelled guide.
        rank = fc.rank(ascending=False, method="first")

        axis[i].scatter(rank, fc, color=colors, alpha=0.5)
        if text:
            for j in x[x.index.str.contains("ETS1|GATA3|RUNX1")].index:
                axis[i].text(
                    rank.loc[j],
                    fc.loc[j],
                    j)
        axis[i].axhline(y=0, color='black', linestyle='--', lw=0.5)
        axis[i].set_title(screen)

    # Label only the left column and the bottom row.
    for i in range(0, len(axis), 2):
        axis[i].set_ylabel("gRNA fold-change")
    for ax in axis[-2:]:
        ax.set_xlabel("gRNA rank")
    sns.despine(fig)

    text_suffix = "text." if text else ""
    fig.savefig(os.path.join(results_dir, "gRNA_counts.norm.{}.rank.{}svg".format(prefix, text_suffix)), bbox_inches="tight")
    fig.savefig(os.path.join(results_dir, "gRNA_counts.norm.{}.rank.{}pdf".format(prefix, text_suffix)), bbox_inches="tight")

    # Save ranked list
    xx.to_csv(os.path.join(results_dir, "gRNA_counts.norm.{}.rank.csv".format(prefix)), index=True)

    # Save ranked list of gene-level measurements, reduced by mean and min
    # NOTE(review): the merge key assumes the index of ``xx`` is named
    # "gRNA_name" - confirm against the callers.
    m = pd.merge(xx.reset_index(), guide_annotation[["oligo_name", "gene"]], left_on="gRNA_name", right_on="oligo_name").drop("oligo_name", axis=1).set_index(["gene", "gRNA_name"])
    m.groupby(level=[0]).mean().to_csv(os.path.join(results_dir, "gRNA_counts.norm.{}.gene_mean.rank.csv".format(prefix)), index=True)
    m.groupby(level=[0]).min().to_csv(os.path.join(results_dir, "gRNA_counts.norm.{}.gene_min.rank.csv".format(prefix)), index=True)
def gRNA_rank_stimulus(xx, s2, prefix=""):
    """
    Plot and export the fold-change difference between paired screens.

    First, the log2 fold-change of every column of ``s2`` (in reverse column
    order) over its background sample is computed. The reversed columns are
    then taken in consecutive pairs, and for each pair the difference
    (second minus first) is ranked and drawn in its own panel of a 1x3 grid.
    The figure is saved as SVG and the differences as CSV under
    ``results_dir``.

    NOTE(review): ``s1`` and ``results_dir`` are not parameters; they are read
    from the enclosing module scope.

    :param xx: Overwritten on the first loop iteration, so the value passed in
        is never read when ``s2`` has at least one column.
    :param s2: Frame with one column per screen.
    :param prefix: Used to pick the background column and to build file names.
    """
    palette = sns.color_palette("colorblind")

    # Difference between unstimulated/stimulated
    fig, axis = plt.subplots(1, 3, sharex=False, sharey=True, figsize=(12, 3))
    axis = axis.flatten()

    # The join does not depend on the screen, so do it once.
    joined = s1.join(s2)  # .fillna(0)

    for i, screen in enumerate(s2.columns[::-1]):
        x = joined.iloc[np.random.permutation(len(joined))]

        # Pick the background by screen name and drop the guides of the other
        # library. ``.loc`` replaces the ``.ix`` indexer removed from pandas.
        # NOTE(review): a screen matching neither branch leaves ``b`` from the
        # previous iteration (or undefined on the first one).
        if ("TCR" in screen) or ("Jurkat" in screen):
            x = x.loc[x.index[~x.index.str.contains("Wnt")]]
            if prefix.startswith("mid_screen-"):
                b = x["gDNA_Jurkat"]
            else:
                b = x["plasmid_pool_TCR"]
        elif ("WNT" in screen) or ("HEK" in screen):
            x = x.loc[x.index[~x.index.str.contains("Tcr")]]
            if prefix.startswith("mid_screen-"):
                if "_4_" in prefix:
                    b = x["gDNA_HEKclone4"]
                else:
                    b = x["gDNA_HEKclone6"]
            else:
                b = x["plasmid_pool_WNT"]

        fc = np.log2(1 + x[screen]) - np.log2(1 + b)
        fc.name = screen

        if i == 0:
            xx = pd.DataFrame(fc)
        else:
            xx = xx.join(fc, how="outer")

    screens = s2.columns[::-1]

    # Walk the screens in pairs. Stopping one short of the end keeps
    # ``screens[i + 1]`` in range when the number of screens is odd.
    for i in range(0, len(screens) - 1, 2):
        # ``i`` advances by two per pair while the grid has one panel per
        # pair, so the panel index is the pair number.
        panel = axis[i // 2]

        fc = (xx[screens[i + 1]] - xx[screens[i]]).dropna()
        fc.name = screens[i + 1]

        if i == 0:
            panel.set_ylabel("gRNA fold-change (stimulated / unstimulated)")
            xxx = pd.DataFrame(fc)
        else:
            xxx = xxx.join(fc, how="outer")

        # One boolean column per category, keyed by its color; every guide takes
        # the color of the first category it matches. A guide matching no
        # category makes the ``[0]`` lookup raise IndexError.
        colors = pd.DataFrame()
        colors[palette[0]] = fc.index.str.contains("Wnt")
        colors[palette[1]] = fc.index.str.contains("CTRL")
        colors[palette[2]] = fc.index.str.contains("Tcr")
        colors[palette[3]] = fc.index.str.contains("Ess")
        colors = colors.apply(lambda j: j[j].index.tolist()[0], axis=1).tolist()

        panel.scatter(fc.rank(ascending=False, method="first"), fc, color=colors, alpha=0.5)
        panel.axhline(y=0, color='black', linestyle='--', lw=0.5)
        panel.set_title(re.sub("_stimulated", "", screens[i + 1]))
        panel.set_xlabel("gRNA rank (stimulated / unstimulated)")

    sns.despine(fig)
    fig.savefig(os.path.join(results_dir, "gRNA_counts.norm.{}.rank.diff_condition.svg".format(prefix)), bbox_inches="tight")

    # ``expand=False`` makes ``extract`` return an Index rather than a
    # one-column DataFrame, which is what the column assignment needs.
    xxx.columns = xxx.columns.str.extract("(.*)_stimulated", expand=False)
    xxx.to_csv(os.path.join(results_dir, "gRNA_counts.norm.{}.rank.diff_condition.csv".format(prefix)), index=True)