def find_consensus_values(
x,
n_expected=None,
bandwidth_auto=True,
min_bandwidth=0.02, # XXX This is GC-MS specific
default_bandwidth=0.1, # XXX This too
show_plot=False,
out=sys.stdout,
err=sys.stderr):
"""
Use kernel density estimation to analyze the distribution of data points
along the X-axis, and identify consensus values for major clusters. This
is used to identify common peaks in a set of related GC-MS samples.
"""
if isinstance(x, list):
x = np.array(x)
x_grid = np.linspace(x.min() - 0.25, x.max() + 0.25, 1000)
if bandwidth_auto:
# http://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/
grid = GridSearchCV(KernelDensity(),
{'bandwidth': np.linspace(min_bandwidth, 0.1, 30)},
cv=20) # 20-fold cross-validation
grid.fit(x[:, None])
bandwidth = grid.best_params_['bandwidth']
print("Best bandwidth: %.4f" % bandwidth, file=err)
kde = grid.best_estimator_
pdf = np.exp(kde.score_samples(x_grid[:, None]))
else:
bandwidth = default_bandwidth
kde = gaussian_kde(x)
pdf = kde.evaluate(x_grid)
i_maxima = local_maxima(x_grid, pdf)
max_values = []
for i_max in i_maxima:
max_values.append((x_grid[i_max], pdf[i_max]))
# sort maxima by value in distribution
max_values.sort(key=lambda x: x[1], reverse=True)
pdf_max = pdf.max()
major_peaks, minor_peaks = extract_peaks(max_values, pdf_max, n_expected=n_expected,
out=out, err=err)
# now sort major peaks by retention time
major_peaks.sort(key=lambda x: x[0])
print("Major retention time peaks:", file=err)
for i_peak, (xval, pdf_val) in enumerate(major_peaks):
print(" %2d %8.3f" % (i_peak+1, xval), file=err)
if show_plot:
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot(x_grid, pdf, linewidth=3, alpha=0.5, label='bw=%.2f' % kde.bandwidth)
ax.hist(x, 50, fc='gray', histtype='stepfilled', alpha=0.3, normed=True)
for rt, pdf_val in major_peaks:
ax.axvline(rt, color='red')
ax.axvline(rt-bandwidth, color='magenta')
ax.axvline(rt+bandwidth, color='magenta')
plt.show()
return [xval for xval, yval in major_peaks], float(bandwidth)
评论列表
文章目录