def correct_bc_error(bc_confidence_threshold, seq, qual, wl_dist):
'''Attempt to correct an incorrect BC sequence by computing
the probability that a Hamming distance=1 BC generated
the observed sequence, accounting for the prior distribution
of the whitelist barcodes (wl_dist), and the QV of the base
that must have been incorrect'''
# QV values
qvs = np.fromstring(qual, dtype=np.byte) - tk_constants.ILLUMINA_QUAL_OFFSET
# Char array of read
a = array.array('c', seq)
# Likelihood of candidates
wl_cand = []
likelihoods = []
# Enumerate Hamming distance 1 sequences - if a sequence
# is on the whitelist, compute it's likelihood.
for pos in range(len(a)):
existing = a[pos]
for c in tk_seq.NUCS:
if c == existing:
continue
a[pos] = c
test_str = a.tostring()
# prior probability of this BC
p_bc = wl_dist.get(test_str)
if p_bc is not None:
# probability of the base error
edit_qv = min(33.0, float(qvs[pos]))
p_edit = 10.0**(-edit_qv/10.0)
wl_cand.append(test_str)
likelihoods.append(p_bc * p_edit)
a[pos] = existing
posterior = np.array(likelihoods)
posterior /= posterior.sum()
if len(posterior) > 0:
pmax = posterior.max()
if pmax > bc_confidence_threshold:
return wl_cand[np.argmax(posterior)]
return None
评论列表
文章目录