stats.py 文件源码-python代码片段

def correct_bc_error(bc_confidence_threshold, seq, qual, wl_dist):
    '''Attempt to correct an incorrect BC sequence by computing
    the probability that a Hamming distance=1 BC generated
    the observed sequence, accounting for the prior distribution
    of the whitelist barcodes (wl_dist), and the QV of the base
    that must have been incorrect'''

    # QV values
    qvs = np.fromstring(qual, dtype=np.byte) - tk_constants.ILLUMINA_QUAL_OFFSET

    # Char array of read
    a = array.array('c', seq)

    # Likelihood of candidates
    wl_cand = []
    likelihoods = []

    # Enumerate Hamming distance 1 sequences - if a sequence
    # is on the whitelist, compute it's likelihood.
    for pos in range(len(a)):
        existing = a[pos]
        for c in tk_seq.NUCS:
            if c == existing:
                continue
            a[pos] = c
            test_str = a.tostring()

            # prior probability of this BC
            p_bc = wl_dist.get(test_str)
            if p_bc is not None:
                # probability of the base error
                edit_qv = min(33.0, float(qvs[pos]))
                p_edit = 10.0**(-edit_qv/10.0)
                wl_cand.append(test_str)
                likelihoods.append(p_bc * p_edit)

        a[pos] = existing

    posterior = np.array(likelihoods)
    posterior /= posterior.sum()
    if len(posterior) > 0:
        pmax = posterior.max()
        if pmax > bc_confidence_threshold:
            return wl_cand[np.argmax(posterior)]

    return None