tview.py 文件源码-python代码片段

def bam_pileup_to_hdf(hdf, bams, ref_fasta, ref_name:str=None, start:int=0, end:int=None, truth_bam=None,
                      chunk_len=50000, overlap=1000):
    """Create an .hdf file containing chunks of a pileup.

    :param hdf: output .hdf file (will be overwritten).
    :param bams: iterable of input .bam files.
    :param ref_fasta: input .fasta file.
    :param ref_name: name of reference within .bam to parse.
    :param start: start reference coordinate.
    :param end: end reference coordinate. If `None` the full extent of
        the reference will be parsed.
    :param truth_bam: .bam file of truth aligned to ref to generate labels.
    :param chunk_len: int, chunk length in reference coordinates.
    :param overlap: int, overlap of adjacent chunks in reference coordinates.


    ..note:: Datasets will be name as `{reference}_{start}_{end}`, indicating
        the reference coordinates spanned.

    """
    if ref_name is None:
        # use all references
        refs = pysam.FastaFile(ref_fasta).references
        assert end == None
        assert start == 0
    else:
        refs = [ref_name,]

    with h5py.File(hdf, 'w') as hdf:
        for ref in refs:
            if truth_bam is not None:
                gen = generate_training_chunks(truth_bam, bams, ref_fasta, ref_name=ref,
                                               start=start, end=end, chunk_len=chunk_len, overlap=overlap)
            else:
                gen = generate_pileup_chunks(bams, ref_fasta, ref_name=ref, start=start, end=end,
                                             chunk_len=chunk_len, overlap=overlap)
            logger.info("Processing reference {}.".format(ref))
            for c in gen:
                pos = c.pileups[0].positions
                chunk_str = '{}_{}_{}'.format(ref, pos['major'][0], pos['major'][-1] + 1)
                if c.labels is not None:
                    # save labels
                    hdf['{}/labels'.format(chunk_str)] = np.char.encode(c.labels)
                if c.ref_seq is not None:
                    # save ref
                    hdf['{}/ref_seq'.format(chunk_str)] = np.char.encode(c.ref_seq)

                for bam_count, p in enumerate(c.pileups):
                    assert p.ref_name == ref
                    grp = '{}/datatype{}'.format(chunk_str, bam_count)
                    hdf['{}/positions'.format(grp)] = p.positions
                    hdf[grp].attrs['rname'] = p.ref_name
                    hdf[grp].attrs['start'] = p.positions['major'][0]
                    hdf[grp].attrs['end'] = p.positions['major'][-1]
                    hdf[grp].attrs['bam'] = p.bam
                    hdf['{}/data'.format(grp)] = p.reads