Example source code for the Python class Samfile()

bakH.py source file (project: PBSuite, author: dbrowneup)
def __init__(self, task_queue, result_queue, bamName, referenceName, honH5): #*args, **kwargs):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.bam = pysam.Samfile(bamName)
        if referenceName is not None:
            self.reference = pysam.Fastafile(referenceName)
        else:
            self.reference = None
        self.honH5 = honH5
        #self.args = args
        #self.kwargs = kwargs
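In the worker above, each process holds its own pysam.Samfile handle instead of sharing one across processes. A minimal, self-contained sketch of that pattern; the queue contents, the file name, and the choice to open the handle in run() are assumptions for illustration, not part of the project above:

import multiprocessing
import pysam

class RegionWorker(multiprocessing.Process):
    """Toy worker: each process opens its own BAM handle rather than sharing one."""
    def __init__(self, task_queue, bam_name):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.bam_name = bam_name

    def run(self):
        # opened inside the child; assumes a coordinate-sorted, indexed BAM
        bam = pysam.Samfile(self.bam_name, "rb")
        while True:
            region = self.task_queue.get()
            if region is None:          # poison pill ends the worker
                break
            chrom, start, end = region
            print("%s:%d-%d %d reads" % (chrom, start, end, bam.count(chrom, start, end)))
        bam.close()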
bakH.py source file (project: PBSuite, author: dbrowneup)
def __call__(self, bam, reference, honH5):
        """
        Takes a pysam.Samfile
        """
        logging.info("Starting %s" % (self.name))
        size = self.end - self.start
        regName =  "%s:%d-%d" % (self.chrom, self.start, self.end)

        logging.debug("Making container for %s (%s %d bp)" % (self.groupName, regName, size))

        logging.debug("Parsing bam" )
        st = max(0, self.start)
        readCount = bam.count(self.chrom, st, self.end)
        reads = bam.fetch(self.chrom, st, self.end)
        if readCount == 0:
            logging.warning("No reads found in %s" % self.groupName)
            self.failed = True
            self.errMessage = "No reads found in %s" % self.groupName
            return
        else:
            logging.info("%d reads to parse in %s" % (readCount, self.groupName))

        myData = self.countErrors(reads, self.start, size, self.args)

        #request loc on honH5 and flush results
        with honH5.acquireH5('a') as h5dat:
            out = h5dat.create_group(self.groupName)

            out.attrs["reference"] = self.chrom
            out.attrs["start"] = self.start
            out.attrs["end"] = self.end

            if size < CHUNKSHAPE[1]:
                chunk = (CHUNKSHAPE[0], size-1)
            else:
                chunk = CHUNKSHAPE

            container = out.create_dataset("data", data = myData, \
                                    chunks=chunk, compression="gzip")
            h5dat.flush()
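The __call__ above counts reads in the window first (so an empty window can be reported before any parsing), fetches the reads, and stores the per-region summary in an HDF5 group with attributes and a gzip-compressed dataset. A minimal sketch of the same count/fetch/store flow using h5py directly instead of the project's honH5 wrapper; the file names, the region, and the per-read statistic are placeholders:

import h5py
import numpy
import pysam

def summarize_region(bam_path, h5_path, chrom, start, end):
    bam = pysam.Samfile(bam_path, "rb")            # region queries need a BAM index
    if bam.count(chrom, start, end) == 0:
        return False                               # nothing to store for this window
    lengths = numpy.array([r.query_length for r in bam.fetch(chrom, start, end)])
    with h5py.File(h5_path, "a") as h5dat:
        out = h5dat.create_group("%s:%d-%d" % (chrom, start, end))
        out.attrs["reference"] = chrom
        out.attrs["start"] = start
        out.attrs["end"] = end
        out.create_dataset("data", data=lengths, compression="gzip")
        h5dat.flush()
    return True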
HSpots.py source file (project: PBSuite, author: dbrowneup)
def __call__(self, bam, reference, honH5):
        """
        Takes a pysam.Samfile
        """
        logging.info("Starting %s" % (self.name))
        size = self.end - self.start
        regName =  "%s:%d-%d" % (self.chrom, self.start, self.end)

        logging.debug("Making container for %s (%s %d bp)" % (self.groupName, regName, size))

        logging.debug("Parsing bam" )
        st = max(0, self.start)
        readCount = bam.count(self.chrom, st, self.end)
        reads = bam.fetch(self.chrom, st, self.end)
        if readCount == 0:
            logging.warning("No reads found in %s" % self.groupName)
            self.failed = True
            self.errMessage = "No reads found in %s" % self.groupName
            return
        else:
            logging.info("%d reads to parse in %s" % (readCount, self.groupName))

        myData = self.countErrors(reads, self.start, size, self.args)

        #request loc on honH5 and flush results
        with honH5.acquireH5('a') as h5dat:
            out = h5dat.create_group(self.groupName)

            out.attrs["reference"] = self.chrom
            out.attrs["start"] = self.start
            out.attrs["end"] = self.end

            if size < CHUNKSHAPE[1]:
                chunk = (CHUNKSHAPE[0], size-1)
            else:
                chunk = CHUNKSHAPE

            container = out.create_dataset("data", data = myData, \
                                    chunks=chunk, compression="gzip")
            h5dat.flush()
bampie.py source file (project: PBSuite, author: dbrowneup)
def mapTails(bam, args):
    if bam.endswith('.bam'):
        bam = pysam.Samfile(bam,'rb')
    elif bam.endswith('.sam'):
        bam = pysam.Samfile(bam)
    else:
        logging.error("Cannot open input file! %s" % (bam))
        exit(1)

    logging.info("Extracting tails")
    tailfastq = tempfile.NamedTemporaryFile(suffix=".fastq", delete=False, dir=args.temp)
    tailfastq.close(); tailfastq = tailfastq.name
    r, t, m = extractTails(bam, outFq=tailfastq, minLength=args.minTail)

    logging.info("Parsed %d reads" % (r))
    logging.info("Found %d tails" % (t))
    logging.info("%d reads had double tails" % (m))
    if t == 0:
        logging.info("No tails -- Exiting")
        exit(0)

    tailmap = tempfile.NamedTemporaryFile(suffix=".sam", delete=False, dir=args.temp)
    tailmap.close(); tailmap = tailmap.name

    callBlasr(tailfastq, args.reference, args.params, args.nproc, tailmap)
    bam.close() #reset
    return tailmap
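mapTails() above decides how to open the input purely from its extension: 'rb' for BAM, text mode for SAM, and a hard exit otherwise. That check on its own, as a small reusable helper (the function name is ours, not the project's):

import logging
import sys
import pysam

def open_alignments(path):
    """Open a SAM or BAM file with the pysam mode matching its extension."""
    if path.endswith(".bam"):
        return pysam.Samfile(path, "rb")
    elif path.endswith(".sam"):
        return pysam.Samfile(path, "r")
    logging.error("Cannot open input file! %s" % path)
    sys.exit(1)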
HSpots.py source file (project: PBSuite, author: dbrowneup)
def __init__(self, task_queue, result_queue, bamName, referenceName, honH5): #*args, **kwargs):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.bam = pysam.Samfile(bamName)
        if referenceName is not None:
            self.reference = pysam.Fastafile(referenceName)
        else:
            self.reference = None
        self.honH5 = honH5
        #self.args = args
        #self.kwargs = kwargs
CoverView.py source file (project: OpEx, author: RahmanTeam)
def __init__(self, threadidx, options, config, startline, endline, names):
        multiprocessing.Process.__init__(self)

        # Initializing variables
        self.threadidx = threadidx
        self.options = options
        self.config = config
        self.startline = startline
        self.endline = endline
        self.names = names

        # Initializing reads directory
        self.reads = dict()

        # Connecting to BAM file
        self.samfile = pysam.Samfile(options.input, "rb")

        # Connecting to transcript database file
        if not config['transcript_db'] is None:
            self.enstdb = pysam.Tabixfile(config['transcript_db'])
        else:
            self.enstdb = None

        # Initializing output files
        if int(options.threads) > 1:
            if config['outputs']['regions']:
                self.out_targets = open(options.output + '_regions_tmp_' + str(threadidx) + '.txt', 'w')
            if config['outputs']['profiles']:
                self.out_profiles = open(options.output + '_profiles_tmp_' + str(threadidx) + '.txt', 'w')
                if not config['transcript_db'] is None:
                    self.out_poor = open(options.output + '_poor_tmp_' + str(threadidx) + '.txt', 'w')
        else:
            if config['outputs']['regions']:
                self.out_targets = open(options.output + '_regions.txt', 'w')
            if config['outputs']['profiles']:
                self.out_profiles = open(options.output + '_profiles.txt', 'w')
                if not config['transcript_db'] is None:
                    self.out_poor = open(options.output + '_poor.txt', 'w')

    # Checking if target is fail or pass
insertion.py source file (project: workflow, author: hmkim)
def igv_reads(bamfile, virus_contig):
    igv_chimeric_file = os.path.splitext(bamfile)[0] + ".chimeric.igv.bam"
    if os.path.exists(igv_chimeric_file):
        return igv_chimeric_file
    with pysam.Samfile(bamfile, "rb") as in_handle, \
         pysam.Samfile(igv_chimeric_file, "wb", template=in_handle) as out_handle:
        contig = in_handle.gettid(virus_contig)
        for read in in_handle:
            if skip_read(read, contig, virus_contig, in_handle, False):
                continue
            chrom = in_handle.getrname(read.tid)
            out_handle.write(read)
    return igv_chimeric_file
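igv_reads() (and keep_chimeric_reads() below) writes a filtered copy of the input BAM by opening the output with template=in_handle, which reuses the input file's header. A minimal filtering sketch built on the same template idea; the file names and the mapping-quality cutoff are placeholders:

import pysam

def filter_by_mapq(in_path, out_path, min_mapq=30):
    with pysam.Samfile(in_path, "rb") as in_handle, \
         pysam.Samfile(out_path, "wb", template=in_handle) as out_handle:
        for read in in_handle:
            if read.is_unmapped or read.mapping_quality < min_mapq:
                continue
            out_handle.write(read)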
insertion.py source file (project: workflow, author: hmkim)
def keep_chimeric_reads(bamfile, virus_contig):
    chimeric_file = os.path.splitext(bamfile)[0] + ".chimeric.bam"
    if os.path.exists(chimeric_file):
        return chimeric_file
    with pysam.Samfile(bamfile, "rb") as in_handle, \
         pysam.Samfile(chimeric_file, "wb", template=in_handle) as out_handle:
        contig = in_handle.gettid(virus_contig)
        for read in in_handle:
            if skip_read(read, contig, virus_contig, in_handle, True):
                continue
            chrom = in_handle.getrname(read.tid)
            out_handle.write(read)
    return chimeric_file
orientation.py source file (project: workflow, author: hmkim)
def get_duplicates(bam_file):
    s = set()
    with pysam.Samfile(bam_file, "rb") as in_handle:
        for read in in_handle:
            if read.is_duplicate:
                s.update([read.qname])
    return s
chimeric.py source file (project: workflow, author: hmkim)
def get_duplicates(bam_file):
    s = set()
    with pysam.Samfile(bam_file, "rb") as in_handle:
        for read in in_handle:
            if read.is_duplicate:
                s.update([read.qname])
    return s
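Both copies of get_duplicates() stream the whole BAM once and collect the query names of reads whose duplicate flag is set. Hypothetical usage of the returned set ('sample.bam' is a placeholder):

dups = get_duplicates("sample.bam")
print("%d duplicate read names" % len(dups))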
utils.py source file (project: cellranger, author: 10XGenomics)
def concatenate_and_fix_bams(out_bamfile, bamfiles, drop_tags=None):
    """Concatenate bams with different references and populate tags.

    Assumes that the input bams have completely non-overlapping reference entries.
    No sorting assumed. Output reads as in the same order as input.
    Tags are stripped from the header (which is assumed to look like an AugmentedFastqHeader)
    and are stored as bam tags. Tags in drop_tags are completely dropped.

    Args:
    - drop_tags: Set of tag names to ignore. If None, than all tags will be included.
    """
    template_bam = pysam.Samfile(bamfiles[0])
    header = template_bam.header

    for bam_fn in bamfiles[1:]:
        bam = pysam.Samfile(bam_fn)
        header['SQ'].extend(bam.header['SQ'])

    refname_to_tid = { ref_head['SN']:idx for (idx, ref_head) in enumerate(header['SQ']) }

    bam_out = pysam.Samfile(out_bamfile, "wb", header=header)

    for bam_fn in bamfiles:
        bam = pysam.Samfile(bam_fn)

        for rec in bam:
            if rec.is_unmapped and rec.mate_is_unmapped:
                rec.reference_id = -1
                rec.next_reference_id = -1
            elif rec.is_unmapped and not rec.mate_is_unmapped:
                rec.next_reference_id = refname_to_tid[bam.references[rec.next_reference_id]]
                rec.reference_id = rec.next_reference_id
            elif not rec.is_unmapped and rec.mate_is_unmapped:
                rec.reference_id = refname_to_tid[bam.references[rec.reference_id]]
                rec.next_reference_id = rec.reference_id
            else:
                rec.reference_id = refname_to_tid[bam.references[rec.reference_id]]
                rec.next_reference_id = refname_to_tid[bam.references[rec.next_reference_id]]

            # Pull fields out of qname, and make tags, writing out to bam_out
            qname_split = rec.qname.split(cr_fastq.AugmentedFastqHeader.TAG_SEP)
            rec.qname = qname_split[0]

            tags = rec.tags
            for i in range(1, len(qname_split), 2):
                tag = (qname_split[i], qname_split[i+1])
                # Don't add empty tags
                if len(tag[1]) > 0 and (drop_tags is None or not tag[0] in drop_tags):
                    tags.append(tag)

            rec.tags = tags
            bam_out.write(rec)
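concatenate_and_fix_bams() assumes the read names carry extra tag/value pairs joined by cr_fastq.AugmentedFastqHeader.TAG_SEP and moves them back into proper BAM tags while rewriting reference IDs against the merged header. A toy version of just the qname-to-tags split, with a made-up separator standing in for the real TAG_SEP:

TAG_SEP = "|||"   # placeholder; the real separator is cr_fastq.AugmentedFastqHeader.TAG_SEP

def split_qname(qname):
    """Return (bare_name, [(tag, value), ...]) from an augmented read name."""
    parts = qname.split(TAG_SEP)
    tags = [(parts[i], parts[i + 1])
            for i in range(1, len(parts) - 1, 2)
            if len(parts[i + 1]) > 0]
    return parts[0], tags

# split_qname("read1|||CB|||ACGT") -> ("read1", [("CB", "ACGT")])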
realigner.bak.py source file (project: CLAM, author: Xinglab)
def realigner(out_dir, tmp_dir, winsize=50, unstranded=False):
    """DOCSTRING
    Args:
    Returns:
    """
    # file handlers
    mbam = pysam.Samfile(os.path.join(tmp_dir, 'multi.sorted.bam'),'rb')
    ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb')
    obam = pysam.Samfile(os.path.join(out_dir, 'realigned.bam'), 'wb', template = mbam)
    chr_list=[x['SN'] for x in ubam.header['SQ']]

    # construct the mread_dict; this will be needed throughout
    mread_dict = defaultdict(list)
    for alignment in mbam:
        mread_dict[alignment.qname].append(alignment)

    # keep a record of processed reads
    processed_mreads = set()

    # iterate through all mreads
    for read_qname in mread_dict:
        if read_qname in processed_mreads:
            continue

        ## construct the fully-connected subgraph for each read
        read_to_locations, processed_mreads = \
            construct_subgraph(mbam, read_qname, mread_dict, processed_mreads, chr_list, winsize=winsize, unstranded=unstranded)
        subgraph = set()
        for read in read_to_locations:
            _ = map(subgraph.add, read_to_locations[read].keys())
        subgraph = list(subgraph)

        ## build the BIT tracks
        node_track, multi_reads_weights = \
            construct_BIT_track(subgraph, read_to_locations, ubam, unstranded)

        ## run EM
        multi_reads_weights = \
            run_EM(node_track, multi_reads_weights, w=winsize)

        ## write to obam
        for read in multi_reads_weights:
            for node in multi_reads_weights[read]:
                alignment = read_to_locations[read][node]
                score = round(multi_reads_weights[read][node][0], 3)
                alignment.set_tag('AS', score)
                alignment.set_tag('PG', 'CLAM')
                obam.write(alignment)
    # sort the final output
    logger.info('sorting output')
    obam.close()
    ubam.close()
    mbam.close()
    obam_sorted_fn = os.path.join(out_dir, 'realigned.sorted.bam')
    pysam.sort('-o', obam_sorted_fn, os.path.join(out_dir, 'realigned.bam'))
    pysam.index(obam_sorted_fn)
    os.remove(os.path.join(out_dir, 'realigned.bam'))
    return
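realigner() finishes by coordinate-sorting and indexing the realigned BAM through pysam's wrappers around the samtools commands. Those two calls in isolation, with placeholder paths:

import pysam

pysam.sort("-o", "realigned.sorted.bam", "realigned.bam")
pysam.index("realigned.sorted.bam")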
peakcaller.bak2.py source file (project: CLAM, author: Xinglab)
def peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=False, with_replicates=False, with_control=False, unstranded=False):
    """DOCSTRING
    Args:
    Returns:
    """
    # file handlers
    mbam = pysam.Samfile(os.path.join(out_dir, 'realigned.sorted.bam'),'rb')
    ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb')
    bam_dict = {'ubam.ip':[ubam], 'mbam.ip':[mbam]}
    if unique_only:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.unique.bed'), 'w')
    else:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.combined.bed'), 'w')

    # read in GTF
    logger.info('read gtf from "%s"' % gtf_fp)
    gene_annot = read_gtf(gtf_fp)

    # fetch tag counts in each gene
    ##  gene_name: { 'ip': bin_interval_ip, 'con': bin_interval_con }
    logger.info('reading gene counts')
    gene_count = defaultdict(dict)
    for gene_name in tqdm(gene_annot):
        gene = gene_annot[gene_name]
        bin_interval_ip, bin_interval_con = \
            gene_to_count(bam_dict, gene, 
                unique_only=unique_only, with_control=with_control, 
                unstranded=unstranded)
        if bin_interval_ip is None:  ## if no IP reads in this gene
            continue
        gene_count[gene_name]['ip'] = bin_interval_ip
        gene_count[gene_name]['con'] = bin_interval_con

    # estimate global overdispersion param. for each dataset
    logger.info('estimating dispersion parameter')
    alpha_ip_vec = estim_dispersion_param(len(bam_dict['ubam.ip']), gene_count, 'ip' )
    if with_control:
        alpha_con_vec = estim_dispersion_param(len(bam_dict['ubam.con']), gene_count, 'con' )
    else:
        alpha_con_vec = np.asarray(alpha_ip_vec)

    # perform statistical test
    logger.info('calling peaks')
    peak_counter = 0
    for gene_name in gene_count:
        gene = gene_annot[gene_name]
        BED = test_gene_bin(gene, gene_count[gene_name]['ip'], gene_count[gene_name]['con'],
            alpha_ip_vec, alpha_con_vec)
        ofile.write(BED)
        peak_counter += len(BED.split('\n'))
    ofile.close()
    logger.info('called %i peaks'%peak_counter)

    return
geneBody_coverage.py source file (project: RSeQC, author: MonashBioinformaticsPlatform)
def genebody_coverage(bam, position_list):
    '''
    position_list is dict returned from genebody_percentile
    position is 1-based genome coordinate
    '''
    samfile = pysam.Samfile(bam, "rb")
    aggreagated_cvg = collections.defaultdict(int)

    gene_finished = 0
    for chrom, strand, positions in position_list.values():
        coverage = {}
        for i in positions:
            coverage[i] = 0.0
        chrom_start = positions[0]-1
        if chrom_start <0: chrom_start=0
        chrom_end = positions[-1]
        try:
            samfile.pileup(chrom, 1,2)
        except:
            continue

        for pileupcolumn in samfile.pileup(chrom, chrom_start, chrom_end, truncate=True):
            ref_pos = pileupcolumn.pos+1
            if ref_pos not in positions:
                continue
            if pileupcolumn.n == 0:
                coverage[ref_pos] = 0
                continue                
            cover_read = 0
            for pileupread in pileupcolumn.pileups:
                if pileupread.is_del: continue
                if pileupread.alignment.is_qcfail:continue 
                if pileupread.alignment.is_secondary:continue 
                if pileupread.alignment.is_unmapped:continue
                if pileupread.alignment.is_duplicate:continue
                cover_read +=1
            coverage[ref_pos] = cover_read
        tmp = [coverage[k] for k in sorted(coverage)]
        if strand == '-':
            tmp = tmp[::-1]
        for i in range(0,len(tmp)):
            aggreagated_cvg[i] += tmp[i]
        gene_finished += 1

        if gene_finished % 100 == 0:
            print >>sys.stderr, "\t%d transcripts finished\r" % (gene_finished),
    return  aggreagated_cvg
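genebody_coverage() walks pileup columns with truncate=True so only columns inside the requested window are returned, and counts a read toward coverage only if it passes the deletion/QC-fail/secondary/unmapped/duplicate filters. A stripped-down per-position depth sketch using the same filters; the file name and region are placeholders:

import pysam

def filtered_depth(bam_path, chrom, start, end):
    depth = {}
    samfile = pysam.Samfile(bam_path, "rb")
    for pileupcolumn in samfile.pileup(chrom, start, end, truncate=True):
        n = 0
        for pileupread in pileupcolumn.pileups:
            aln = pileupread.alignment
            if pileupread.is_del or aln.is_qcfail or aln.is_secondary \
               or aln.is_unmapped or aln.is_duplicate:
                continue
            n += 1
        depth[pileupcolumn.pos + 1] = n   # 1-based position, as in the snippet above
    samfile.close()
    return depth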
bam.py source file (project: mirtop, author: miRTop)
def read_bam(bam_fn, precursors, clean = True):
    """
    read bam file and perform realignment of hits
    """
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)
    reads = defaultdict(hits)
    for line in handle:
        if line.reference_id < 0:
            logger.debug("Sequence not mapped: %s" % line.reference_id)
            continue
        query_name = line.query_name
        if query_name not in reads and line.query_sequence == None:
            continue
        if line.query_sequence and line.query_sequence.find("N") > -1:
            continue
        if query_name not in reads:
            reads[query_name].set_sequence(line.query_sequence)
            reads[query_name].counts = _get_freq(query_name)
        if line.is_reverse:
            logger.debug("Sequence is reverse: %s" % line.query_name)
            continue
        chrom = handle.getrname(line.reference_id)
        #  print "%s %s %s %s" % (line.query_name, line.reference_start, line.query_sequence, chrom)
        cigar = line.cigartuples
        iso = isomir()
        iso.align = line
        iso.set_pos(line.reference_start, len(reads[query_name].sequence))
        logger.debug("READ::From BAM start %s end %s" % (iso.start, iso.end))
        if len(precursors[chrom]) < line.reference_start + len(reads[query_name].sequence):
            continue
        iso.subs, iso.add, iso.cigar = filter.tune(reads[query_name].sequence, precursors[chrom], line.reference_start, cigar)
        logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
        if len(iso.subs) < 2:
            reads[query_name].set_precursor(chrom, iso)
    logger.info("Hits: %s" % len(reads))
    if clean:
        reads = filter.clean_hits(reads)
        logger.info("Hits after clean: %s" % len(reads))
    return reads
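read_bam() above skips records with a negative reference_id (unmapped) and reverse-strand hits, then resolves the reference name with getrname() before tuning each alignment against its precursor. The unmapped/reverse filter and name lookup on their own, with a placeholder path:

import pysam

handle = pysam.Samfile("hits.bam", "rb")
for line in handle:
    if line.reference_id < 0 or line.is_reverse:   # unmapped or reverse-strand hit
        continue
    chrom = handle.getrname(line.reference_id)
    print("%s maps to %s at %d" % (line.query_name, chrom, line.reference_start))
handle.close()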
filterUniqueBam.py source file (project: NGaDNAP, author: theboocock)
def main(argv):
    args = parse_args(argv)
    print args
    if args.input == "-" and sys.stdin.isatty():
        sys.stderr.write("STDIN is a terminal, terminating!\n")
        return 1
    elif sys.stdout.isatty():
        sys.stderr.write("STDOUT is a terminal, terminating!\n")
        return 1

    with pysam.Samfile(args.input, "rb") as infile:
        with pysam.Samfile("-", "wb", template=infile) as outfile:
            curpos = None
            curvariants = {}
            for (read_num, read) in enumerate(infile):
                if curpos and ((read.tid, read.pos) != curpos):
                    # Sort order is defined as ascending 'tid's and positions
                    if curpos > (read.tid, read.pos) and not read.is_unmapped:
                        sys.stderr.write("ERROR: Input file does not appear "
                                         "to be sorted by coordinates at "
                                         "record %i, aborting ...\n"
                                         % (read_num,))
                        return 1

                    _flush_buffer(outfile, curvariants,
                                  args.remove_duplicates)
                    curpos = None

                is_filtered = read.flag & _FILTERED_FLAGS
                is_collapsed = read.qname.startswith("M_")
                if is_filtered or not (read.qual and is_collapsed):
                    outfile.write(read)
                    continue

                curpos = (read.tid, read.pos)
                nkey = (read.is_reverse, read.pos, read.alen)
                if nkey in curvariants:
                    curvariants[nkey][0].append(read)
                    curvariants[nkey][1] += 1
                else:
                    curvariants[nkey] = [[read], 1]

            _flush_buffer(outfile, curvariants, args.remove_duplicates)

    return 0
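main() above reads the BAM from a file or from stdin ('-') and always writes the filtered records to stdout as BAM, using the input header as template, which is why it refuses to run when either stream is a terminal. A minimal pass-through sketch of that streaming setup:

import pysam

with pysam.Samfile("-", "rb") as infile:                        # BAM from stdin
    with pysam.Samfile("-", "wb", template=infile) as outfile:  # BAM to stdout
        for read in infile:
            outfile.write(read)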
hints_db.py source file (project: Comparative-Annotation-Toolkit, author: ComparativeGenomicsToolkit)
def setup_hints(job, input_file_ids):
    """
    Generates hints for a given genome with a list of BAMs. Will add annotation if it exists.
    """
    # RNA-seq hints
    filtered_bam_file_ids = {'BAM': collections.defaultdict(list), 'INTRONBAM': collections.defaultdict(list)}
    for dtype, bam_dict in input_file_ids['bams'].iteritems():
        if len(bam_dict) == 0:
            continue
        # Since BAMs are valid, we can assume that they all share the same header
        bam_file_id, bai_file_id = bam_dict.values()[0]
        bam_path = job.fileStore.readGlobalFile(bam_file_id)
        sam_handle = pysam.Samfile(bam_path)
        # triple disk usage to deal with name sorted bam
        disk_usage = tools.toilInterface.find_total_disk_usage([bam_file_id, bai_file_id]) * 3
        # generate reference grouping that will be used downstream until final cat step
        grouped_references = [tuple(x) for x in group_references(sam_handle)]
        for original_path, (bam_file_id, bai_file_id) in bam_dict.iteritems():
            for reference_subset in grouped_references:
                j = job.addChildJobFn(namesort_bam, bam_file_id, bai_file_id, reference_subset, disk_usage,
                                      disk=disk_usage, cores=4, memory='16G')
                filtered_bam_file_ids[dtype][reference_subset].append(j.rv())

    # IsoSeq hints
    iso_seq_hints_file_ids = []
    iso_seq_file_ids = input_file_ids['iso_seq_bams']
    if len(iso_seq_file_ids) > 0:
        for bam_file_id, bai_file_id in iso_seq_file_ids:
            disk_usage = tools.toilInterface.find_total_disk_usage([bam_file_id, bai_file_id])
            j = job.addChildJobFn(generate_iso_seq_hints, bam_file_id, bai_file_id, disk=disk_usage)
            iso_seq_hints_file_ids.append(j.rv())

    # protein hints
    if input_file_ids['protein_fasta'] is not None:
        disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids['protein_fasta'])
        j = job.addChildJobFn(generate_protein_hints, input_file_ids['protein_fasta'], input_file_ids['genome_fasta'],
                              disk=disk_usage)
        protein_hints_file_id = j.rv()
    else:
        protein_hints_file_id = None

    # annotation hints
    if input_file_ids['annotation'] is not None:
        disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids['annotation'])
        j = job.addChildJobFn(generate_annotation_hints, input_file_ids['annotation'], disk=disk_usage)
        annotation_hints_file_id = j.rv()
    else:
        annotation_hints_file_id = None
    return job.addFollowOnJobFn(merge_bams, filtered_bam_file_ids, annotation_hints_file_id,
                                iso_seq_hints_file_ids, protein_hints_file_id).rv()
hints_db.py source file (project: Comparative-Annotation-Toolkit, author: ComparativeGenomicsToolkit)
def namesort_bam(job, bam_file_id, bai_file_id, reference_subset, disk_usage, num_reads=50 ** 6):
    """
    Slices out the reference subset from a BAM, name sorts that subset, then chunks the resulting reads up for
    processing by filterBam.
    """
    def write_bam(r, ns_handle):
        """Write to the path, returns file ID"""
        outf = tools.fileOps.get_tmp_toil_file()
        outf_h = pysam.Samfile(outf, 'wb', template=ns_handle)
        for rec in r:
            outf_h.write(rec)
        outf_h.close()
        return job.fileStore.writeGlobalFile(outf)

    bam_path = job.fileStore.readGlobalFile(bam_file_id)
    is_paired = bam_is_paired(bam_path)
    job.fileStore.readGlobalFile(bai_file_id, bam_path + '.bai')
    name_sorted = tools.fileOps.get_tmp_toil_file(suffix='name_sorted.bam')
    cmd = [['samtools', 'view', '-b', bam_path] + list(reference_subset),
           ['sambamba', 'sort', '-t', '4', '-m', '15G', '-o', '/dev/stdout', '-n', '/dev/stdin']]
    tools.procOps.run_proc(cmd, stdout=name_sorted)
    ns_handle = pysam.Samfile(name_sorted)
    # this group may come up empty -- check to see if we have at least one mapped read
    try:
        _ = ns_handle.next()
    except StopIteration:
        return None
    # reset file handle to start
    ns_handle = pysam.Samfile(name_sorted)
    filtered_file_ids = []
    r = []
    for qname, reads in itertools.groupby(ns_handle, lambda x: x.qname):
        r.extend(list(reads))
        if len(r) >= num_reads:
            file_id = write_bam(r, ns_handle)
            j = job.addChildJobFn(filter_bam, file_id, is_paired, disk='4G', memory='2G')
            filtered_file_ids.append(j.rv())
            r = []
    # do the last bin, if its non-empty
    if len(r) > 0:
        file_id = write_bam(r, ns_handle)
        j = job.addChildJobFn(filter_bam, file_id, is_paired, disk='4G', memory='2G')
        filtered_file_ids.append(j.rv())
    return job.addFollowOnJobFn(merge_filtered_bams, filtered_file_ids, disk=disk_usage, memory='16G').rv()
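namesort_bam() depends on the intermediate BAM being name-sorted so that itertools.groupby sees all alignments of a read consecutively before they are chunked out to filter_bam. The grouping step on its own, assuming a name-sorted placeholder file:

import itertools
import pysam

ns_handle = pysam.Samfile("name_sorted.bam", "rb")
for qname, alignments in itertools.groupby(ns_handle, lambda x: x.qname):
    records = list(alignments)     # every alignment sharing this read name
    # process one read's alignments here, e.g. len(records)
ns_handle.close()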
bakH.py source file (project: PBSuite, author: dbrowneup)
def test(argv):
    numpy.seterr(all="ignore")
    args = parseArgs(argv)
    setupLogging(True)#keep debug on.. you're testing!
    logging.critical(("Running HSpots.py directly implements testing mode. "
                      "If you're trying to run the full, actual program, use "
                      "Honey.py spots"))

    bam = pysam.Samfile(args.bam)
    reference = pysam.Fastafile(args.reference)
    try:
        if bam.header["HD"]["SO"] != "coordinate":
            logging.warning("BAM is not sorted by coordinates! Performance may be slower")
    except KeyError:
        logging.warning("Assuming BAM is sorted by coordinate. Be sure this is correct")
    logging.info("Running in test mode")

    #do what you will.. from here
    #spot = SpotResult(chrom='11', start=2215290, end=2215798, svtype="DEL", size=208)
    #spot = SpotResult(chrom='22', start=45964261, end=45965596, svtype="DEL", size=-1)
    # This is what I need to start with
    #spot = SpotResult(chrom="22", start=45963975, end=45964532, svtype="DEL", size=57)
    fh = open("honeymissing.bed")
    for line in fh.readlines():
        data = line.strip().split('\t')
        spot = SpotResult(chrom=data[0], start=int(data[1]), end = int(data[2]), \
                          size=int(data[3].split('=')[-1]), svtype="DEL")

        j = SpotCaller('group', spot.chrom, spot.start, spot.end, args)
        if j.supportingReadsFilter(spot, bam, args):
            consen = ConsensusCaller(spot, args)
            consen(bam, reference, 'none')
            for i in consen.newSpots:
                i.tags["seqmade"] = True
                print i
            if len(consen.newSpots) == 0:
                spot.tags["noseq"] = True
                print str(spot)
        else:
            spot.tags["filtfail"] = True
            print str(spot)

    #done with test code
    logging.info("Finished testing")

