Example source code for the Python class Samfile()
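pysam.Samfile is the older spelling of what newer pysam releases call pysam.AlignmentFile; it opens SAM/BAM files for reading or writing, and the snippets collected below all follow the same basic open/iterate/write pattern. A minimal, hedged sketch of that pattern (the file names are placeholders, not taken from any project below):

import pysam

# Open an existing BAM for reading ('rb') and a new BAM for writing ('wb'),
# reusing the input header via template=. Paths here are hypothetical.
bam_in = pysam.Samfile("example.bam", "rb")
bam_out = pysam.Samfile("filtered.bam", "wb", template=bam_in)
for read in bam_in:
    if not read.is_unmapped:          # keep only mapped reads
        bam_out.write(read)
bam_out.close()
bam_in.close()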

File: ancient_filter.py | Project: NGaDNAP | Author: theboocock
def filter_bam(input_file, output_file, downweight_number, ctot, gtoa):
    """
        Takes a bam file as input and weights the quality of the reads down.

        Need to ensure we write the header out :)

        Investigate pysam and look for a header,
        this should really help us understand how to get this bam filter working 
        and writing the bam files directly back out to the terminal.
    """
    bam = pysam.Samfile(input_file,'rb')
    bam_out = pysam.Samfile(output_file, 'wb',template=bam) 
    for line in bam:
        change_bases_c = None
        change_bases_t = None
        seq = line.seq
        qual = line.qual
        if(ctot):
            change_bases_c = [check_c_2_t(nuc) and i < downweight_number for i, nuc in enumerate(seq)]
        if(gtoa):
            change_bases_t = [check_g_2_a(nuc) and (len(seq)-i) <= downweight_number for i, nuc in enumerate(seq)]
        new_qual = downweight_quality(qual,change_bases_c, change_bases_t)
        line.qual = new_qual
        bam_out.write(line)
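The excerpt above relies on three helpers that are not shown (check_c_2_t, check_g_2_a, downweight_quality). A hedged sketch of what they might look like, assuming Phred+33 quality strings and the usual ancient-DNA damage convention (C->T near the 5' end, G->A near the 3' end); this is an illustration, not the project's actual code:

def check_c_2_t(nuc):
    # Possible C->T deamination signal: the sequenced base is T (hypothetical rule).
    return nuc in ('T', 't')

def check_g_2_a(nuc):
    # Possible G->A deamination signal: the sequenced base is A (hypothetical rule).
    return nuc in ('A', 'a')

def downweight_quality(qual, change_c, change_t, floor_char='!'):
    # Drop the quality of flagged positions to Phred 0 ('!' in Phred+33 encoding).
    new_qual = list(qual)
    for i in range(len(new_qual)):
        if (change_c and change_c[i]) or (change_t and change_t[i]):
            new_qual[i] = floor_char
    return ''.join(new_qual)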
File: hints_db.py | Project: Comparative-Annotation-Toolkit | Author: ComparativeGenomicsToolkit
def validate_bam_fasta_pairs(bam_path, fasta_sequences, genome):
    """
    Make sure that this BAM is actually aligned to this fasta. Every sequence should be the same length. Sequences
    can exist in the reference that do not exist in the BAM, but not the other way around.
    """
    handle = pysam.Samfile(bam_path, 'rb')
    bam_sequences = {(n, s) for n, s in zip(*[handle.references, handle.lengths])}
    difference = bam_sequences - fasta_sequences
    if len(difference) > 0:
        base_err = 'Error: BAM {} has the following sequence/length pairs not found in the {} fasta: {}.'
        err = base_err.format(bam_path, genome, ','.join(['-'.join(map(str, x)) for x in difference]))
        raise UserException(err)
    missing_seqs = fasta_sequences - bam_sequences
    if len(missing_seqs) > 0:
        base_msg = 'BAM {} does not have the following sequence/length pairs in its header: {}.'
        msg = base_msg.format(bam_path, ','.join(['-'.join(map(str, x)) for x in missing_seqs]))
        logger.warning(msg)
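The fasta_sequences argument is a set of (name, length) pairs. A small sketch of how it could be built with pysam's FASTA reader; the helper name is made up here, and opening the FASTA requires (or creates) a .fai index:

import pysam

def get_fasta_sequence_pairs(fasta_path):
    # Hypothetical helper: collect {(sequence_name, length)} from an indexed FASTA.
    fasta = pysam.FastaFile(fasta_path)
    pairs = set(zip(fasta.references, fasta.lengths))
    fasta.close()
    return pairs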
File: ctr.py | Project: HiCembler | Author: lpryszcz
def main(bam):

    # get contig2size
    #faidx = FastaIndex(fasta)
    #contig2size = {c: faidx.id2stats[c][0] for c in faidx}
    sam = pysam.Samfile(bam)
    contig2size = {c: s for c, s in zip(sam.references, sam.lengths)}

    # estimate on largest contigs
    longest_contigs = sorted(contig2size, key=lambda x: contig2size[x], reverse=1)
    totdata = [0, 0]
    for c in longest_contigs[:25]:
        data = bam2matches(bam, regions=[c], mapq=10)
        print c, contig2size[c], 1.*data[0] / sum(data), sum(data)
        for i in range(2):
            totdata[i] += data[i]
    data = totdata
    print 1.*data[0] / sum(data), sum(data)
File: test_simulator.py | Project: clrsvsim | Author: color
def test_make_split_read_bam_file(self):
        sorted_bam = path.join(TEST_DATA_DIR, 'sorted.bam')
        with pysam.Samfile(sorted_bam, 'rb') as samfile:
            for read in samfile:
                if not read.cigarstring:
                    continue

                for breakpoint in (10, 50, 100):
                    if breakpoint >= read.rlen:
                        continue

                    for is_left_split in (True, False):
                        split_read = make_split_read(read, breakpoint, is_left_split)
                        cigar_items = list(Cigar(split_read.cigarstring).items())
                        clipped_item = cigar_items[0] if is_left_split else cigar_items[-1]
                        min_clip_len = breakpoint if is_left_split else read.rlen - breakpoint  # Can be longer if adjacent to another clip.
                        self.assertGreaterEqual(clipped_item[0], min_clip_len)
                        self.assertIn(clipped_item[1], ('S', 'H'))  # Will be soft-clipped unless already hard-clipped.
File: chimeric.py | Project: workflow | Author: hmkim
def chimeric_reads(bamfile, virus_contig, duplicates):
    igv_chimeric_file = os.path.splitext(bamfile)[0] + ".chimeric.igv.bam"
    if os.path.exists(igv_chimeric_file):
        return igv_chimeric_file
    with pysam.Samfile(bamfile, "rb") as in_handle, \
         pysam.Samfile(igv_chimeric_file, "wb", template=in_handle) as out_handle:
        for read in in_handle:
            try:
                chrom = in_handle.getrname(read.tid)
            except ValueError:
                continue
            if not is_chimeric_read(read, chrom, virus_contig):
                continue
            if read.qname in duplicates:
                continue
            out_handle.write(read)
    return igv_chimeric_file
File: __init__.py | Project: cellranger | Author: 10XGenomics
def merge_by_key(bam_filenames, key_func, bam_out):
    file_cache = tk_cache.FileHandleCache(mode='rb', open_func=pysam.Samfile)
    total_reads = 0
    heap  = []

    for bam_filename in bam_filenames:
        try:
            bam = file_cache.get(bam_filename)
            first_read = bam.next()
            heapq.heappush(heap, (key_func(first_read), first_read, bam_filename))
        except StopIteration:
            pass

    while len(heap) > 0:
        # Get the minimum item and write it to the bam.
        key, read, bam_filename = heapq.heappop(heap)
        bam = file_cache.get(bam_filename)
        bam_out.write(read)
        total_reads += 1

        # Get the next read from the source bam we just wrote from
        # If that BAM file is out of reads, then we leave that one out
        try:
            next_read = bam.next()
            heapq.heappush(heap, (key_func(next_read), next_read, bam_filename))
        except StopIteration:
            pass

    return total_reads
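A hedged usage sketch, assuming the input BAMs are shards that share a header and are already sorted by the chosen key (here the query name); tk_cache is the 10x tenkit file-handle cache imported elsewhere in the module, and the shard names below are placeholders:

import pysam

shard_files = ["shard_0.bam", "shard_1.bam"]          # hypothetical shard names
template = pysam.Samfile(shard_files[0], "rb")
merged = pysam.Samfile("merged.bam", "wb", template=template)
total = merge_by_key(shard_files, key_func=lambda read: read.qname, bam_out=merged)
merged.close()
template.close()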
File: bam.py | Project: cellranger | Author: 10XGenomics
def create_bam_outfile(file_name, chrom_names, chrom_lengths, template=None, pgs=None, cos=None, rgs=None, replace_rg=False):
    """ Creates a bam file with given chromosome names and lengths.
    template is an existing bam file.  If it is specified, chrom_names and chrom_lengths
    are ignored. pg is dictionary specifying a 'PG' entry. ID field is required; PN/CL/PP/DS/VN fields are optional.
    rgs is a list of dicts specifiying an 'RG' entry. If replace_rg is True, the existing 'RG' entry is overwritten.
    """
    if template:
        header = template.header
        if pgs is not None:
            for pg in pgs:
                if not header.has_key('PG'):
                    header['PG'] = []
                # add in the PP field based on previous PG entry
                if len(header['PG']) > 0:
                    pp = header['PG'][-1]['ID']
                    if pp is not None:
                        pg['PP'] = pp
                header['PG'].append(pg)

        if cos is not None:
            for co in cos:
                if not header.has_key('CO'):
                    header['CO'] = []
                header['CO'].append(co)

        if rgs is not None:
            if replace_rg and header.has_key('RG') and len(rgs) > 0:
                header['RG'] = []
            for rg in rgs:
                if not header.has_key('RG'):
                    header['RG'] = []
                header['RG'].append(rg)

        bam_file = pysam.Samfile(file_name, 'wb', header=header)
        tids = {name:n for (n, name) in enumerate(template.references)}
    else:
        header = {'SQ': [{'SN': chrom_names[n], 'LN': chrom_lengths[n]} for n in xrange(len(chrom_names))]}
        bam_file = pysam.Samfile(file_name, 'wb', header=header)
        tids = {chrom_names[n]:n for n in xrange(len(chrom_names))}
    return bam_file, tids
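A hedged usage sketch; the contig name and length are placeholders, and the commented template variant assumes an already-open input BAM handle named in_bam:

# Without a template: the header is built from the given names/lengths.
bam_out, tids = create_bam_outfile("out.bam", ["chr1"], [1000000])
# With a template BAM, names/lengths are ignored and the PG chain can be extended:
# bam_out, tids = create_bam_outfile("out.bam", None, None, template=in_bam,
#                                    pgs=[{'ID': 'my_tool', 'VN': '1.0'}])
bam_out.close()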
File: bam.py | Project: cellranger | Author: 10XGenomics
def create_bam_infile(file_name):
    bam_file = pysam.Samfile(file_name, 'rb')
    return bam_file
File: bam.py | Project: cellranger | Author: 10XGenomics
def create_sam_infile(file_name):
    sam_file = pysam.Samfile(file_name, 'r')
    return sam_file
File: bam.py | Project: cellranger | Author: 10XGenomics
def sort_by_name(file_name, sorted_prefix=None):
    """ Sorts a bam file by the read name, for paired-end
    """
    if sorted_prefix is None:
        sorted_prefix = file_name.replace('.bam', '') + '_namesorted'

    sorted_name = sorted_prefix + '.bam'
    # NOTE -- need to update our internal samtools in order to use pysam.sort
    #pysam.sort('-n', file_name, sorted_prefix)
    subprocess.check_call(['samtools', 'sort', '-n', file_name, sorted_prefix])

    return pysam.Samfile(sorted_name, 'rb')
File: CLAM.lite_aligner.py | Project: CLAM | Author: Xinglab
def filter_multihits(filename, max_hits, verbose, tmp_dir):
    """
    Pre-processing function for cleaning up the input bam file.
    """
    if verbose:
        print_time_stamp('filtering multi-mapped up to %d hits.' % max_hits)
    multiread_set=set()
    subprocess.call("samtools view -h %s | awk '{if($2 !~ /_/ && $3 !~ /_/) {print}}' | samtools view -bS - > %s/filter_random.bam" % (filename, tmp_dir), shell=True)
    oldfile=pysam.Samfile(tmp_dir + '/filter_random.bam','rb')
    new_filename=os.path.abspath(tmp_dir + '/filter%d.bam' % max_hits)
    sorted_filename=os.path.abspath(tmp_dir + '/filter%d.sorted.bam' % max_hits)
    newfile=pysam.Samfile(new_filename, 'wb', template=oldfile)
    for aligned_read in oldfile:
        try:
            if aligned_read.opt("NH") < max_hits:
                newfile.write(aligned_read)
                if aligned_read.opt("NH")>1:
                    multiread_set.add(aligned_read.qname)
        except KeyError:
            newfile.write(aligned_read)
    oldfile.close()
    newfile.close()
    sort_cmd='samtools sort -T %s/ -o %s %s' % (tmp_dir, sorted_filename, new_filename)
    index_cmd='samtools index %s' % sorted_filename
    subprocess.call(sort_cmd, shell=True)
    subprocess.call(index_cmd, shell=True)
    subprocess.call('rm %s/filter_random.bam %s' % (tmp_dir, new_filename), shell=True)
    pickle.dump(multiread_set, open(tmp_dir + '/multiread_set.pdata', 'wb'), -1)
    return(sorted_filename, multiread_set)
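filter_multihits keys its decision on the NH tag (number of reported alignments). aligned_read.opt("NH") is the older pysam accessor; a short hedged sketch of the equivalent check with the newer tag API, with a hypothetical input path:

import pysam

bam = pysam.Samfile("aligned.bam", "rb")        # hypothetical input
for read in bam:
    if read.has_tag("NH") and read.get_tag("NH") > 1:
        pass                                    # multi-mapped read
bam.close()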
File: peakcaller.bak.py | Project: CLAM | Author: Xinglab
def peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=False, with_replicates=False, with_control=False, unstranded=False):
    """DOCSTRING
    Args:
    Returns:
    """
    # file handlers
    mbam = pysam.Samfile(os.path.join(out_dir, 'realigned.sorted.bam'),'rb')
    ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb')
    bam_dict = {'ubam.ip':[ubam], 'mbam.ip':[mbam]}
    if unique_only:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.unique.bed'), 'w')
    else:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.combined.bed'), 'w')

    # read in GTF
    gene_annot = read_gtf(gtf_fp)

    # iteratively call peaks in each gene
    peak_counter = 0
    for gene_name in tqdm(gene_annot):
        gene = gene_annot[gene_name]
        BED = call_gene_peak(bam_dict, gene, 
            unique_only=unique_only, with_control=with_control, 
            unstranded=unstranded)
        ofile.write(BED)
        #print BED
        peak_counter += len(BED.split('\n'))
    ofile.close()
    logger.info('called %i peaks'%peak_counter)
    return
File: peakcaller.py | Project: CLAM | Author: Xinglab
def make_bam_handler_dict(ip_bam_list, con_bam_list):
    """DOCSTRING
    Args
    Returns
    """
    bam_dict = defaultdict(list)
    try:
        ubam_ip, mbam_ip = ip_bam_list
    except ValueError:
        ubam_ip = ip_bam_list[0]
        mbam_ip = None
    for bam_fn in ubam_ip.split(','):
        if not os.path.isfile(bam_fn):
            raise Exception('%s not found'%bam_fn)
        bam_dict['ubam.ip'].append( pysam.Samfile(bam_fn, 'rb') )
    if mbam_ip is not None:
        for bam_fn in mbam_ip.split(','):
            if not os.path.isfile(bam_fn):
                raise Exception('%s not found'%bam_fn)
            bam_dict['mbam.ip'].append( pysam.Samfile(bam_fn, 'rb') )

    if con_bam_list is None:
        return bam_dict
    try:
        ubam_con, mbam_con = con_bam_list
    except ValueError:
        ubam_con = con_bam_list[0]
        mbam_con = None
    for bam_fn in ubam_con.split(','):
        if not os.path.isfile(bam_fn):
            raise Exception('%s not found'%bam_fn)
        bam_dict['ubam.con'].append( pysam.Samfile(bam_fn, 'rb') )
    if mbam_con is not None:
        for bam_fn in mbam_con.split(','):
            if not os.path.isfile(bam_fn):
                raise Exception('%s not found'%bam_fn)
            bam_dict['mbam.con'].append( pysam.Samfile(bam_fn, 'rb') )
    return bam_dict
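A hedged usage sketch of the expected argument shape: each list holds a comma-separated string of unique-read BAMs followed, optionally, by one of multi-read BAMs; all file names here are placeholders:

ip_bam_list = ['ip_unique_rep1.bam,ip_unique_rep2.bam', 'ip_multi_rep1.bam,ip_multi_rep2.bam']
con_bam_list = None                              # no control sample in this sketch
bam_dict = make_bam_handler_dict(ip_bam_list, con_bam_list)
# bam_dict['ubam.ip'] -> open pysam.Samfile handles for the unique-read IP BAMs
# bam_dict['mbam.ip'] -> handles for the multi-read IP BAMs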
File: divide_bam.py | Project: RSeQC | Author: MonashBioinformaticsPlatform
def main():
    usage="%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM format. BAM file should be sorted and indexed.")
    parser.add_option("-n","--subset-num",action="store",type="int",dest="subset_num",help="Number of small BAM files")
    parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output BAM files. Output \"Prefix_num.bam\".")
    parser.add_option("-s","--skip-unmap",action="store_true",dest="skip_unmap", help="Skip unmapped reads.")
    (options,args)=parser.parse_args()

    if not (options.input_file and options.subset_num and options.output_prefix):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file):
        print >>sys.stderr, '\n\n' + options.input_file + " does NOT exist" + '\n'
        sys.exit(0)     

    samfile = pysam.Samfile(options.input_file,'rb')

    sub_bam = {}
    count_bam={}
    for i in range(0,options.subset_num):
        sub_bam[i] = pysam.Samfile(options.output_prefix + '_' + str(i) +'.bam','wb',template=samfile)
        count_bam[i] = 0

    total_alignment = 0
    print >>sys.stderr, "Dividing " + options.input_file + " ...",
    try:
        while(1):
            aligned_read = samfile.next()
            if aligned_read.is_unmapped and options.skip_unmap is True:
                continue
            total_alignment += 1
            tmp = randrange(0,options.subset_num)
            sub_bam[tmp].write(aligned_read)
            count_bam[tmp] += 1

    except StopIteration:
        print >>sys.stderr, "Done"

    for i in range(0,options.subset_num):
        print "%-55s%d" %  (options.output_prefix + '_' + str(i) +'.bam', count_bam[i])
File: RNA_fragment_size.py | Project: RSeQC | Author: MonashBioinformaticsPlatform
def main():
    usage="%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input",action="store",type="string",dest="input_file",help="Input BAM file") 
    parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in BED format. Must be strandard 12-column BED file. [required]")
    parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")
    parser.add_option("-n","--frag-num",action="store",type="int",dest="fragment_num",default=3,help="Minimum number of fragment. default=%default")

    (options,args)=parser.parse_args()

    if not (options.input_file and options.refgene_bed):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file + '.bai'):
        print >>sys.stderr, "cannot find index file of input BAM file"
        print >>sys.stderr, options.input_file + '.bai' + " does not exist"
        sys.exit(0) 

    for file in (options.input_file, options.refgene_bed):
        if not os.path.exists(file):
            print >>sys.stderr, file + " does NOT exist" + '\n'
            sys.exit(0)

    print '\t'.join([str(i) for i in ("chrom","tx_start", "tx_end","symbol","frag_count","frag_mean","frag_median","frag_std")])
    for tmp in fragment_size(options.refgene_bed, pysam.Samfile(options.input_file), options.map_qual, options.fragment_num):
        print tmp
File: SAM.py | Project: RSeQC | Author: MonashBioinformaticsPlatform
def __init__(self,inputFile):
        '''constructor. input could be bam or sam'''
        try:
            self.samfile = pysam.Samfile(inputFile,'rb')
            if len(self.samfile.header) ==0:
                print >>sys.stderr, "BAM/SAM file has no header section. Exit!"
                sys.exit(1)
            self.bam_format = True
        except (ValueError, IOError):
            self.samfile = pysam.Samfile(inputFile,'r')
            if len(self.samfile.header) ==0:
                print >>sys.stderr, "BAM/SAM file has no header section. Exit!"
                sys.exit(1)
            self.bam_format = False
File: matefixer.py | Project: lapels | Author: shunping
def fixmate(infile, outfile):
    inbam = pysam.Samfile(infile, 'rb')
    outbam = pysam.Samfile(outfile, 'wb', header=inbam.header, 
                           referencenames=inbam.references)
    qname = None    
    nTotal = 0
    nFixed = 0
    count = 0;        
    reads = []    
    gc.disable()    
    for rseq in inbam.fetch(until_eof=True):
        nTotal += 1        
        if qname is None or qname == rseq.qname:
            qname = rseq.qname
            reads.append(rseq)            
        else:            
            count = process(reads, inbam.getrname) 
            if count > 0:
                for r in reads:
                    outbam.write(r)
                nFixed += count                
            qname = rseq.qname
            del reads
            reads = [rseq]
        if nTotal % 200000 == 0:        
            logger.info('%d read(s) processed' % nTotal)
            gc.enable()
            gc.disable()

    count = process(reads, inbam.getrname)
    if count > 0:
        for r in reads:
            outbam.write(r)
        nFixed += count
    logger.info('%d read(s) processed' % nTotal)
    logger.info('%d read(s) fixed and written to file' % nFixed)

    inbam.close()
    outbam.close()
    return (nTotal, nFixed)
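fixmate assumes the input BAM is name-sorted, so that all alignments sharing a qname arrive consecutively; a hedged usage sketch with hypothetical paths:

# Name-sort first (e.g. `samtools sort -n aligned.bam -o namesorted.bam`, or the
# equivalent pysam.sort call in recent pysam), then fix the mate information.
total, fixed = fixmate("namesorted.bam", "matefixed.bam")   # hypothetical paths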
File: find_good_orfs.py | Project: genepred | Author: egorbarsukoff
def genes_alignments(outdir):
    genes_align = {}
    align_file = pysam.Samfile(outdir+'ORFs_on_genes_align.sam', 'rb')
    for rec in align_file:
        if rec.cigarstring is not None:
            gene_in_contig_len = int(re.findall(r'_len_(\d+)', rec.reference_name)[0])
            total_match = sum([i[1] for i in rec.cigar if i[0] == 0])
            conf = float(re.findall(r'_conf_([0-9]*\.[0-9]*)', rec.reference_name)[0])
            if gene_in_contig_len / total_match > 0.98:
                if genes_align.get(rec.reference_name) is None:
                    genes_align[rec.reference_name] = conf, set()
                genes_align[rec.reference_name][1].add(rec.qname)
    return genes_align
File: hints_db.py | Project: Comparative-Annotation-Toolkit | Author: ComparativeGenomicsToolkit
def bam_is_paired(bam_path, num_reads=20000, paired_cutoff=0.75):
    """
    Infers the paired-ness of a bam file.
    """
    sam = pysam.Samfile(bam_path)
    count = 0
    for rec in itertools.islice(sam, num_reads):
        if rec.is_paired:
            count += 1
    if tools.mathOps.format_ratio(count, num_reads) > paired_cutoff:
        return True
    elif tools.mathOps.format_ratio(count, num_reads) < 1 - paired_cutoff:
        return False
    else:
        raise UserException("Unable to infer pairing from bamfile {}".format(bam_path))
File: misc.py | Project: Comparative-Annotation-Toolkit | Author: ComparativeGenomicsToolkit
def is_bam(path):
    """Checks if a path is a BAMfile"""
    try:
        pysam.Samfile(path)
    except IOError:
        raise RuntimeError('Path {} does not exist'.format(path))
    except ValueError:
        return False
    return True
File: implementGain.py | Project: bamgineer | Author: pughlab
def renamereads(inbamfn, outbamfn):

    inbam = pysam.Samfile(inbamfn, 'rb')
    outbam = pysam.Samfile(outbamfn, 'wb', template=inbam)

    paired = {}

    n = 0
    p = 0
    u = 0
    w = 0
    m = 0

    for read in inbam.fetch(until_eof=True):
        n += 1
        if read.is_paired:
            p += 1
            if read.qname in paired:
                uuid = paired[read.qname]
                del paired[read.qname]
                read.qname = uuid
                outbam.write(read)
                w += 1
                m += 1
            else:
                newname = str(uuid4())
                paired[read.qname] = newname
                read.qname = newname
                outbam.write(read)
                w += 1
        else:
            u += 1
            read.qname = str(uuid4())
            outbam.write(read)
            w += 1

    outbam.close()
    inbam.close()
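A hedged usage sketch: both mates of a pair receive the same freshly generated UUID-based name, so reads later merged from different source BAMs cannot collide on qname; the paths are placeholders:

renamereads("chunk_tumour.bam", "chunk_tumour.renamed.bam")   # hypothetical paths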
File: utils.py | Project: bamgineer | Author: pughlab
def renamereads(inbamfn, outbamfn):

    inbam = pysam.Samfile(inbamfn, 'rb')
    outbam = pysam.Samfile(outbamfn, 'wb', template=inbam)

    paired = {}
    n = 0;p = 0;u = 0;w = 0;m = 0

    for read in inbam.fetch(until_eof=True):
        n += 1
        if read.is_paired:
            p += 1
            if read.qname in paired:
                uuid = paired[read.qname]
                del paired[read.qname]
                read.qname = uuid
                outbam.write(read)
                w += 1;m += 1
            else:
                newname = str(uuid4())
                paired[read.qname] = newname
                read.qname = newname
                outbam.write(read)
                w += 1
        else:
            u += 1
            read.qname = str(uuid4())
            outbam.write(read)
            w += 1
    outbam.close()
    inbam.close()
File: methods.py | Project: bamgineer | Author: pughlab
def removeReadsOverlappingHetRegion(inbamfn, bedfn,outbamfn,path):
    print "___ removing reads overlapping heterozygous region ___"
    inbamsorted =  sub('.bam$','.sorted',inbamfn)
    pysam.sort(inbamfn, inbamsorted)
    pysam.index(inbamsorted+'.bam')

    alignmentfile = pysam.AlignmentFile(inbamsorted+'.bam', "rb" )
    outbam = pysam.Samfile(outbamfn, 'wb', template=alignmentfile )

    bedfile = open(bedfn, 'r')

    for bedline in bedfile:
        c = bedline.strip().split()

        if (len(c) == 3 ):
            chr2 = c[0]
            chr = c[0].strip("chr")
            start = int(c[1])
            end   = int(c[2])
        else :
            continue

        try:
            readmappings = alignmentfile.fetch(chr2, start, end)
        except ValueError:
            print("problem fetching the read")
            continue

        for shortread in readmappings:
            try:
                outbam.write(shortread)
            except ValueError as e:
                print ("problem removing read :" + shortread.qname)
    outbamsorted =  sub('.bam$','.sorted',outbamfn)            
    pysam.sort(outbamfn, outbamsorted)
    bamDiff(inbamsorted+'.bam', outbamsorted +'.bam', path )
    outbam.close()
File: simulate_copy_number_changes.py | Project: tHapMix | Author: Illumina
def pileup(bamfile_name):
   samfile = pysam.Samfile(bamfile_name,"rb")
   output_pileup = open("test.cov", "w")
   sum_cov=0
   for pos_pileup in samfile.pileup():
       sum_cov+=pos_pileup.n
       if not pos_pileup.pos % 1e3:
          av_cov=int(sum_cov/1e3)
          sum_cov=0
          print(pos_pileup.tid,pos_pileup.pos,av_cov,file=output_pileup)
File: 10x_barcode_profiling.py | Project: Topsorter | Author: hanfang
def buildcov(self):
    """
    Builds barcode counts of the reads aligned to the genome.
    input  : bam_file and bed_file
    output : writes a BED file with chr, start, end, info, barcode, number of occurrences
    """
    before = datetime.now()
    barcode_pattern = re.compile(r'BX:(A-Z)?:[ACGT]*-[0-9]')
    self.__LOGGER.info("WARNING: Dropping read alignments that are not barcoded")
    output_bed = open(self.barcode_bed, "wb")
    with open(self.bed_file) as regions:
        with pysam.Samfile(self.obam_file, "rb") as samfile:
            for line in regions:
                chr, start, end, info = line.rstrip('\n').rstrip('\r').split("\t")
                b_dict = {}
                try:
                    readsinregion = samfile.fetch(chr, int(start), int(end))
                    for read in readsinregion:
                        tags = dict(read.tags)
                        if 'BX' in tags.keys():
                            if tags['BX'] in b_dict:
                                b_dict[tags['BX']] += 1
                            else:
                                b_dict[tags['BX']] = 1
                except:
                    self.__LOGGER.info("Skipping region : {0}:{1}:{2}".format(chr, start, end))
                    continue
                for k, v in b_dict.items():
                    output_bed.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(chr, start, end, info, k, v))

    regions.close()
    samfile.close()
    output_bed.close()
    after = datetime.now()
    self.__LOGGER.info('Computed barcode profiles for the VCF regions in {0}'.format(str(after - before)))
File: sam_utils.py | Project: brie | Author: huangyh09
def load_samfile(sam_file):
    """To load an indexed bam file"""
    ftype = sam_file.split(".")[-1]
    if ftype != "bam" and ftype != "sam":
        print("Error: file type need suffix of bam or sam.")
        sys.exit(1)
    # print("Loading a %s" %ftype + " with file name: %s" %sam_file)
    if ftype == "bam":
        samfile = pysam.Samfile(sam_file, "rb")
    else:
        samfile = pysam.Samfile(sam_file, "r")
    return samfile
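A hedged usage sketch; random-access fetch() additionally requires the BAM index (.bai) to be present, which is why the docstring says "indexed". The path and region are placeholders:

samfile = load_samfile("sample.bam")          # hypothetical indexed BAM
for read in samfile.fetch("chr1", 0, 10000):  # region fetch needs sample.bam.bai
    pass
samfile.close()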
File: massivePhrap.py | Project: PBSuite | Author: dbrowneup
def run(self):
        try: 
            proc_name = self.name
            #Open the bams
            nBams = [] # nonTrim Bams
            tBams = [] # trim Bams
            for i in self.args.bam:
                nBams.append(pysam.Samfile(i))
            for i in self.args.pacBam:
                tBams.append(pysam.Samfile(i))

            while True:
                next_task = self.task_queue.get()
                if next_task is None:
                    # Poison pill means shutdown
                    logging.info('Thread %s: Exiting\n' % proc_name)
                    self.task_queue.task_done()
                    break
                try:
                    answer = next_task(nBams, tBams)
                except Exception as e:
                    logging.error("Exception raised in task %s" % (str(e)))
                    self.task_queue.task_done()
                    self.result_queue.put("Failure - UNK - %s" % str(e))
                    logging.info("fail in groupid=%s" % next_task.data.name)
                    continue
                self.task_queue.task_done()
                self.result_queue.put(answer)
            return
        except Exception as e:
            logging.error("Consumer %s Died\nERROR: %s" % (self.name, e))
            return
File: bampie.py | Project: PBSuite | Author: dbrowneup
def mapTails(bam, args):
    if bam.endswith('.bam'):
        bam = pysam.Samfile(bam,'rb')
    elif bam.endswith('.sam'):
        bam = pysam.Samfile(bam)
    else:
        logging.error("Cannot open input file! %s" % (bam))
        exit(1)

    logging.info("Extracting tails")
    tailfastq = tempfile.NamedTemporaryFile(suffix=".fastq", delete=False, dir=args.temp)
    tailfastq.close(); tailfastq = tailfastq.name
    r, t, m = extractTails(bam, outFq=tailfastq, minLength=args.minTail)

    logging.info("Parsed %d reads" % (r))
    logging.info("Found %d tails" % (t))
    logging.info("%d reads had double tails" % (m))
    if t == 0:
        logging.info("No tails -- Exiting")
        exit(0)

    tailmap = tempfile.NamedTemporaryFile(suffix=".sam", delete=False, dir=args.temp)
    tailmap.close(); tailmap = tailmap.name

    callBlasr(tailfastq, args.reference, args.params, args.nproc, tailmap)
    bam.close() #reset
    return tailmap

